Choose language-specific search stemmer; fix #6957

We use the list of preferred languages for the user at the time of search table creation to pick the most relevant stemming algorithm for our search tokenizer. If we don't find a stemmer that matches any preferred language, we use the English stemming algorithm as the default.
2013-05-24 14:24:32 -07:00 · 2013-05-24 14:24:32 -07:00 · 8a4ed32908
commit 8a4ed32908
parent f54ea44977
2 changed files with 41 additions and 3 deletions
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@ -86,9 +86,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
    // Version 10.
    private void post_upgrade_add_search_table() {
        try {
+            string stemmer = find_appropriate_search_stemmer();
+            debug("Creating search table using %s stemmer", stemmer);
+            
            // This can't go in the .sql file because its schema (the stemmer
            // algorithm) is determined at runtime.
-            string stemmer = "english"; // TODO
            exec("""
                CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
                    id INTEGER PRIMARY KEY,
@ -109,6 +111,40 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
        }
    }
    
+    // Returns the stemmer name for the first of the user's preferred languages
+    // (per Intl.get_language_names()) that has a matching entry below, or
+    // "english" if none does.  The stemmer library only accepts full language
+    // names, so this translates ISO 639-1 codes to them.
+    // FIXME: the available list here is determined by what's included in
+    // src/sqlite3-unicodesn/CMakeLists.txt.  We should pass that list in
+    // instead of hardcoding it here.
+    private string find_appropriate_search_stemmer() {
+        foreach (string l in Intl.get_language_names()) {
+            switch (l) {
+                case "da": return "danish";
+                case "nl": return "dutch";
+                case "en": return "english";
+                case "fi": return "finnish";
+                case "fr": return "french";
+                case "de": return "german";
+                case "hu": return "hungarian";
+                case "it": return "italian";
+                // Norwegian locales are usually "nb" or "nn" rather than plain "no".
+                case "nb": case "nn": case "no": return "norwegian";
+                case "pt": return "portuguese";
+                case "ro": return "romanian";
+                case "ru": return "russian";
+                case "es": return "spanish";
+                case "sv": return "swedish";
+                case "tr": return "turkish";
+            }
+        }
+        
+        // No match: default to English, which seems on average the language most
+        // likely to appear in emails; search results should be ok either way.
+        return "english";
+    }
+    
    private void on_prepare_database_connection(Db.Connection cx) throws Error {
        cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
        cx.set_foreign_keys(true);
--- a/src/sqlite3-unicodesn/CMakeLists.txt
+++ b/src/sqlite3-unicodesn/CMakeLists.txt
@ -1,10 +1,12 @@
 # CMake definitions to build sqlite3-unicodesn as a static library.  This file
 # was added for Geary based on the project's Makefile.

+# If you update this list, you should also double-check
+# find_appropriate_search_stemmer() in src/engine/imap-db/imap-db-database.vala
 set(STEMMERS
    danish dutch english finnish french german hungarian
-    italian norwegian porter portuguese romanian russian
-    spanish swedish
+    italian norwegian portuguese romanian russian spanish
+    swedish turkish
 )

 set(SQLITE3_UNICODESN_SRC