Choose language-specific search stemmer; fix #6957

We use the list of preferred languages for the user at the time of
search table creation to pick the most relevant stemming algorithm for
our search tokenizer.  If we don't find a stemmer that matches any
preferred language, we use the English stemming algorithm as the
default.
This commit is contained in:
Charles Lindsay 2013-05-24 14:24:32 -07:00
parent f54ea44977
commit 8a4ed32908
2 changed files with 41 additions and 3 deletions

View file

@ -86,9 +86,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
// Version 10.
private void post_upgrade_add_search_table() {
try {
string stemmer = find_appropriate_search_stemmer();
debug("Creating search table using %s stemmer", stemmer);
// This can't go in the .sql file because its schema (the stemmer
// algorithm) is determined at runtime.
string stemmer = "english"; // TODO
exec("""
CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
id INTEGER PRIMARY KEY,
@ -109,6 +111,40 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
}
/**
 * Picks the stemming algorithm for the search tokenizer based on the
 * user's preferred languages.
 *
 * The names returned by Intl.get_language_names() are examined in order,
 * and the first one that exactly matches a known language code decides
 * the result.
 *
 * @return the full language name of the stemmer to use, or "english" if
 * none of the preferred languages has a matching stemmer.
 */
private string find_appropriate_search_stemmer() {
    // Unfortunately, the stemmer library only accepts the full language
    // name for the stemming algorithm.  This translates between the user's
    // preferred language ISO 639-1 code and our available stemmers.
    // FIXME: the available list here is determined by what's included in
    // src/sqlite3-unicodesn/CMakeLists.txt.  We should pass that list in
    // instead of hardcoding it here.
    foreach (string l in Intl.get_language_names()) {
        switch (l) {
            case "da": return "danish";
            case "nl": return "dutch";
            case "en": return "english";
            case "fi": return "finnish";
            case "fr": return "french";
            case "de": return "german";
            case "hu": return "hungarian";
            case "it": return "italian";
            // Norwegian locales are normally named after the written
            // standard, "nb" (Bokmal) or "nn" (Nynorsk), rather than the
            // macrolanguage code "no".  Accept all three so Norwegian
            // users don't silently fall through to the English default.
            // NOTE(review): the stemmer is presumably tuned for Bokmal;
            // using it for Nynorsk is assumed to beat English -- confirm.
            case "no":
            case "nb":
            case "nn":
                return "norwegian";
            case "pt": return "portuguese";
            case "ro": return "romanian";
            case "ru": return "russian";
            case "es": return "spanish";
            case "sv": return "swedish";
            case "tr": return "turkish";
        }
    }

    // Default to English because it seems to be on average the language
    // most likely to be present in emails, regardless of the user's
    // language setting.  This is not an exact science, and search results
    // should be ok either way in most cases.
    return "english";
}
private void on_prepare_database_connection(Db.Connection cx) throws Error {
cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
cx.set_foreign_keys(true);

View file

@ -1,10 +1,12 @@
# CMake definitions to build sqlite3-unicodesn as a static library. This file
# was added for Geary based on the project's Makefile.
# If you update this list, you should also double-check
# find_appropriate_search_stemmer() in src/engine/imap-db/imap-db-database.vala
set(STEMMERS
danish dutch english finnish french german hungarian
italian norwegian porter portuguese romanian russian
spanish swedish
italian norwegian portuguese romanian russian spanish
swedish turkish
)
set(SQLITE3_UNICODESN_SRC