diff --git a/desktop/org.yorba.geary.gschema.xml b/desktop/org.yorba.geary.gschema.xml index b619d137..3e5bb333 100644 --- a/desktop/org.yorba.geary.gschema.xml +++ b/desktop/org.yorba.geary.gschema.xml @@ -74,6 +74,13 @@ whether to compose emails in HTML True to compose emails in HTML; false for plain text. + + + "conservative" + Advisory strategy for full-text searching + Acceptable values are EXACT, CONSERVATIVE, AGGRESSIVE, and HORIZON. + + diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index bbd8f91d..40184ce6 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -22,3 +22,4 @@ install(FILES version-019.sql DESTINATION ${SQL_DEST}) install(FILES version-020.sql DESTINATION ${SQL_DEST}) install(FILES version-021.sql DESTINATION ${SQL_DEST}) install(FILES version-022.sql DESTINATION ${SQL_DEST}) +install(FILES version-023.sql DESTINATION ${SQL_DEST}) diff --git a/sql/version-023.sql b/sql/version-023.sql new file mode 100644 index 00000000..d2825168 --- /dev/null +++ b/sql/version-023.sql @@ -0,0 +1,21 @@ +-- +-- Database upgrade to add FTS tokenize virtual table, which allows for querying the tokenizer +-- directly for stemmed words, and dropping the stemmed FTS table for an unstemmed one. We now +-- use the stemmer manually to generate search queries. +-- + +DROP TABLE MessageSearchTable; + +CREATE VIRTUAL TABLE MessageSearchTable USING fts4( + body, + attachment, + subject, + from_field, + receivers, + cc, + bcc, + + tokenize=simple, + prefix="2,4,6,8,10" +); + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 13d1f2b4..dbd4d980 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -170,6 +170,8 @@ engine/imap-db/imap-db-email-identifier.vala engine/imap-db/imap-db-folder.vala engine/imap-db/imap-db-message-addresses.vala engine/imap-db/imap-db-message-row.vala +engine/imap-db/imap-db-search-query.vala +engine/imap-db/imap-db-search-term.vala engine/imap-db/imap-db-search-email-identifier.vala engine/imap-db/outbox/smtp-outbox-email-identifier.vala engine/imap-db/outbox/smtp-outbox-email-properties.vala diff --git a/src/client/application/geary-config.vala b/src/client/application/geary-config.vala index 2e843042..79761673 100644 --- a/src/client/application/geary-config.vala +++ b/src/client/application/geary-config.vala @@ -135,5 +135,43 @@ public class Configuration { if (!settings.set_boolean(name, value)) message("Unable to set configuration value %s = %s", name, value.to_string()); } + + public Geary.SearchQuery.Strategy get_search_strategy() { + switch (settings.get_string("search-strategy").down()) { + case "exact": + return Geary.SearchQuery.Strategy.EXACT; + + case "aggressive": + return Geary.SearchQuery.Strategy.AGGRESSIVE; + + case "horizon": + return Geary.SearchQuery.Strategy.HORIZON; + + case "conservative": + default: + return Geary.SearchQuery.Strategy.CONSERVATIVE; + } + } + + public void set_search_strategy(Geary.SearchQuery.Strategy strategy) { + switch (strategy) { + case Geary.SearchQuery.Strategy.EXACT: + settings.set_string("search-strategy", "exact"); + break; + + case Geary.SearchQuery.Strategy.AGGRESSIVE: + settings.set_string("search-strategy", "aggressive"); + break; + + case Geary.SearchQuery.Strategy.HORIZON: + settings.set_string("search-strategy", "horizon"); + break; + + case Geary.SearchQuery.Strategy.CONSERVATIVE: + default: + settings.set_string("search-strategy", "conservative"); + break; + } + } } diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala index 808f3f4c..39f5c757 100644 --- a/src/client/application/geary-controller.vala +++ b/src/client/application/geary-controller.vala @@ -81,7 +81,7 @@ public class GearyController : Geary.BaseObject { private const string MOVE_MESSAGE_TOOLTIP_MULTIPLE = _("Move conversations"); private const int SELECT_FOLDER_TIMEOUT_USEC = 100 * 1000; - private const int SEARCH_TIMEOUT_MSEC = 100; + private const int SEARCH_TIMEOUT_MSEC = 250; private const string PROP_ATTEMPT_OPEN_ACCOUNT = "attempt-open-account"; @@ -2512,7 +2512,8 @@ public class GearyController : Geary.BaseObject { cancel_search(); // Stop any search in progress. - folder.set_search_query(search_text, cancellable_search); + folder.search(search_text, GearyApplication.instance.config.get_search_strategy(), + cancellable_search); main_window.folder_list.set_search(folder); search_text_changed(main_window.main_toolbar.search_text); @@ -2523,7 +2524,8 @@ public class GearyController : Geary.BaseObject { // search after a quick delay when they finish typing. if (search_timeout_id != 0) Source.remove(search_timeout_id); - search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout); + + search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout, Priority.LOW); } private bool on_search_timeout() { diff --git a/src/client/conversation-viewer/conversation-viewer.vala b/src/client/conversation-viewer/conversation-viewer.vala index ce113eba..5bb48476 100644 --- a/src/client/conversation-viewer/conversation-viewer.vala +++ b/src/client/conversation-viewer/conversation-viewer.vala @@ -457,11 +457,28 @@ public class ConversationViewer : Gtk.Box { } } - private void on_search_text_changed(string? query) { + private void on_search_text_changed(Geary.SearchQuery? query) { if (query != null) highlight_search_terms.begin(); } + // This applies a fudge-factor set of matches when the database results + // aren't entirely satisfactory, such as when you search for an email + // address and the database tokenizes out the @ and ., etc. It's not meant + // to be comprehensive, just a little extra highlighting applied to make + // the results look a little closer to what you typed. + private void add_literal_matches(string raw_query, Gee.Set? search_matches) { + foreach (string word in raw_query.split(" ")) { + if (word.has_suffix("\"")) + word = word.substring(0, word.length - 1); + if (word.has_prefix("\"")) + word = word.substring(1); + + if (!Geary.String.is_empty_or_whitespace(word)) + search_matches.add(word); + } + } + private async void highlight_search_terms() { if (search_folder == null) return; @@ -475,8 +492,13 @@ public class ConversationViewer : Gtk.Box { ids.add(email.id); try { - Gee.Collection? search_matches = yield search_folder.get_search_matches_async( + Gee.Set? search_matches = yield search_folder.get_search_matches_async( ids, cancellable_fetch); + if (search_matches == null) + search_matches = new Gee.HashSet(); + + if (search_folder.search_query != null) + add_literal_matches(search_folder.search_query.raw, search_matches); // Webkit's highlighting is ... weird. In order to actually see // all the highlighting you're applying, it seems necessary to @@ -484,8 +506,7 @@ public class ConversationViewer : Gtk.Box { // seems that shorter strings will overwrite longer ones, and // you're left with incomplete highlighting. Gee.ArrayList ordered_matches = new Gee.ArrayList(); - if (search_matches != null) - ordered_matches.add_all(search_matches); + ordered_matches.add_all(search_matches); ordered_matches.sort((a, b) => a.length - b.length); foreach(string match in ordered_matches) diff --git a/src/engine/abstract/geary-abstract-account.vala b/src/engine/abstract/geary-abstract-account.vala index 7c194f81..ea24d18b 100644 --- a/src/engine/abstract/geary-abstract-account.vala +++ b/src/engine/abstract/geary-abstract-account.vala @@ -118,11 +118,13 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account { public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id, Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error; + public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy); + public abstract async Gee.Collection? local_search_async(Geary.SearchQuery query, int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null, Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error; - public abstract async Gee.Collection? get_search_matches_async(Geary.SearchQuery query, + public abstract async Gee.Set? get_search_matches_async(Geary.SearchQuery query, Gee.Collection ids, Cancellable? cancellable = null) throws Error; public abstract async Gee.MultiMap? get_containing_folders_async( diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala index a144566a..369bf041 100644 --- a/src/engine/api/geary-account.vala +++ b/src/engine/api/geary-account.vala @@ -322,6 +322,23 @@ public interface Geary.Account : BaseObject { public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id, Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error; + /** + * Create a new {@link SearchQuery} for this {@link Account}. + * + * See {@link Geary.SearchQuery.Strategy} for more information about how its interpreted by the + * Engine. In particular, note that it's an advisory parameter only and may have no effect, + * especially on server searches. However, it may also have a dramatic effect on what search + * results are returned and so should be used with some caution. Whether this parameter is + * user-configurable, available through GSettings or another configuration mechanism, or simply + * baked into the caller's code is up to the caller. CONSERVATIVE is designed to be a good + * default. + * + * The SearchQuery object can only be used with calls into this Account. + * + * Dropping the last reference to the SearchQuery will close it. + */ + public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy); + /** * Performs a search with the given query. Optionally, a list of folders not to search * can be passed as well as a list of email identifiers to restrict the search to only those messages. @@ -335,9 +352,9 @@ public interface Geary.Account : BaseObject { Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error; /** - * Given a list of mail IDs, returns a list of words that match for the query. + * Given a list of mail IDs, returns a set of casefolded words that match for the query. */ - public abstract async Gee.Collection? get_search_matches_async(Geary.SearchQuery query, + public abstract async Gee.Set? get_search_matches_async(Geary.SearchQuery query, Gee.Collection ids, Cancellable? cancellable = null) throws Error; /** diff --git a/src/engine/api/geary-search-folder.vala b/src/engine/api/geary-search-folder.vala index 4d03421e..c064a62f 100644 --- a/src/engine/api/geary-search-folder.vala +++ b/src/engine/api/geary-search-folder.vala @@ -49,6 +49,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport } } + public Geary.SearchQuery? search_query { get; private set; default = null; } + private Gee.HashSet exclude_folders = new Gee.HashSet(); private Geary.SpecialFolderType[] exclude_types = { Geary.SpecialFolderType.SPAM, @@ -56,7 +58,6 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport Geary.SpecialFolderType.DRAFTS, // Orphan emails (without a folder) are also excluded; see ctor. }; - private Geary.SearchQuery? search_query = null; private Gee.TreeSet search_results; private Geary.Nonblocking.Mutex result_mutex = new Geary.Nonblocking.Mutex(); @@ -64,7 +65,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport * Fired when the search query has changed. This signal is fired *after* the search * has completed. */ - public signal void search_query_changed(string? query); + public signal void search_query_changed(Geary.SearchQuery? query); public SearchFolder(Account account) { base(); @@ -203,8 +204,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport /** * Sets the keyword string for this search. */ - public void set_search_query(string query, Cancellable? cancellable = null) { - set_search_query_async.begin(query, cancellable, on_set_search_query_complete); + public void search(string query, SearchQuery.Strategy strategy, Cancellable? cancellable = null) { + set_search_query_async.begin(query, strategy, cancellable, on_set_search_query_complete); } private void on_set_search_query_complete(Object? source, AsyncResult result) { @@ -215,8 +216,9 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport } } - private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error { - Geary.SearchQuery search_query = new Geary.SearchQuery(query); + private async void set_search_query_async(string query, SearchQuery.Strategy strategy, + Cancellable? cancellable) throws Error { + Geary.SearchQuery search_query = account.open_search(query, strategy); int result_mutex_token = yield result_mutex.claim_async(); @@ -230,7 +232,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport result_mutex.release(ref result_mutex_token); this.search_query = search_query; - search_query_changed(search_query.raw); + search_query_changed(search_query); if (error != null) throw error; @@ -425,13 +427,14 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport } /** - * Given a list of mail IDs, returns a list of words that match for the current + * Given a list of mail IDs, returns a set of casefolded words that match for the current * search query. */ - public async Gee.Collection? get_search_matches_async( + public async Gee.Set? get_search_matches_async( Gee.Collection ids, Cancellable? cancellable = null) throws Error { if (search_query == null) return null; + return yield account.get_search_matches_async(search_query, ids, cancellable); } diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 51bcbd33..da9e1879 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -6,39 +6,63 @@ /** * An object to hold state for various search subsystems that might need to - * parse the same text string different ways. The only interaction the API - * user should have with this is creating new ones and then passing them off to - * the search methods in the engine. + * parse the same text string different ways. * - * TODO: support anything other than ImapDB.Account's search methods. + * The only interaction the API user should have with this is creating new ones and then passing + * them to the search methods in the Engine. + * + * @see Geary.Account.open_search */ -public class Geary.SearchQuery : BaseObject { + +public abstract class Geary.SearchQuery : BaseObject { + /** + * An advisory parameter regarding search quality, scope, and breadth. + * + * The Engine can perform searches based on (unspecified, uncontracted) textual variations of + * a query's search terms. Some of those variations may produce undesirable results due to + * "greedy" matching of terms. The Strategy parameter allows for an advisory to the Engine + * about how to use those textual variants, if any at all. + * + * This may be respected or ignored by the Engine. In particular, there's no guarantee it will + * have any effect on server search. + */ + public enum Strategy { + /** + * Only return exact matches, perform no searches for textual variants. + * + * Note that Geary's search syntax does prefix-matching for unquoted strings. EXACT means + * exact ''prefix-''matching in this case. + */ + EXACT, + /** + * Allow for searching for a small set of textual variants and small differences in search + * terms. This is a good default. + */ + CONSERVATIVE, + /** + * Allow for searching for a broad set of textual variants and larger differences in + * search terms. + */ + AGGRESSIVE, + /** + * Search for all textual variants, i.e. "the sky's the limit." + */ + HORIZON + } + + /** + * The original user search text. + */ public string raw { get; private set; } - public bool parsed { get; internal set; default = false; } - // Not using a MultiMap because we (might) need a guarantee of order. - private Gee.HashMap> field_map - = new Gee.HashMap>(); + /** + * The selected {@link Strategy} quality. + */ + public Strategy strategy { get; private set; } - public SearchQuery(string query) { - raw = query; - } - - internal void add_token(string? field, string token) { - if (!field_map.has_key(field)) - field_map.set(field, new Gee.ArrayList()); - - field_map.get(field).add(token); - } - - internal Gee.Collection get_fields() { - return field_map.keys; - } - - internal Gee.List? get_tokens(string? field) { - if (!field_map.has_key(field)) - return null; - - return field_map.get(field); + protected SearchQuery(string raw, Strategy strategy) { + this.raw = raw; + this.strategy = strategy; } } + diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index c4a370e8..33b120e8 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -17,18 +17,6 @@ private class Geary.ImapDB.Account : BaseObject { } } - private class SearchOffset { - public int column; // Column in search table - public int byte_offset; // Offset (in bytes) of search term in string - public int size; // Size (in bytes) of the search term in string - - public SearchOffset(string[] offset_string) { - column = int.parse(offset_string[0]); - byte_offset = int.parse(offset_string[2]); - size = int.parse(offset_string[3]); - } - } - public signal void email_sent(Geary.RFC822.Message rfc822); // Only available when the Account is opened @@ -61,6 +49,14 @@ private class Geary.ImapDB.Account : BaseObject { throw new EngineError.OPEN_REQUIRED("Database not open"); } + private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error { + ImapDB.SearchQuery? query = q as ImapDB.SearchQuery; + if (query == null || query.account != this) + throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name); + + return query; + } + public static void get_imap_db_storage_locations(File user_data_dir, out File db_file, out File attachments_dir) { db_file = ImapDB.Database.get_db_file(user_data_dir); @@ -715,7 +711,87 @@ private class Geary.ImapDB.Account : BaseObject { return null; } - private void prepare_search_query(Geary.SearchQuery query) { + /** + * This method is used to convert an unquoted user-entered search terms into a stemmed search + * term. + * + * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our + * tastes when coupled with prefix-matching of all unquoted terms (see + * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy + * designed to dampen that aggressiveness without losing the benefits of stemming entirely. + * + * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one + * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite + * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which + * is still installed in Geary. Thus, we are now in the position to search for the original + * term and its stemmed variant, then do post-search processing to strip results which are + * too "greedy" due to prefix-matching the stemmed variant. + * + * Some heuristics are in place simply to determine if stemming should occur: + * + * # If stemming is unallowed, no stemming occurs. + * # If the term is < min. term length for stemming, no stemming occurs. + * # If the stemmer returns a stem that is the same as the original term, no stemming occurs. + * # If the difference between the stemmed word and the original term is more than + * maximum allowed, no stemming occurs. This works under the assumption that if + * the user has typed a long word, they do not want to "go back" to searching for a much + * shorter version of it. (For example, "accountancies" stems to "account"). + * + * Otherwise, the stem for the term is returned. + */ + private string? stem_search_term(ImapDB.SearchQuery query, string term) { + if (!query.allow_stemming) + return null; + + int term_length = term.length; + if (term_length < query.min_term_length_for_stemming) + return null; + + string? stemmed = null; + try { + Db.Statement stmt = db.prepare(""" + SELECT token + FROM TokenizerTable + WHERE input=? + """); + stmt.bind_string(0, term); + + // get stemmed string; if no result, fall through + Db.Result result = stmt.exec(); + if (!result.finished) + stemmed = result.string_at(0); + else + debug("No stemmed term returned for \"%s\"", term); + } catch (Error err) { + debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message); + + // fall-through + } + + if (String.is_empty(stemmed)) { + debug("Empty stemmed term returned for \"%s\"", term); + + return null; + } + + // If same term returned, treat as non-stemmed + if (stemmed == term) + return null; + + // Don't search for stemmed words that are significantly shorter than the user's search term + if (term_length - stemmed.length > query.max_difference_term_stem_lengths) { + debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms", + stemmed, term); + + return null; + } + + debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed); + + return stemmed; + } + + private void prepare_search_query(ImapDB.SearchQuery query) { if (query.parsed) return; @@ -753,16 +829,31 @@ private class Geary.ImapDB.Account : BaseObject { --quotes; } + SearchTerm? term; if (in_quote) { // HACK: this helps prevent a syntax error when the user types // something like from:"somebody". If we ever properly support // quotes after : we can get rid of this. - s = s.replace(":", " "); + term = new SearchTerm(s, s, null, s.replace(":", " "), null); } else { + string original = s; + + // some common search phrases we don't respect and therefore don't want to fall + // through to search results string lower = s.down(); - if (lower == "" || lower == "and" || lower == "or" || lower == "not" || lower == "near" - || lower.has_prefix("near/")) - continue; + switch (lower) { + case "": + case "and": + case "or": + case "not": + case "near": + continue; + + default: + if (lower.has_prefix("near/")) + continue; + break; + } if (s.has_prefix("-")) s = s.substring(1); @@ -775,13 +866,29 @@ private class Geary.ImapDB.Account : BaseObject { if (parts.length > 1) field = extract_field_from_token(parts, ref s); - s = "\"" + s + "*\""; + // SQL MATCH syntax for parsed term + string? sql_s = "%s*".printf(s); + + // stem the word, but if stemmed and stem is simply shorter version of original + // term, only prefix-match search for it (i.e. avoid searching for + // [archive* OR archiv*] when that's the same as [archiv*]), otherwise search for + // both + string? stemmed = stem_search_term(query, s); + + string? sql_stemmed = null; + if (stemmed != null) { + sql_stemmed = "%s*".printf(stemmed); + if (s.has_prefix(stemmed)) + sql_s = null; + } + + term = new SearchTerm(original, s, stemmed, sql_s, sql_stemmed); } if (in_quote && quotes % 2 != 0) in_quote = false; - query.add_token(field, s); + query.add_search_term(field, term); } assert(!in_quote); @@ -790,28 +897,53 @@ private class Geary.ImapDB.Account : BaseObject { } // Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'. - private Gee.HashMap get_query_phrases(Geary.SearchQuery query) { + private Gee.HashMap get_query_phrases(ImapDB.SearchQuery query) { prepare_search_query(query); Gee.HashMap phrases = new Gee.HashMap(); foreach (string? field in query.get_fields()) { - string? phrase = null; - Gee.List? tokens = query.get_tokens(field); - if (tokens != null) { - string[] array = tokens.to_array(); - // HACK: work around a bug in vala where it's not null-terminating - // arrays created from generic-typed functions (Gee.Collection.to_array) - // before passing them off to g_strjoinv. Simply making a copy to a - // local proper string array adds the null for us. - string[] copy = new string[array.length]; - for (int i = 0; i < array.length; ++i) - copy[i] = array[i]; - phrase = string.joinv(" ", copy).strip(); + Gee.List? terms = query.get_search_terms(field); + if (terms == null || terms.size == 0) + continue; + + // Each SearchTerm is an AND but the SQL text within in are OR ... this allows for + // each user term to be AND but the variants of each term are or. So, if terms are + // [party] and [eventful] and stems are [parti] and [event], the search would be: + // + // (party* OR parti*) AND (eventful* OR event*) + // + // Obviously with stemming there's the possibility of the stemmed variant being nothing + // but a broader search of the original term (such as event* and eventful*) but do both + // to determine from each hit result which term caused the hit, and if it's too greedy + // a match of the stemmed variant, it can be stripped from the results. + // + // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied + // (and would be treated as search term if included), parentheses are not allowed, and + // OR has a higher precendence than AND. So the above example in standard syntax is: + // + // party* OR parti* eventful* OR event* + StringBuilder builder = new StringBuilder(); + foreach (SearchTerm term in terms) { + if (term.sql.size == 0) + continue; + + if (term.is_exact) { + builder.append_printf("%s ", term.parsed); + } else { + bool is_first_sql = true; + foreach (string sql in term.sql) { + if (!is_first_sql) + builder.append(" OR "); + + builder.append_printf("%s ", sql); + is_first_sql = false; + } + } } - if (!Geary.String.is_empty(phrase)) - phrases.set((field == null ? "MessageSearchTable" : field), phrase); + phrases.set(field ?? "MessageSearchTable", builder.str); } + return phrases; } @@ -865,19 +997,39 @@ private class Geary.ImapDB.Account : BaseObject { return sql.str; } - public async Gee.Collection? search_async(Geary.SearchQuery query, + public async Gee.Collection? search_async(Geary.SearchQuery q, int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null, - Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error { + Gee.Collection? search_ids = null, Cancellable? cancellable = null) + throws Error { check_open(); + ImapDB.SearchQuery query = check_search_query(q); Gee.HashMap query_phrases = get_query_phrases(query); if (query_phrases.size == 0) return null; - Gee.ArrayList search_results - = new Gee.ArrayList(); - + // Do this outside of transaction to catch invalid search ids up-front string? search_ids_sql = get_search_ids_sql(search_ids); + + // for some searches, results are stripped if they're too "greedy", but this requires + // examining the matched text, which has an expense to fetch, so avoid doing so unless + // necessary + bool strip_results = true; + + // HORIZON strategy is configured in such a way to allow all stemmed variants to match, + // so don't do any stripping in that case + // + // If any of the search terms is exact-match (no prefix matching) or none have stemmed + // variants, then don't do stripping of "greedy" stemmed matching (because in both cases, + // there are none) + if (query.strategy == Geary.SearchQuery.Strategy.HORIZON) + strip_results = false; + else if (traverse(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact)) + strip_results = false; + + Gee.Set unstripped_ids = new Gee.HashSet(); + Gee.Map>? search_results = null; + yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => { string blacklisted_ids_sql = do_get_blacklisted_message_ids_sql( folder_blacklist, cx, cancellable); @@ -919,95 +1071,117 @@ private class Geary.ImapDB.Account : BaseObject { stmt.bind_int(bind_index++, offset); } + Gee.HashMap id_map = new Gee.HashMap( + Collection.int64_hash_func, Collection.int64_equal_func); + Db.Result result = stmt.exec(cancellable); while (!result.finished) { - int64 id = result.int64_at(0); + int64 message_id = result.int64_at(0); int64 internaldate_time_t = result.int64_at(1); DateTime? internaldate = (internaldate_time_t == -1 ? null : new DateTime.from_unix_local(internaldate_time_t)); - search_results.add(new ImapDB.SearchEmailIdentifier(id, internaldate)); + + ImapDB.EmailIdentifier id = new ImapDB.SearchEmailIdentifier(message_id, internaldate); + + unstripped_ids.add(id); + id_map.set(message_id, id); result.next(cancellable); } + if (!strip_results) + return Db.TransactionOutcome.DONE; + + search_results = do_get_search_matches(cx, query, id_map, cancellable); + return Db.TransactionOutcome.DONE; }, cancellable); - return (search_results.size == 0 ? null : search_results); - } - - // This applies a fudge-factor set of matches when the database results - // aren't entirely satisfactory, such as when you search for an email - // address and the database tokenizes out the @ and ., etc. It's not meant - // to be comprehensive, just a little extra highlighting applied to make - // the results look a little closer to what you typed. - private void add_literal_matches(string raw_query, Gee.Set search_matches) { - foreach (string word in raw_query.split(" ")) { - if (word.has_suffix("\"")) - word = word.substring(0, word.length - 1); - if (word.has_prefix("\"")) - word = word.substring(1); - - if (!String.is_empty_or_whitespace(word)) - search_matches.add(word); - } - } - - public async Gee.Collection? get_search_matches_async(Geary.SearchQuery query, - Gee.Collection ids, Cancellable? cancellable = null) throws Error { - check_open(); - - Gee.HashMap query_phrases = get_query_phrases(query); - if (query_phrases.size == 0) + if (unstripped_ids == null || unstripped_ids.size == 0) return null; - Gee.Set search_matches = new Gee.HashSet(); + if (!strip_results) + return unstripped_ids; - yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => { - StringBuilder sql = new StringBuilder(); - sql.append(""" - SELECT offsets(MessageSearchTable), * - FROM MessageSearchTable - WHERE docid IN ( - """); - sql_append_ids(sql, - Geary.traverse(ids).map(id => id.message_id).to_gee_iterable()); - sql.append(")"); - sql_add_query_phrases(sql, query_phrases); - - Db.Statement stmt = cx.prepare(sql.str); - sql_bind_query_phrases(stmt, 0, query_phrases); - - Db.Result result = stmt.exec(cancellable); - while (!result.finished) { - // Build a list of search offsets. - string[] offset_array = result.nonnull_string_at(0).split(" "); - Gee.ArrayList all_offsets = new Gee.ArrayList(); - int j = 0; - while (true) { - all_offsets.add(new SearchOffset(offset_array[j:j+4])); - - j += 4; - if (j >= offset_array.length) + // at this point, there should be some "full" search results to strip from + assert(search_results != null && search_results.size > 0); + + strip_greedy_results(query, search_results); + + return search_results.size == 0 ? null : search_results.keys; + } + + // Strip out search results that only contain a hit due to "greedy" matching of the stemmed + // variants on all search terms + private void strip_greedy_results(ImapDB.SearchQuery query, + Gee.Map> search_results) { + int prestripped_results = search_results.size; + Gee.MapIterator> iter = search_results.map_iterator(); + while (iter.next()) { + // For each matched string in this message, retain the message in the search results + // if it prefix-matches any of the straight-up parsed terms or matches a stemmed + // variant (with only max. difference in their lengths allowed, i.e. not a "greedy" + // match) + bool good_match_found = false; + foreach (string match in iter.get_value()) { + foreach (SearchTerm term in query.get_all_terms()) { + // if prefix-matches parsed term, then don't strip + if (match.has_prefix(term.parsed)) { + good_match_found = true; + break; + } + + // if prefix-matches stemmed term w/o doing so greedily, then don't strip + if (term.stemmed != null && match.has_prefix(term.stemmed)) { + int diff = match.length - term.stemmed.length; + if (diff <= query.max_difference_match_stem_lengths) { + good_match_found = true; + + break; + } + } } - // Iterate over the offset list, scrape strings from the database, and push - // the results into our return set. - foreach(SearchOffset offset in all_offsets) { - string text = result.nonnull_string_at(offset.column + 1); - search_matches.add(text[offset.byte_offset : offset.byte_offset + offset.size].down()); - } - - result.next(cancellable); + if (good_match_found) + break; } + if (!good_match_found) + iter.unset(); + } + + debug("Stripped %d emails from search for [%s] due to greedy stem matching", + prestripped_results - search_results.size, query.raw); + } + + public async Gee.Set? get_search_matches_async(Geary.SearchQuery q, + Gee.Collection ids, Cancellable? cancellable = null) throws Error { + check_open(); + ImapDB.SearchQuery query = check_search_query(q); + + Gee.Set? search_matches = null; + yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => { + Gee.HashMap id_map = new Gee.HashMap< + int64?, ImapDB.EmailIdentifier>(Collection.int64_hash_func, Collection.int64_equal_func); + foreach (ImapDB.EmailIdentifier id in ids) + id_map.set(id.message_id, id); + + Gee.Map>? match_map = + do_get_search_matches(cx, query, id_map, cancellable); + if (match_map == null || match_map.size == 0) + return Db.TransactionOutcome.DONE; + + strip_greedy_results(query, match_map); + + search_matches = new Gee.HashSet(); + foreach (Gee.Set matches in match_map.values) + search_matches.add_all(matches); + return Db.TransactionOutcome.DONE; }, cancellable); - add_literal_matches(query.raw, search_matches); - - return (search_matches.size == 0 ? null : search_matches); + return search_matches; } public async Geary.Email fetch_email_async(ImapDB.EmailIdentifier email_id, @@ -1561,5 +1735,69 @@ private class Geary.ImapDB.Account : BaseObject { unread_change.get(path)); } } + + // Not using a MultiMap because when traversing want to process all values at once per iteration, + // not per key-value + public Gee.Map>? do_get_search_matches(Db.Connection cx, + ImapDB.SearchQuery query, Gee.Map id_map, Cancellable? cancellable) + throws Error { + if (id_map.size == 0) + return null; + + Gee.HashMap query_phrases = get_query_phrases(query); + if (query_phrases.size == 0) + return null; + + StringBuilder sql = new StringBuilder(); + sql.append(""" + SELECT docid, offsets(MessageSearchTable), * + FROM MessageSearchTable + WHERE docid IN ( + """); + sql_append_ids(sql, id_map.keys); + sql.append(")"); + sql_add_query_phrases(sql, query_phrases); + + Db.Statement stmt = cx.prepare(sql.str); + sql_bind_query_phrases(stmt, 0, query_phrases); + + Gee.Map> search_matches = new Gee.HashMap< + ImapDB.EmailIdentifier, Gee.Set>(); + + Db.Result result = stmt.exec(cancellable); + while (!result.finished) { + int64 docid = result.rowid_at(0); + assert(id_map.contains(docid)); + ImapDB.EmailIdentifier id = id_map.get(docid); + + // offsets() function returns a list of 4 strings that are ints indicating position + // and length of match string in search table corpus + string[] offset_array = result.nonnull_string_at(1).split(" "); + + Gee.Set matches = new Gee.HashSet(); + + int j = 0; + while (true) { + unowned string[] offset_string = offset_array[j:j+4]; + + int column = int.parse(offset_string[0]); + int byte_offset = int.parse(offset_string[2]); + int size = int.parse(offset_string[3]); + + unowned string text = result.nonnull_string_at(column + 2); + matches.add(text[byte_offset : byte_offset + size].down()); + + j += 4; + if (j >= offset_array.length) + break; + } + + search_matches.set(id, matches); + + result.next(cancellable); + } + + return search_matches.size > 0 ? search_matches : null; + } } diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala index 704d7309..533209f3 100644 --- a/src/engine/imap-db/imap-db-database.vala +++ b/src/engine/imap-db/imap-db-database.vala @@ -107,7 +107,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { break; case 22: - post_rebuild_attachments(); + post_upgrade_rebuild_attachments(); + break; + + case 23: + post_upgrade_add_tokenizer_table(); break; } } @@ -407,7 +411,7 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { } // Version 22 - private void post_rebuild_attachments() { + private void post_upgrade_rebuild_attachments() { try { exec_transaction(Db.TransactionType.RW, (cx) => { Db.Statement stmt = cx.prepare(""" @@ -471,6 +475,25 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase { } } + // Version 23 + private void post_upgrade_add_tokenizer_table() { + try { + string stemmer = find_appropriate_search_stemmer(); + debug("Creating tokenizer table using %s stemmer", stemmer); + + // These can't go in the .sql file because its schema (the stemmer + // algorithm) is determined at runtime. + exec(""" + CREATE VIRTUAL TABLE TokenizerTable USING fts3tokenize( + unicodesn, + "stemmer=%s" + ); + """.printf(stemmer)); + } catch (Error e) { + error("Error creating tokenizer table: %s", e.message); + } + } + private void on_prepare_database_connection(Db.Connection cx) throws Error { cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC); cx.set_foreign_keys(true); diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala new file mode 100644 index 00000000..52d0ba94 --- /dev/null +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -0,0 +1,121 @@ +/* Copyright 2014 Yorba Foundation + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +/** + * Internal implementation of {@link Geary.SearchQuery}. + */ + +private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { + /** + * Associated {@link ImapDB.Account}. + */ + public weak ImapDB.Account account { get; private set; } + + /** + * Whether or not the query has been parsed and processed prior to search submission. + */ + public bool parsed { get; set; default = false; } + + /** + * Determined by {@link strategy}. + */ + public bool allow_stemming { get; private set; } + + /** + * Minimum length of the term before stemming is allowed. + * + * This prevents short words that might be stemmed from being stemmed. + * + * Overridden by {@link allow_stemming}. Determined by {@link strategy}. + */ + public int min_term_length_for_stemming { get; private set; } + + /** + * Maximum difference in lengths between term and stemmed variant. + * + * This prevents long words from being stemmed to much shorter words (which creates + * opportunities for greedy matching). + * + * Overridden by {@link allow_stemming}. Determined by {@link strategy}. + */ + public int max_difference_term_stem_lengths { get; private set; } + + /** + * Maximum difference in lengths between a matched word and the stemmed variant it matched + * against. + * + * This prevents long words being matched to short stem variants (which creates opportunities + * for greedy matching). + * + * Overridden by {@link allow_stemming}. Determined by {@link strategy}. + */ + public int max_difference_match_stem_lengths { get; private set; } + + // Not using a MultiMap because we (might) need a guarantee of order. + private Gee.HashMap> field_map + = new Gee.HashMap>(); + private Gee.ArrayList all = new Gee.ArrayList(); + + public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Strategy strategy) { + base (query, strategy); + + this.account = account; + + switch (strategy) { + case Strategy.EXACT: + allow_stemming = false; + min_term_length_for_stemming = int.MAX; + max_difference_term_stem_lengths = 0; + max_difference_match_stem_lengths = 0; + break; + + case Strategy.CONSERVATIVE: + allow_stemming = true; + min_term_length_for_stemming = 6; + max_difference_term_stem_lengths = 2; + max_difference_match_stem_lengths = 2; + break; + + case Strategy.AGGRESSIVE: + allow_stemming = true; + min_term_length_for_stemming = 4; + max_difference_term_stem_lengths = 4; + max_difference_match_stem_lengths = 3; + break; + + case Strategy.HORIZON: + allow_stemming = true; + min_term_length_for_stemming = 0; + max_difference_term_stem_lengths = int.MAX; + max_difference_match_stem_lengths = int.MAX; + break; + + default: + assert_not_reached(); + } + } + + public void add_search_term(string? field, SearchTerm term) { + if (!field_map.has_key(field)) + field_map.set(field, new Gee.ArrayList()); + + field_map.get(field).add(term); + all.add(term); + } + + public Gee.Collection get_fields() { + return field_map.keys; + } + + public Gee.List? get_search_terms(string? field) { + return field_map.has_key(field) ? field_map.get(field) : null; + } + + public Gee.List? get_all_terms() { + return all; + } +} + diff --git a/src/engine/imap-db/imap-db-search-term.vala b/src/engine/imap-db/imap-db-search-term.vala new file mode 100644 index 00000000..c56a0471 --- /dev/null +++ b/src/engine/imap-db/imap-db-search-term.vala @@ -0,0 +1,62 @@ +/* Copyright 2014 Yorba Foundation + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +/** + * Various associated state with a single term in a {@link ImapDB.SearchQuery}. + */ + +private class Geary.ImapDB.SearchTerm : BaseObject { + /** + * The original tokenized search term with minimal other processing performed. + * + * For example, punctuation might be removed, but no casefolding has occurred. + */ + public string original { get; private set; } + + /** + * The parsed tokenized search term. + * + * Casefolding and other normalizing text operations have been performed. + */ + public string parsed { get; private set; } + + /** + * The stemmed search term. + * + * Only used if stemming is being done ''and'' the stem is different than the {@link parsed} + * term. + */ + public string? stemmed { get; private set; } + + /** + * A list of terms ready for binding to an SQLite statement. + * + * This should include prefix operators and quotes (i.e. ["party"] or [party*]). These texts + * are guaranteed not to be null or empty strings. + */ + public Gee.List sql { get; private set; default = new Gee.ArrayList(); } + + /** + * Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and + * there is no {@link stemmed} variant. + */ + public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } } + + public SearchTerm(string original, string parsed, string? stemmed, string? sql_parsed, string? sql_stemmed) { + this.original = original; + this.parsed = parsed; + this.stemmed = stemmed; + + // for now, only two variations: the parsed string and the stemmed; since stem is usually + // shorter (and will be first in the OR statement), include it first + if (!String.is_empty(sql_stemmed)) + sql.add(sql_stemmed); + + if (!String.is_empty(sql_parsed)) + sql.add(sql_parsed); + } +} + diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala index ef84ee6c..fc7c7e5f 100644 --- a/src/engine/imap-engine/imap-engine-generic-account.vala +++ b/src/engine/imap-engine/imap-engine-generic-account.vala @@ -824,6 +824,10 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount { return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable); } + public override Geary.SearchQuery open_search(string query, SearchQuery.Strategy strategy) { + return new ImapDB.SearchQuery(local, query, strategy); + } + public override async Gee.Collection? local_search_async(Geary.SearchQuery query, int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null, Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error { @@ -833,7 +837,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount { return yield local.search_async(query, limit, offset, folder_blacklist, search_ids, cancellable); } - public override async Gee.Collection? get_search_matches_async(Geary.SearchQuery query, + public override async Gee.Set? get_search_matches_async(Geary.SearchQuery query, Gee.Collection ids, Cancellable? cancellable = null) throws Error { return yield local.get_search_matches_async(query, check_ids(ids), cancellable); }