diff --git a/desktop/org.yorba.geary.gschema.xml b/desktop/org.yorba.geary.gschema.xml
index b619d137..3e5bb333 100644
--- a/desktop/org.yorba.geary.gschema.xml
+++ b/desktop/org.yorba.geary.gschema.xml
@@ -74,6 +74,13 @@
whether to compose emails in HTML
True to compose emails in HTML; false for plain text.
+
+
+ "conservative"
+ Advisory strategy for full-text searching
+ Acceptable values are EXACT, CONSERVATIVE, AGGRESSIVE, and HORIZON.
+
+
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index bbd8f91d..40184ce6 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -22,3 +22,4 @@ install(FILES version-019.sql DESTINATION ${SQL_DEST})
install(FILES version-020.sql DESTINATION ${SQL_DEST})
install(FILES version-021.sql DESTINATION ${SQL_DEST})
install(FILES version-022.sql DESTINATION ${SQL_DEST})
+install(FILES version-023.sql DESTINATION ${SQL_DEST})
diff --git a/sql/version-023.sql b/sql/version-023.sql
new file mode 100644
index 00000000..d2825168
--- /dev/null
+++ b/sql/version-023.sql
@@ -0,0 +1,21 @@
+--
+-- Database upgrade to add FTS tokenize virtual table, which allows for querying the tokenizer
+-- directly for stemmed words, and dropping the stemmed FTS table for an unstemmed one. We now
+-- use the stemmer manually to generate search queries.
+--
+
+DROP TABLE MessageSearchTable;
+
+CREATE VIRTUAL TABLE MessageSearchTable USING fts4(
+ body,
+ attachment,
+ subject,
+ from_field,
+ receivers,
+ cc,
+ bcc,
+
+ tokenize=simple,
+ prefix="2,4,6,8,10"
+);
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 13d1f2b4..dbd4d980 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -170,6 +170,8 @@ engine/imap-db/imap-db-email-identifier.vala
engine/imap-db/imap-db-folder.vala
engine/imap-db/imap-db-message-addresses.vala
engine/imap-db/imap-db-message-row.vala
+engine/imap-db/imap-db-search-query.vala
+engine/imap-db/imap-db-search-term.vala
engine/imap-db/imap-db-search-email-identifier.vala
engine/imap-db/outbox/smtp-outbox-email-identifier.vala
engine/imap-db/outbox/smtp-outbox-email-properties.vala
diff --git a/src/client/application/geary-config.vala b/src/client/application/geary-config.vala
index 2e843042..79761673 100644
--- a/src/client/application/geary-config.vala
+++ b/src/client/application/geary-config.vala
@@ -135,5 +135,43 @@ public class Configuration {
if (!settings.set_boolean(name, value))
message("Unable to set configuration value %s = %s", name, value.to_string());
}
+
+ public Geary.SearchQuery.Strategy get_search_strategy() {
+ switch (settings.get_string("search-strategy").down()) {
+ case "exact":
+ return Geary.SearchQuery.Strategy.EXACT;
+
+ case "aggressive":
+ return Geary.SearchQuery.Strategy.AGGRESSIVE;
+
+ case "horizon":
+ return Geary.SearchQuery.Strategy.HORIZON;
+
+ case "conservative":
+ default:
+ return Geary.SearchQuery.Strategy.CONSERVATIVE;
+ }
+ }
+
+ public void set_search_strategy(Geary.SearchQuery.Strategy strategy) {
+ switch (strategy) {
+ case Geary.SearchQuery.Strategy.EXACT:
+ settings.set_string("search-strategy", "exact");
+ break;
+
+ case Geary.SearchQuery.Strategy.AGGRESSIVE:
+ settings.set_string("search-strategy", "aggressive");
+ break;
+
+ case Geary.SearchQuery.Strategy.HORIZON:
+ settings.set_string("search-strategy", "horizon");
+ break;
+
+ case Geary.SearchQuery.Strategy.CONSERVATIVE:
+ default:
+ settings.set_string("search-strategy", "conservative");
+ break;
+ }
+ }
}
diff --git a/src/client/application/geary-controller.vala b/src/client/application/geary-controller.vala
index 808f3f4c..39f5c757 100644
--- a/src/client/application/geary-controller.vala
+++ b/src/client/application/geary-controller.vala
@@ -81,7 +81,7 @@ public class GearyController : Geary.BaseObject {
private const string MOVE_MESSAGE_TOOLTIP_MULTIPLE = _("Move conversations");
private const int SELECT_FOLDER_TIMEOUT_USEC = 100 * 1000;
- private const int SEARCH_TIMEOUT_MSEC = 100;
+ private const int SEARCH_TIMEOUT_MSEC = 250;
private const string PROP_ATTEMPT_OPEN_ACCOUNT = "attempt-open-account";
@@ -2512,7 +2512,8 @@ public class GearyController : Geary.BaseObject {
cancel_search(); // Stop any search in progress.
- folder.set_search_query(search_text, cancellable_search);
+ folder.search(search_text, GearyApplication.instance.config.get_search_strategy(),
+ cancellable_search);
main_window.folder_list.set_search(folder);
search_text_changed(main_window.main_toolbar.search_text);
@@ -2523,7 +2524,8 @@ public class GearyController : Geary.BaseObject {
// search after a quick delay when they finish typing.
if (search_timeout_id != 0)
Source.remove(search_timeout_id);
- search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout);
+
+ search_timeout_id = Timeout.add(SEARCH_TIMEOUT_MSEC, on_search_timeout, Priority.LOW);
}
private bool on_search_timeout() {
diff --git a/src/client/conversation-viewer/conversation-viewer.vala b/src/client/conversation-viewer/conversation-viewer.vala
index ce113eba..5bb48476 100644
--- a/src/client/conversation-viewer/conversation-viewer.vala
+++ b/src/client/conversation-viewer/conversation-viewer.vala
@@ -457,11 +457,28 @@ public class ConversationViewer : Gtk.Box {
}
}
- private void on_search_text_changed(string? query) {
+ private void on_search_text_changed(Geary.SearchQuery? query) {
if (query != null)
highlight_search_terms.begin();
}
+ // This applies a fudge-factor set of matches when the database results
+ // aren't entirely satisfactory, such as when you search for an email
+ // address and the database tokenizes out the @ and ., etc. It's not meant
+ // to be comprehensive, just a little extra highlighting applied to make
+ // the results look a little closer to what you typed.
+ private void add_literal_matches(string raw_query, Gee.Set? search_matches) {
+ foreach (string word in raw_query.split(" ")) {
+ if (word.has_suffix("\""))
+ word = word.substring(0, word.length - 1);
+ if (word.has_prefix("\""))
+ word = word.substring(1);
+
+ if (!Geary.String.is_empty_or_whitespace(word))
+ search_matches.add(word);
+ }
+ }
+
private async void highlight_search_terms() {
if (search_folder == null)
return;
@@ -475,8 +492,13 @@ public class ConversationViewer : Gtk.Box {
ids.add(email.id);
try {
- Gee.Collection? search_matches = yield search_folder.get_search_matches_async(
+ Gee.Set? search_matches = yield search_folder.get_search_matches_async(
ids, cancellable_fetch);
+ if (search_matches == null)
+ search_matches = new Gee.HashSet();
+
+ if (search_folder.search_query != null)
+ add_literal_matches(search_folder.search_query.raw, search_matches);
// Webkit's highlighting is ... weird. In order to actually see
// all the highlighting you're applying, it seems necessary to
@@ -484,8 +506,7 @@ public class ConversationViewer : Gtk.Box {
// seems that shorter strings will overwrite longer ones, and
// you're left with incomplete highlighting.
Gee.ArrayList ordered_matches = new Gee.ArrayList();
- if (search_matches != null)
- ordered_matches.add_all(search_matches);
+ ordered_matches.add_all(search_matches);
ordered_matches.sort((a, b) => a.length - b.length);
foreach(string match in ordered_matches)
diff --git a/src/engine/abstract/geary-abstract-account.vala b/src/engine/abstract/geary-abstract-account.vala
index 7c194f81..ea24d18b 100644
--- a/src/engine/abstract/geary-abstract-account.vala
+++ b/src/engine/abstract/geary-abstract-account.vala
@@ -118,11 +118,13 @@ public abstract class Geary.AbstractAccount : BaseObject, Geary.Account {
public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
+ public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy);
+
public abstract async Gee.Collection? local_search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null,
Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error;
- public abstract async Gee.Collection? get_search_matches_async(Geary.SearchQuery query,
+ public abstract async Gee.Set? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection ids, Cancellable? cancellable = null) throws Error;
public abstract async Gee.MultiMap? get_containing_folders_async(
diff --git a/src/engine/api/geary-account.vala b/src/engine/api/geary-account.vala
index a144566a..369bf041 100644
--- a/src/engine/api/geary-account.vala
+++ b/src/engine/api/geary-account.vala
@@ -322,6 +322,23 @@ public interface Geary.Account : BaseObject {
public abstract async Geary.Email local_fetch_email_async(Geary.EmailIdentifier email_id,
Geary.Email.Field required_fields, Cancellable? cancellable = null) throws Error;
+ /**
+ * Create a new {@link SearchQuery} for this {@link Account}.
+ *
+ * See {@link Geary.SearchQuery.Strategy} for more information about how its interpreted by the
+ * Engine. In particular, note that it's an advisory parameter only and may have no effect,
+ * especially on server searches. However, it may also have a dramatic effect on what search
+ * results are returned and so should be used with some caution. Whether this parameter is
+ * user-configurable, available through GSettings or another configuration mechanism, or simply
+ * baked into the caller's code is up to the caller. CONSERVATIVE is designed to be a good
+ * default.
+ *
+ * The SearchQuery object can only be used with calls into this Account.
+ *
+ * Dropping the last reference to the SearchQuery will close it.
+ */
+ public abstract Geary.SearchQuery open_search(string query, Geary.SearchQuery.Strategy strategy);
+
/**
* Performs a search with the given query. Optionally, a list of folders not to search
* can be passed as well as a list of email identifiers to restrict the search to only those messages.
@@ -335,9 +352,9 @@ public interface Geary.Account : BaseObject {
Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error;
/**
- * Given a list of mail IDs, returns a list of words that match for the query.
+ * Given a list of mail IDs, returns a set of casefolded words that match for the query.
*/
- public abstract async Gee.Collection? get_search_matches_async(Geary.SearchQuery query,
+ public abstract async Gee.Set? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection ids, Cancellable? cancellable = null) throws Error;
/**
diff --git a/src/engine/api/geary-search-folder.vala b/src/engine/api/geary-search-folder.vala
index 4d03421e..c064a62f 100644
--- a/src/engine/api/geary-search-folder.vala
+++ b/src/engine/api/geary-search-folder.vala
@@ -49,6 +49,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
}
+ public Geary.SearchQuery? search_query { get; private set; default = null; }
+
private Gee.HashSet exclude_folders = new Gee.HashSet();
private Geary.SpecialFolderType[] exclude_types = {
Geary.SpecialFolderType.SPAM,
@@ -56,7 +58,6 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
Geary.SpecialFolderType.DRAFTS,
// Orphan emails (without a folder) are also excluded; see ctor.
};
- private Geary.SearchQuery? search_query = null;
private Gee.TreeSet search_results;
private Geary.Nonblocking.Mutex result_mutex = new Geary.Nonblocking.Mutex();
@@ -64,7 +65,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
* Fired when the search query has changed. This signal is fired *after* the search
* has completed.
*/
- public signal void search_query_changed(string? query);
+ public signal void search_query_changed(Geary.SearchQuery? query);
public SearchFolder(Account account) {
base();
@@ -203,8 +204,8 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
/**
* Sets the keyword string for this search.
*/
- public void set_search_query(string query, Cancellable? cancellable = null) {
- set_search_query_async.begin(query, cancellable, on_set_search_query_complete);
+ public void search(string query, SearchQuery.Strategy strategy, Cancellable? cancellable = null) {
+ set_search_query_async.begin(query, strategy, cancellable, on_set_search_query_complete);
}
private void on_set_search_query_complete(Object? source, AsyncResult result) {
@@ -215,8 +216,9 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
}
- private async void set_search_query_async(string query, Cancellable? cancellable = null) throws Error {
- Geary.SearchQuery search_query = new Geary.SearchQuery(query);
+ private async void set_search_query_async(string query, SearchQuery.Strategy strategy,
+ Cancellable? cancellable) throws Error {
+ Geary.SearchQuery search_query = account.open_search(query, strategy);
int result_mutex_token = yield result_mutex.claim_async();
@@ -230,7 +232,7 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
result_mutex.release(ref result_mutex_token);
this.search_query = search_query;
- search_query_changed(search_query.raw);
+ search_query_changed(search_query);
if (error != null)
throw error;
@@ -425,13 +427,14 @@ public class Geary.SearchFolder : Geary.AbstractLocalFolder, Geary.FolderSupport
}
/**
- * Given a list of mail IDs, returns a list of words that match for the current
+ * Given a list of mail IDs, returns a set of casefolded words that match for the current
* search query.
*/
- public async Gee.Collection? get_search_matches_async(
+ public async Gee.Set? get_search_matches_async(
Gee.Collection ids, Cancellable? cancellable = null) throws Error {
if (search_query == null)
return null;
+
return yield account.get_search_matches_async(search_query, ids, cancellable);
}
diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala
index 51bcbd33..da9e1879 100644
--- a/src/engine/api/geary-search-query.vala
+++ b/src/engine/api/geary-search-query.vala
@@ -6,39 +6,63 @@
/**
* An object to hold state for various search subsystems that might need to
- * parse the same text string different ways. The only interaction the API
- * user should have with this is creating new ones and then passing them off to
- * the search methods in the engine.
+ * parse the same text string different ways.
*
- * TODO: support anything other than ImapDB.Account's search methods.
+ * The only interaction the API user should have with this is creating new ones and then passing
+ * them to the search methods in the Engine.
+ *
+ * @see Geary.Account.open_search
*/
-public class Geary.SearchQuery : BaseObject {
+
+public abstract class Geary.SearchQuery : BaseObject {
+ /**
+ * An advisory parameter regarding search quality, scope, and breadth.
+ *
+ * The Engine can perform searches based on (unspecified, uncontracted) textual variations of
+ * a query's search terms. Some of those variations may produce undesirable results due to
+ * "greedy" matching of terms. The Strategy parameter allows for an advisory to the Engine
+ * about how to use those textual variants, if any at all.
+ *
+ * This may be respected or ignored by the Engine. In particular, there's no guarantee it will
+ * have any effect on server search.
+ */
+ public enum Strategy {
+ /**
+ * Only return exact matches, perform no searches for textual variants.
+ *
+ * Note that Geary's search syntax does prefix-matching for unquoted strings. EXACT means
+ * exact ''prefix-''matching in this case.
+ */
+ EXACT,
+ /**
+ * Allow for searching for a small set of textual variants and small differences in search
+ * terms. This is a good default.
+ */
+ CONSERVATIVE,
+ /**
+ * Allow for searching for a broad set of textual variants and larger differences in
+ * search terms.
+ */
+ AGGRESSIVE,
+ /**
+ * Search for all textual variants, i.e. "the sky's the limit."
+ */
+ HORIZON
+ }
+
+ /**
+ * The original user search text.
+ */
public string raw { get; private set; }
- public bool parsed { get; internal set; default = false; }
- // Not using a MultiMap because we (might) need a guarantee of order.
- private Gee.HashMap> field_map
- = new Gee.HashMap>();
+ /**
+ * The selected {@link Strategy} quality.
+ */
+ public Strategy strategy { get; private set; }
- public SearchQuery(string query) {
- raw = query;
- }
-
- internal void add_token(string? field, string token) {
- if (!field_map.has_key(field))
- field_map.set(field, new Gee.ArrayList());
-
- field_map.get(field).add(token);
- }
-
- internal Gee.Collection get_fields() {
- return field_map.keys;
- }
-
- internal Gee.List? get_tokens(string? field) {
- if (!field_map.has_key(field))
- return null;
-
- return field_map.get(field);
+ protected SearchQuery(string raw, Strategy strategy) {
+ this.raw = raw;
+ this.strategy = strategy;
}
}
+
diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala
index c4a370e8..33b120e8 100644
--- a/src/engine/imap-db/imap-db-account.vala
+++ b/src/engine/imap-db/imap-db-account.vala
@@ -17,18 +17,6 @@ private class Geary.ImapDB.Account : BaseObject {
}
}
- private class SearchOffset {
- public int column; // Column in search table
- public int byte_offset; // Offset (in bytes) of search term in string
- public int size; // Size (in bytes) of the search term in string
-
- public SearchOffset(string[] offset_string) {
- column = int.parse(offset_string[0]);
- byte_offset = int.parse(offset_string[2]);
- size = int.parse(offset_string[3]);
- }
- }
-
public signal void email_sent(Geary.RFC822.Message rfc822);
// Only available when the Account is opened
@@ -61,6 +49,14 @@ private class Geary.ImapDB.Account : BaseObject {
throw new EngineError.OPEN_REQUIRED("Database not open");
}
+ private ImapDB.SearchQuery check_search_query(Geary.SearchQuery q) throws Error {
+ ImapDB.SearchQuery? query = q as ImapDB.SearchQuery;
+ if (query == null || query.account != this)
+ throw new EngineError.BAD_PARAMETERS("Geary.SearchQuery not associated with %s", name);
+
+ return query;
+ }
+
public static void get_imap_db_storage_locations(File user_data_dir, out File db_file,
out File attachments_dir) {
db_file = ImapDB.Database.get_db_file(user_data_dir);
@@ -715,7 +711,87 @@ private class Geary.ImapDB.Account : BaseObject {
return null;
}
- private void prepare_search_query(Geary.SearchQuery query) {
+ /**
+ * This method is used to convert an unquoted user-entered search terms into a stemmed search
+ * term.
+ *
+ * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
+ * tastes when coupled with prefix-matching of all unquoted terms (see
+ * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
+ * designed to dampen that aggressiveness without losing the benefits of stemming entirely.
+ *
+ * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
+ * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
+ * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
+ * is still installed in Geary. Thus, we are now in the position to search for the original
+ * term and its stemmed variant, then do post-search processing to strip results which are
+ * too "greedy" due to prefix-matching the stemmed variant.
+ *
+ * Some heuristics are in place simply to determine if stemming should occur:
+ *
+ * # If stemming is unallowed, no stemming occurs.
+ * # If the term is < min. term length for stemming, no stemming occurs.
+ * # If the stemmer returns a stem that is the same as the original term, no stemming occurs.
+ * # If the difference between the stemmed word and the original term is more than
+ * maximum allowed, no stemming occurs. This works under the assumption that if
+ * the user has typed a long word, they do not want to "go back" to searching for a much
+ * shorter version of it. (For example, "accountancies" stems to "account").
+ *
+ * Otherwise, the stem for the term is returned.
+ */
+ private string? stem_search_term(ImapDB.SearchQuery query, string term) {
+ if (!query.allow_stemming)
+ return null;
+
+ int term_length = term.length;
+ if (term_length < query.min_term_length_for_stemming)
+ return null;
+
+ string? stemmed = null;
+ try {
+ Db.Statement stmt = db.prepare("""
+ SELECT token
+ FROM TokenizerTable
+ WHERE input=?
+ """);
+ stmt.bind_string(0, term);
+
+ // get stemmed string; if no result, fall through
+ Db.Result result = stmt.exec();
+ if (!result.finished)
+ stemmed = result.string_at(0);
+ else
+ debug("No stemmed term returned for \"%s\"", term);
+ } catch (Error err) {
+ debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
+
+ // fall-through
+ }
+
+ if (String.is_empty(stemmed)) {
+ debug("Empty stemmed term returned for \"%s\"", term);
+
+ return null;
+ }
+
+ // If same term returned, treat as non-stemmed
+ if (stemmed == term)
+ return null;
+
+ // Don't search for stemmed words that are significantly shorter than the user's search term
+ if (term_length - stemmed.length > query.max_difference_term_stem_lengths) {
+ debug("Stemmed \"%s\" dropped searching for \"%s\": too much distance in terms",
+ stemmed, term);
+
+ return null;
+ }
+
+ debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
+
+ return stemmed;
+ }
+
+ private void prepare_search_query(ImapDB.SearchQuery query) {
if (query.parsed)
return;
@@ -753,16 +829,31 @@ private class Geary.ImapDB.Account : BaseObject {
--quotes;
}
+ SearchTerm? term;
if (in_quote) {
// HACK: this helps prevent a syntax error when the user types
// something like from:"somebody". If we ever properly support
// quotes after : we can get rid of this.
- s = s.replace(":", " ");
+ term = new SearchTerm(s, s, null, s.replace(":", " "), null);
} else {
+ string original = s;
+
+ // some common search phrases we don't respect and therefore don't want to fall
+ // through to search results
string lower = s.down();
- if (lower == "" || lower == "and" || lower == "or" || lower == "not" || lower == "near"
- || lower.has_prefix("near/"))
- continue;
+ switch (lower) {
+ case "":
+ case "and":
+ case "or":
+ case "not":
+ case "near":
+ continue;
+
+ default:
+ if (lower.has_prefix("near/"))
+ continue;
+ break;
+ }
if (s.has_prefix("-"))
s = s.substring(1);
@@ -775,13 +866,29 @@ private class Geary.ImapDB.Account : BaseObject {
if (parts.length > 1)
field = extract_field_from_token(parts, ref s);
- s = "\"" + s + "*\"";
+ // SQL MATCH syntax for parsed term
+ string? sql_s = "%s*".printf(s);
+
+ // stem the word, but if stemmed and stem is simply shorter version of original
+ // term, only prefix-match search for it (i.e. avoid searching for
+ // [archive* OR archiv*] when that's the same as [archiv*]), otherwise search for
+ // both
+ string? stemmed = stem_search_term(query, s);
+
+ string? sql_stemmed = null;
+ if (stemmed != null) {
+ sql_stemmed = "%s*".printf(stemmed);
+ if (s.has_prefix(stemmed))
+ sql_s = null;
+ }
+
+ term = new SearchTerm(original, s, stemmed, sql_s, sql_stemmed);
}
if (in_quote && quotes % 2 != 0)
in_quote = false;
- query.add_token(field, s);
+ query.add_search_term(field, term);
}
assert(!in_quote);
@@ -790,28 +897,53 @@ private class Geary.ImapDB.Account : BaseObject {
}
// Return a map of column -> phrase, to use as WHERE column MATCH 'phrase'.
- private Gee.HashMap get_query_phrases(Geary.SearchQuery query) {
+ private Gee.HashMap get_query_phrases(ImapDB.SearchQuery query) {
prepare_search_query(query);
Gee.HashMap phrases = new Gee.HashMap();
foreach (string? field in query.get_fields()) {
- string? phrase = null;
- Gee.List? tokens = query.get_tokens(field);
- if (tokens != null) {
- string[] array = tokens.to_array();
- // HACK: work around a bug in vala where it's not null-terminating
- // arrays created from generic-typed functions (Gee.Collection.to_array)
- // before passing them off to g_strjoinv. Simply making a copy to a
- // local proper string array adds the null for us.
- string[] copy = new string[array.length];
- for (int i = 0; i < array.length; ++i)
- copy[i] = array[i];
- phrase = string.joinv(" ", copy).strip();
+ Gee.List? terms = query.get_search_terms(field);
+ if (terms == null || terms.size == 0)
+ continue;
+
+ // Each SearchTerm is an AND but the SQL text within in are OR ... this allows for
+ // each user term to be AND but the variants of each term are or. So, if terms are
+ // [party] and [eventful] and stems are [parti] and [event], the search would be:
+ //
+ // (party* OR parti*) AND (eventful* OR event*)
+ //
+ // Obviously with stemming there's the possibility of the stemmed variant being nothing
+ // but a broader search of the original term (such as event* and eventful*) but do both
+ // to determine from each hit result which term caused the hit, and if it's too greedy
+ // a match of the stemmed variant, it can be stripped from the results.
+ //
+ // Note that this uses SQLite's "standard" query syntax for MATCH, where AND is implied
+ // (and would be treated as search term if included), parentheses are not allowed, and
+ // OR has a higher precendence than AND. So the above example in standard syntax is:
+ //
+ // party* OR parti* eventful* OR event*
+ StringBuilder builder = new StringBuilder();
+ foreach (SearchTerm term in terms) {
+ if (term.sql.size == 0)
+ continue;
+
+ if (term.is_exact) {
+ builder.append_printf("%s ", term.parsed);
+ } else {
+ bool is_first_sql = true;
+ foreach (string sql in term.sql) {
+ if (!is_first_sql)
+ builder.append(" OR ");
+
+ builder.append_printf("%s ", sql);
+ is_first_sql = false;
+ }
+ }
}
- if (!Geary.String.is_empty(phrase))
- phrases.set((field == null ? "MessageSearchTable" : field), phrase);
+ phrases.set(field ?? "MessageSearchTable", builder.str);
}
+
return phrases;
}
@@ -865,19 +997,39 @@ private class Geary.ImapDB.Account : BaseObject {
return sql.str;
}
- public async Gee.Collection? search_async(Geary.SearchQuery query,
+ public async Gee.Collection? search_async(Geary.SearchQuery q,
int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null,
- Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error {
+ Gee.Collection? search_ids = null, Cancellable? cancellable = null)
+ throws Error {
check_open();
+ ImapDB.SearchQuery query = check_search_query(q);
Gee.HashMap query_phrases = get_query_phrases(query);
if (query_phrases.size == 0)
return null;
- Gee.ArrayList search_results
- = new Gee.ArrayList();
-
+ // Do this outside of transaction to catch invalid search ids up-front
string? search_ids_sql = get_search_ids_sql(search_ids);
+
+ // for some searches, results are stripped if they're too "greedy", but this requires
+ // examining the matched text, which has an expense to fetch, so avoid doing so unless
+ // necessary
+ bool strip_results = true;
+
+ // HORIZON strategy is configured in such a way to allow all stemmed variants to match,
+ // so don't do any stripping in that case
+ //
+ // If any of the search terms is exact-match (no prefix matching) or none have stemmed
+ // variants, then don't do stripping of "greedy" stemmed matching (because in both cases,
+ // there are none)
+ if (query.strategy == Geary.SearchQuery.Strategy.HORIZON)
+ strip_results = false;
+ else if (traverse(query.get_all_terms()).any(term => term.stemmed == null || term.is_exact))
+ strip_results = false;
+
+ Gee.Set unstripped_ids = new Gee.HashSet();
+ Gee.Map>? search_results = null;
+
yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
string blacklisted_ids_sql = do_get_blacklisted_message_ids_sql(
folder_blacklist, cx, cancellable);
@@ -919,95 +1071,117 @@ private class Geary.ImapDB.Account : BaseObject {
stmt.bind_int(bind_index++, offset);
}
+ Gee.HashMap id_map = new Gee.HashMap(
+ Collection.int64_hash_func, Collection.int64_equal_func);
+
Db.Result result = stmt.exec(cancellable);
while (!result.finished) {
- int64 id = result.int64_at(0);
+ int64 message_id = result.int64_at(0);
int64 internaldate_time_t = result.int64_at(1);
DateTime? internaldate = (internaldate_time_t == -1
? null : new DateTime.from_unix_local(internaldate_time_t));
- search_results.add(new ImapDB.SearchEmailIdentifier(id, internaldate));
+
+ ImapDB.EmailIdentifier id = new ImapDB.SearchEmailIdentifier(message_id, internaldate);
+
+ unstripped_ids.add(id);
+ id_map.set(message_id, id);
result.next(cancellable);
}
+ if (!strip_results)
+ return Db.TransactionOutcome.DONE;
+
+ search_results = do_get_search_matches(cx, query, id_map, cancellable);
+
return Db.TransactionOutcome.DONE;
}, cancellable);
- return (search_results.size == 0 ? null : search_results);
- }
-
- // This applies a fudge-factor set of matches when the database results
- // aren't entirely satisfactory, such as when you search for an email
- // address and the database tokenizes out the @ and ., etc. It's not meant
- // to be comprehensive, just a little extra highlighting applied to make
- // the results look a little closer to what you typed.
- private void add_literal_matches(string raw_query, Gee.Set search_matches) {
- foreach (string word in raw_query.split(" ")) {
- if (word.has_suffix("\""))
- word = word.substring(0, word.length - 1);
- if (word.has_prefix("\""))
- word = word.substring(1);
-
- if (!String.is_empty_or_whitespace(word))
- search_matches.add(word);
- }
- }
-
- public async Gee.Collection? get_search_matches_async(Geary.SearchQuery query,
- Gee.Collection ids, Cancellable? cancellable = null) throws Error {
- check_open();
-
- Gee.HashMap query_phrases = get_query_phrases(query);
- if (query_phrases.size == 0)
+ if (unstripped_ids == null || unstripped_ids.size == 0)
return null;
- Gee.Set search_matches = new Gee.HashSet();
+ if (!strip_results)
+ return unstripped_ids;
- yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
- StringBuilder sql = new StringBuilder();
- sql.append("""
- SELECT offsets(MessageSearchTable), *
- FROM MessageSearchTable
- WHERE docid IN (
- """);
- sql_append_ids(sql,
- Geary.traverse(ids).map(id => id.message_id).to_gee_iterable());
- sql.append(")");
- sql_add_query_phrases(sql, query_phrases);
-
- Db.Statement stmt = cx.prepare(sql.str);
- sql_bind_query_phrases(stmt, 0, query_phrases);
-
- Db.Result result = stmt.exec(cancellable);
- while (!result.finished) {
- // Build a list of search offsets.
- string[] offset_array = result.nonnull_string_at(0).split(" ");
- Gee.ArrayList all_offsets = new Gee.ArrayList();
- int j = 0;
- while (true) {
- all_offsets.add(new SearchOffset(offset_array[j:j+4]));
-
- j += 4;
- if (j >= offset_array.length)
+ // at this point, there should be some "full" search results to strip from
+ assert(search_results != null && search_results.size > 0);
+
+ strip_greedy_results(query, search_results);
+
+ return search_results.size == 0 ? null : search_results.keys;
+ }
+
+ // Strip out search results that only contain a hit due to "greedy" matching of the stemmed
+ // variants on all search terms
+ private void strip_greedy_results(ImapDB.SearchQuery query,
+ Gee.Map> search_results) {
+ int prestripped_results = search_results.size;
+ Gee.MapIterator> iter = search_results.map_iterator();
+ while (iter.next()) {
+ // For each matched string in this message, retain the message in the search results
+ // if it prefix-matches any of the straight-up parsed terms or matches a stemmed
+ // variant (with only max. difference in their lengths allowed, i.e. not a "greedy"
+ // match)
+ bool good_match_found = false;
+ foreach (string match in iter.get_value()) {
+ foreach (SearchTerm term in query.get_all_terms()) {
+ // if prefix-matches parsed term, then don't strip
+ if (match.has_prefix(term.parsed)) {
+ good_match_found = true;
+
break;
+ }
+
+ // if prefix-matches stemmed term w/o doing so greedily, then don't strip
+ if (term.stemmed != null && match.has_prefix(term.stemmed)) {
+ int diff = match.length - term.stemmed.length;
+ if (diff <= query.max_difference_match_stem_lengths) {
+ good_match_found = true;
+
+ break;
+ }
+ }
}
- // Iterate over the offset list, scrape strings from the database, and push
- // the results into our return set.
- foreach(SearchOffset offset in all_offsets) {
- string text = result.nonnull_string_at(offset.column + 1);
- search_matches.add(text[offset.byte_offset : offset.byte_offset + offset.size].down());
- }
-
- result.next(cancellable);
+ if (good_match_found)
+ break;
}
+ if (!good_match_found)
+ iter.unset();
+ }
+
+ debug("Stripped %d emails from search for [%s] due to greedy stem matching",
+ prestripped_results - search_results.size, query.raw);
+ }
+
+ public async Gee.Set? get_search_matches_async(Geary.SearchQuery q,
+ Gee.Collection ids, Cancellable? cancellable = null) throws Error {
+ check_open();
+ ImapDB.SearchQuery query = check_search_query(q);
+
+ Gee.Set? search_matches = null;
+ yield db.exec_transaction_async(Db.TransactionType.RO, (cx) => {
+ Gee.HashMap id_map = new Gee.HashMap<
+ int64?, ImapDB.EmailIdentifier>(Collection.int64_hash_func, Collection.int64_equal_func);
+ foreach (ImapDB.EmailIdentifier id in ids)
+ id_map.set(id.message_id, id);
+
+ Gee.Map>? match_map =
+ do_get_search_matches(cx, query, id_map, cancellable);
+ if (match_map == null || match_map.size == 0)
+ return Db.TransactionOutcome.DONE;
+
+ strip_greedy_results(query, match_map);
+
+ search_matches = new Gee.HashSet();
+ foreach (Gee.Set matches in match_map.values)
+ search_matches.add_all(matches);
+
return Db.TransactionOutcome.DONE;
}, cancellable);
- add_literal_matches(query.raw, search_matches);
-
- return (search_matches.size == 0 ? null : search_matches);
+ return search_matches;
}
public async Geary.Email fetch_email_async(ImapDB.EmailIdentifier email_id,
@@ -1561,5 +1735,69 @@ private class Geary.ImapDB.Account : BaseObject {
unread_change.get(path));
}
}
+
+ // Not using a MultiMap because when traversing want to process all values at once per iteration,
+ // not per key-value
+ public Gee.Map>? do_get_search_matches(Db.Connection cx,
+ ImapDB.SearchQuery query, Gee.Map id_map, Cancellable? cancellable)
+ throws Error {
+ if (id_map.size == 0)
+ return null;
+
+ Gee.HashMap query_phrases = get_query_phrases(query);
+ if (query_phrases.size == 0)
+ return null;
+
+ StringBuilder sql = new StringBuilder();
+ sql.append("""
+ SELECT docid, offsets(MessageSearchTable), *
+ FROM MessageSearchTable
+ WHERE docid IN (
+ """);
+ sql_append_ids(sql, id_map.keys);
+ sql.append(")");
+ sql_add_query_phrases(sql, query_phrases);
+
+ Db.Statement stmt = cx.prepare(sql.str);
+ sql_bind_query_phrases(stmt, 0, query_phrases);
+
+ Gee.Map> search_matches = new Gee.HashMap<
+ ImapDB.EmailIdentifier, Gee.Set>();
+
+ Db.Result result = stmt.exec(cancellable);
+ while (!result.finished) {
+ int64 docid = result.rowid_at(0);
+ assert(id_map.contains(docid));
+ ImapDB.EmailIdentifier id = id_map.get(docid);
+
+ // offsets() function returns a list of 4 strings that are ints indicating position
+ // and length of match string in search table corpus
+ string[] offset_array = result.nonnull_string_at(1).split(" ");
+
+ Gee.Set matches = new Gee.HashSet();
+
+ int j = 0;
+ while (true) {
+ unowned string[] offset_string = offset_array[j:j+4];
+
+ int column = int.parse(offset_string[0]);
+ int byte_offset = int.parse(offset_string[2]);
+ int size = int.parse(offset_string[3]);
+
+ unowned string text = result.nonnull_string_at(column + 2);
+ matches.add(text[byte_offset : byte_offset + size].down());
+
+ j += 4;
+ if (j >= offset_array.length)
+ break;
+ }
+
+ search_matches.set(id, matches);
+
+ result.next(cancellable);
+ }
+
+ return search_matches.size > 0 ? search_matches : null;
+ }
}
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index 704d7309..533209f3 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -107,7 +107,11 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
break;
case 22:
- post_rebuild_attachments();
+ post_upgrade_rebuild_attachments();
+ break;
+
+ case 23:
+ post_upgrade_add_tokenizer_table();
break;
}
}
@@ -407,7 +411,7 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
// Version 22
- private void post_rebuild_attachments() {
+ private void post_upgrade_rebuild_attachments() {
try {
exec_transaction(Db.TransactionType.RW, (cx) => {
Db.Statement stmt = cx.prepare("""
@@ -471,6 +475,25 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
}
}
+ // Version 23
+ private void post_upgrade_add_tokenizer_table() {
+ try {
+ string stemmer = find_appropriate_search_stemmer();
+ debug("Creating tokenizer table using %s stemmer", stemmer);
+
+ // These can't go in the .sql file because its schema (the stemmer
+ // algorithm) is determined at runtime.
+ exec("""
+ CREATE VIRTUAL TABLE TokenizerTable USING fts3tokenize(
+ unicodesn,
+ "stemmer=%s"
+ );
+ """.printf(stemmer));
+ } catch (Error e) {
+ error("Error creating tokenizer table: %s", e.message);
+ }
+ }
+
private void on_prepare_database_connection(Db.Connection cx) throws Error {
cx.set_busy_timeout_msec(Db.Connection.RECOMMENDED_BUSY_TIMEOUT_MSEC);
cx.set_foreign_keys(true);
diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala
new file mode 100644
index 00000000..52d0ba94
--- /dev/null
+++ b/src/engine/imap-db/imap-db-search-query.vala
@@ -0,0 +1,121 @@
+/* Copyright 2014 Yorba Foundation
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later). See the COPYING file in this distribution.
+ */
+
+/**
+ * Internal implementation of {@link Geary.SearchQuery}.
+ */
+
+private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
+ /**
+ * Associated {@link ImapDB.Account}.
+ */
+ public weak ImapDB.Account account { get; private set; }
+
+ /**
+ * Whether or not the query has been parsed and processed prior to search submission.
+ */
+ public bool parsed { get; set; default = false; }
+
+ /**
+ * Determined by {@link strategy}.
+ */
+ public bool allow_stemming { get; private set; }
+
+ /**
+ * Minimum length of the term before stemming is allowed.
+ *
+ * This prevents short words that might be stemmed from being stemmed.
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link strategy}.
+ */
+ public int min_term_length_for_stemming { get; private set; }
+
+ /**
+ * Maximum difference in lengths between term and stemmed variant.
+ *
+ * This prevents long words from being stemmed to much shorter words (which creates
+ * opportunities for greedy matching).
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link strategy}.
+ */
+ public int max_difference_term_stem_lengths { get; private set; }
+
+ /**
+ * Maximum difference in lengths between a matched word and the stemmed variant it matched
+ * against.
+ *
+ * This prevents long words being matched to short stem variants (which creates opportunities
+ * for greedy matching).
+ *
+ * Overridden by {@link allow_stemming}. Determined by {@link strategy}.
+ */
+ public int max_difference_match_stem_lengths { get; private set; }
+
+ // Not using a MultiMap because we (might) need a guarantee of order.
+ private Gee.HashMap> field_map
+ = new Gee.HashMap>();
+ private Gee.ArrayList all = new Gee.ArrayList();
+
+ public SearchQuery(ImapDB.Account account, string query, Geary.SearchQuery.Strategy strategy) {
+ base (query, strategy);
+
+ this.account = account;
+
+ switch (strategy) {
+ case Strategy.EXACT:
+ allow_stemming = false;
+ min_term_length_for_stemming = int.MAX;
+ max_difference_term_stem_lengths = 0;
+ max_difference_match_stem_lengths = 0;
+ break;
+
+ case Strategy.CONSERVATIVE:
+ allow_stemming = true;
+ min_term_length_for_stemming = 6;
+ max_difference_term_stem_lengths = 2;
+ max_difference_match_stem_lengths = 2;
+ break;
+
+ case Strategy.AGGRESSIVE:
+ allow_stemming = true;
+ min_term_length_for_stemming = 4;
+ max_difference_term_stem_lengths = 4;
+ max_difference_match_stem_lengths = 3;
+ break;
+
+ case Strategy.HORIZON:
+ allow_stemming = true;
+ min_term_length_for_stemming = 0;
+ max_difference_term_stem_lengths = int.MAX;
+ max_difference_match_stem_lengths = int.MAX;
+ break;
+
+ default:
+ assert_not_reached();
+ }
+ }
+
+ public void add_search_term(string? field, SearchTerm term) {
+ if (!field_map.has_key(field))
+ field_map.set(field, new Gee.ArrayList());
+
+ field_map.get(field).add(term);
+ all.add(term);
+ }
+
+ public Gee.Collection get_fields() {
+ return field_map.keys;
+ }
+
+ public Gee.List? get_search_terms(string? field) {
+ return field_map.has_key(field) ? field_map.get(field) : null;
+ }
+
+ public Gee.List? get_all_terms() {
+ return all;
+ }
+}
+
diff --git a/src/engine/imap-db/imap-db-search-term.vala b/src/engine/imap-db/imap-db-search-term.vala
new file mode 100644
index 00000000..c56a0471
--- /dev/null
+++ b/src/engine/imap-db/imap-db-search-term.vala
@@ -0,0 +1,62 @@
+/* Copyright 2014 Yorba Foundation
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later). See the COPYING file in this distribution.
+ */
+
+/**
+ * Various associated state with a single term in a {@link ImapDB.SearchQuery}.
+ */
+
+private class Geary.ImapDB.SearchTerm : BaseObject {
+ /**
+ * The original tokenized search term with minimal other processing performed.
+ *
+ * For example, punctuation might be removed, but no casefolding has occurred.
+ */
+ public string original { get; private set; }
+
+ /**
+ * The parsed tokenized search term.
+ *
+ * Casefolding and other normalizing text operations have been performed.
+ */
+ public string parsed { get; private set; }
+
+ /**
+ * The stemmed search term.
+ *
+ * Only used if stemming is being done ''and'' the stem is different than the {@link parsed}
+ * term.
+ */
+ public string? stemmed { get; private set; }
+
+ /**
+ * A list of terms ready for binding to an SQLite statement.
+ *
+ * This should include prefix operators and quotes (i.e. ["party"] or [party*]). These texts
+ * are guaranteed not to be null or empty strings.
+ */
+ public Gee.List sql { get; private set; default = new Gee.ArrayList(); }
+
+ /**
+ * Returns true if the {@link parsed} term is exact-match only (i.e. starts with quotes) and
+ * there is no {@link stemmed} variant.
+ */
+ public bool is_exact { get { return parsed.has_prefix("\"") && stemmed == null; } }
+
+ public SearchTerm(string original, string parsed, string? stemmed, string? sql_parsed, string? sql_stemmed) {
+ this.original = original;
+ this.parsed = parsed;
+ this.stemmed = stemmed;
+
+ // for now, only two variations: the parsed string and the stemmed; since stem is usually
+ // shorter (and will be first in the OR statement), include it first
+ if (!String.is_empty(sql_stemmed))
+ sql.add(sql_stemmed);
+
+ if (!String.is_empty(sql_parsed))
+ sql.add(sql_parsed);
+ }
+}
+
diff --git a/src/engine/imap-engine/imap-engine-generic-account.vala b/src/engine/imap-engine/imap-engine-generic-account.vala
index ef84ee6c..fc7c7e5f 100644
--- a/src/engine/imap-engine/imap-engine-generic-account.vala
+++ b/src/engine/imap-engine/imap-engine-generic-account.vala
@@ -824,6 +824,10 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
return yield local.fetch_email_async(check_id(email_id), required_fields, cancellable);
}
+ public override Geary.SearchQuery open_search(string query, SearchQuery.Strategy strategy) {
+ return new ImapDB.SearchQuery(local, query, strategy);
+ }
+
public override async Gee.Collection? local_search_async(Geary.SearchQuery query,
int limit = 100, int offset = 0, Gee.Collection? folder_blacklist = null,
Gee.Collection? search_ids = null, Cancellable? cancellable = null) throws Error {
@@ -833,7 +837,7 @@ private abstract class Geary.ImapEngine.GenericAccount : Geary.AbstractAccount {
return yield local.search_async(query, limit, offset, folder_blacklist, search_ids, cancellable);
}
- public override async Gee.Collection? get_search_matches_async(Geary.SearchQuery query,
+ public override async Gee.Set? get_search_matches_async(Geary.SearchQuery query,
Gee.Collection ids, Cancellable? cancellable = null) throws Error {
return yield local.get_search_matches_async(query, check_ids(ids), cancellable);
}