From 58cae0ae40858682097de7bb06056de4a03ec74a Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Thu, 14 Jan 2021 21:20:25 +1100 Subject: [PATCH] Geary.ImapDB.Account: Drop post-search stemmed term greedy match removal Stop post-processing search results by dropping results that contain a matched term that is longer by some criterion than a stemmed term. Since this cannot be specified by SQLite's FTS queries, it has to be done outside of the search, which can have a substantial impact on performance, and either means running multiple queries outside of a transaction to get the required number of search results (potentially a large number of times), running the pruning within a transaction (potentially blocking the DB for a large length of time), or returning the wrong number of search results (potentially confusing the caller). Because of these disadvantages, and since SearchQuery's maximum difference in lengths between term and stemmed variant helps to prevent greedy matching anyway, just drop the post processing. --- src/engine/api/geary-search-query.vala | 26 ----------- src/engine/imap-db/imap-db-account.vala | 62 ------------------------- 2 files changed, 88 deletions(-) diff --git a/src/engine/api/geary-search-query.vala b/src/engine/api/geary-search-query.vala index 62f7416d..d9603d66 100644 --- a/src/engine/api/geary-search-query.vala +++ b/src/engine/api/geary-search-query.vala @@ -121,32 +121,6 @@ public abstract class Geary.SearchQuery : BaseObject { return max; } - /** - * Maximum difference in lengths between a matched word and the stemmed variant it matched - * against. - * - * This prevents long words being matched to short stem - * variants (which creates opportunities for greedy matching). - */ - internal int get_max_difference_match_stem_lengths() { - var max = 0; - switch (this) { - case EXACT: - max = 0; - break; - case CONSERVATIVE: - max = 2; - break; - case AGGRESSIVE: - max = 3; - break; - case HORIZON: - max = int.MAX; - break; - } - return max; - } - } diff --git a/src/engine/imap-db/imap-db-account.vala b/src/engine/imap-db/imap-db-account.vala index e5998afb..e9d3abc0 100644 --- a/src/engine/imap-db/imap-db-account.vala +++ b/src/engine/imap-db/imap-db-account.vala @@ -633,67 +633,9 @@ private class Geary.ImapDB.Account : BaseObject { }, cancellable); debug("Matching emails found: %d", matching_ids.size); - - if (query.has_stemmed_terms && search_matches != null) { - strip_greedy_results(query, matching_ids, search_matches); - } - - debug("Final search matches: %d", matching_ids.size); return matching_ids.is_empty ? null : matching_ids; } - // Strip out from the given collection of matching ids and results - // for any search results that only contain a hit due to "greedy" - // matching of the stemmed variants on all search terms. - private void strip_greedy_results(SearchQuery query, - Gee.Collection matches, - Gee.Map> results) { - int prestripped_results = matches.size; - // Gee.Iterator iter = matches.iterator(); - // while (iter.next()) { - // // For each matched string in this message, retain the message in the search results - // // if it prefix-matches any of the straight-up parsed terms or matches a stemmed - // // variant (with only max. difference in their lengths allowed, i.e. not a "greedy" - // // match) - // EmailIdentifier id = iter.get(); - // bool good_match_found = false; - // Gee.Set? result = results.get(id); - // if (result != null) { - // foreach (string match in result) { - // foreach (SearchQuery.Term term in query.get_all_terms()) { - // // if prefix-matches parsed term, then don't strip - // if (match.has_prefix(term.parsed)) { - // good_match_found = true; - // break; - // } - - // // if prefix-matches stemmed term w/o doing so - // // greedily, then don't strip - // if (term.stemmed != null && match.has_prefix(term.stemmed)) { - // int diff = match.length - term.stemmed.length; - // if (diff <= query.max_difference_match_stem_lengths) { - // good_match_found = true; - // break; - // } - // } - // } - // } - - // if (good_match_found) { - // break; - // } - // } - - // if (!good_match_found) { - // iter.remove(); - // matches.remove(id); - // } - // } - - debug("Stripped %d emails from search for [%s] due to greedy stem matching", - prestripped_results - matches.size, query.raw); - } - public async Gee.Set? get_search_matches_async(Geary.SearchQuery q, Gee.Collection ids, Cancellable? cancellable = null) throws Error { check_open(); @@ -714,10 +656,6 @@ private class Geary.ImapDB.Account : BaseObject { return Db.TransactionOutcome.DONE; } - if (query.has_stemmed_terms) { - strip_greedy_results(query, ids, match_map); - } - search_matches = new Gee.HashSet(); foreach (Gee.Set matches in match_map.values) search_matches.add_all(matches);