From 968bc1a9e82d275ffbfd3825d3aa9e4597d9bd4b Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Sun, 13 Sep 2020 16:27:51 +1000 Subject: [PATCH] ImapDb.SearchQuery: Use libstemmer for stemming Instead of shipping our own forked stemmer via SQLite, add a dependency on libstemmer and use that instead. --- .gitlab-ci.yml | 10 +- BUILDING.md | 8 +- bindings/vapi/libstemmer.vapi | 38 +++++++ meson.build | 8 ++ src/engine/imap-db/imap-db-search-query.vala | 102 +++++++++++-------- src/engine/meson.build | 1 + src/meson.build | 1 + 7 files changed, 118 insertions(+), 50 deletions(-) create mode 100644 bindings/vapi/libstemmer.vapi diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 82f6ab3b..80be0f43 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,9 +26,9 @@ variables: meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool - libappstream-glib-devel libgee-devel libhandy1-devel libpeas-devel - libsecret-devel libunwind-devel libxml2-devel libytnef-devel - sqlite-devel webkitgtk4-devel + libappstream-glib-devel libgee-devel libhandy1-devel + libpeas-devel libsecret-devel libstemmer-devel libunwind-devel + libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz # Ubuntu packages @@ -37,8 +37,8 @@ variables: itstool libappstream-glib-dev libenchant-2-dev libfolks-dev libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev - libhandy-1-dev libjson-glib-dev libmessaging-menu-dev - libpeas-dev libsecret-1-dev libsqlite3-dev libunwind-dev + libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev + libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb diff --git a/BUILDING.md b/BUILDING.md index c00b1901..ff2c7e46 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -91,8 +91,8 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \ gnome-online-accounts-devel gspell-devel gsound-devel \ gtk3-devel iso-codes-devel itstool json-glib-devel \ libappstream-glib-devel libgee-devel libhandy1-devel \ - libpeas-devel libsecret-devel libunwind-devel libxml2-devel \ - libytnef-devel sqlite-devel webkitgtk4-devel + libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \ + libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel ``` Installing dependencies on Ubuntu/Debian @@ -107,8 +107,8 @@ sudo apt-get install meson build-essential valac \ libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \ libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \ libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \ - libsqlite3-dev libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev \ - libytnef0-dev + libsqlite3-dev libstemmer-dev libunwind-dev \ + libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev ``` And for Ubuntu Messaging Menu integration: diff --git a/bindings/vapi/libstemmer.vapi b/bindings/vapi/libstemmer.vapi new file mode 100644 index 00000000..1d2e01c5 --- /dev/null +++ b/bindings/vapi/libstemmer.vapi @@ -0,0 +1,38 @@ +/*- + * Copyright (c) 2010-2013 Giulio Paci + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")] +namespace SnowBall { + [Compact] + [CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")] + public class Stemmer { + public unowned string stem(string word, int size); + public int length(); + public Stemmer(string language, string encoding = "UTF_8"); + [CCode (array_length = false, array_null_terminated = true)] + public static unowned string[] list(); + } +} \ No newline at end of file diff --git a/meson.build b/meson.build index 84cb43f9..85f256e9 100644 --- a/meson.build +++ b/meson.build @@ -93,6 +93,7 @@ libmath = cc.find_library('m') libpeas = dependency('libpeas-1.0', version: '>= 1.24.0') libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0') libsecret = dependency('libsecret-1', version: '>= 0.11') +libstemmer_dep = cc.find_library('stemmer') libsoup = dependency('libsoup-2.4', version: '>= 2.48') libunwind_dep = dependency( 'libunwind', version: '>= 1.1', required: get_option('libunwind') @@ -122,6 +123,13 @@ if libunwind_dep.found() ) endif +libstemmer = declare_dependency( + dependencies: [ + valac.find_library('libstemmer', dirs: [vapi_dir]), + libstemmer_dep, + ], +) + # Optional dependencies appstream_util = find_program('appstream-util', required: false) desktop_file_validate = find_program('desktop-file-validate', required: false) diff --git a/src/engine/imap-db/imap-db-search-query.vala b/src/engine/imap-db/imap-db-search-query.vala index 85f556fa..968ac6ea 100644 --- a/src/engine/imap-db/imap-db-search-query.vala +++ b/src/engine/imap-db/imap-db-search-query.vala @@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { // A list of all search terms, regardless of search op field name private Gee.ArrayList all = new Gee.ArrayList(); + private SnowBall.Stemmer stemmer; + + public async SearchQuery(Geary.Account owner, ImapDB.Account local, string query, @@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { GLib.Cancellable? cancellable) { base(owner, query, strategy); this.account = local; + this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer()); switch (strategy) { case Strategy.EXACT: @@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { } /** - * This method is used to convert an unquoted user-entered search terms into a stemmed search - * term. + * Converts unquoted search terms into a stemmed search term. * - * Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our - * tastes when coupled with prefix-matching of all unquoted terms (see - * https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy - * designed to dampen that aggressiveness without losing the benefits of stemming entirely. + * Prior experience with the Snowball stemmer indicates it is too + * aggressive for our tastes when coupled with prefix-matching of + * all unquoted terms (see + * https://bugzilla.gnome.org/show_bug.cgi?id=713179). * - * Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one - * with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite - * table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which - * is still installed in Geary. Thus, we are now in the position to search for the original - * term and its stemmed variant, then do post-search processing to strip results which are - * too "greedy" due to prefix-matching the stemmed variant. + * This method is part of a larger strategy designed to dampen + * that aggressiveness without losing the benefits of stemming + * entirely: The database's FTS table uses no stemming, but + * libstemmer is used to generate stemmed search terms. + * Post-search processing is then to strip results which are too + * "greedy" due to prefix-matching the stemmed variant. * * Some heuristics are in place simply to determine if stemming should occur: * @@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { if (term_length < this.min_term_length_for_stemming) return null; - string? stemmed = null; - try { - yield this.account.db.exec_transaction_async(RO, - (cx, cancellable) => { - Db.Statement stmt = cx.prepare(""" - SELECT token - FROM TokenizerTable - WHERE input=? - """); - stmt.bind_string(0, term); - - // get stemmed string; if no result, fall through - Db.Result result = stmt.exec(cancellable); - if (!result.finished) { - stemmed = result.string_at(0); - } else { - debug("No stemmed term returned for \"%s\"", term); - } - return COMMIT; - }, cancellable - ); - } catch (Error err) { - debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message); - - // fall-through - } - + string? stemmed = this.stemmer.stem(term, term.length); if (String.is_empty(stemmed)) { debug("Empty stemmed term returned for \"%s\"", term); - return null; } @@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery { } debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed); - return stemmed; } + private string find_appropriate_search_stemmer() { + // Unfortunately, the stemmer library only accepts the full language + // name for the stemming algorithm. This translates between the user's + // preferred language ISO 639-1 code and our available stemmers. + // FIXME: the available list here is determined by what's included in + // src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in + // instead of hardcoding it here. + foreach (string l in Intl.get_language_names()) { + switch (l) { + case "ar": return "arabic"; + case "eu": return "basque"; + case "ca": return "catalan"; + case "da": return "danish"; + case "nl": return "dutch"; + case "en": return "english"; + case "fi": return "finnish"; + case "fr": return "french"; + case "de": return "german"; + case "el": return "greek"; + case "hi": return "hindi"; + case "hu": return "hungarian"; + case "id": return "indonesian"; + case "ga": return "irish"; + case "it": return "italian"; + case "lt": return "lithuanian"; + case "ne": return "nepali"; + case "no": return "norwegian"; + case "pt": return "portuguese"; + case "ro": return "romanian"; + case "ru": return "russian"; + case "sr": return "serbian"; + case "es": return "spanish"; + case "sv": return "swedish"; + case "ta": return "tamil"; + case "tr": return "turkish"; + } + } + + // Default to English because it seems to be on average the language + // most likely to be present in emails, regardless of the user's + // language setting. This is not an exact science, and search results + // should be ok either way in most cases. + return "english"; + } + } diff --git a/src/engine/meson.build b/src/engine/meson.build index 1133f7b8..377e22d2 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -323,6 +323,7 @@ engine_dependencies = [ glib, gmime, libmath, + libstemmer, libxml, posix, sqlite diff --git a/src/meson.build b/src/meson.build index 14f08c18..eef8c476 100644 --- a/src/meson.build +++ b/src/meson.build @@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies valadoc_dep_args += '--pkg' valadoc_dep_args += dep.name() endforeach +valadoc_dep_args += [ '--pkg', 'libstemmer' ] valadoc_dep_args += [ '--pkg', 'posix' ] valadoc_vapidir_args = []