ImapDb.SearchQuery: Use libstemmer for stemming
Instead of shipping our own forked stemmer via SQLite, add a dependency on libstemmer and use that instead.
This commit is contained in:
parent
7c0ad051bd
commit
968bc1a9e8
7 changed files with 118 additions and 50 deletions
|
|
@ -26,9 +26,9 @@ variables:
|
||||||
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
|
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
|
||||||
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
|
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
|
||||||
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
|
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
|
||||||
libappstream-glib-devel libgee-devel libhandy1-devel libpeas-devel
|
libappstream-glib-devel libgee-devel libhandy1-devel
|
||||||
libsecret-devel libunwind-devel libxml2-devel libytnef-devel
|
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel
|
||||||
sqlite-devel webkitgtk4-devel
|
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
|
||||||
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
|
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
|
||||||
|
|
||||||
# Ubuntu packages
|
# Ubuntu packages
|
||||||
|
|
@ -37,8 +37,8 @@ variables:
|
||||||
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
|
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
|
||||||
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
|
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
|
||||||
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
|
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
|
||||||
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev
|
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev
|
||||||
libpeas-dev libsecret-1-dev libsqlite3-dev libunwind-dev
|
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev
|
||||||
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
||||||
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
|
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -91,8 +91,8 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \
|
||||||
gnome-online-accounts-devel gspell-devel gsound-devel \
|
gnome-online-accounts-devel gspell-devel gsound-devel \
|
||||||
gtk3-devel iso-codes-devel itstool json-glib-devel \
|
gtk3-devel iso-codes-devel itstool json-glib-devel \
|
||||||
libappstream-glib-devel libgee-devel libhandy1-devel \
|
libappstream-glib-devel libgee-devel libhandy1-devel \
|
||||||
libpeas-devel libsecret-devel libunwind-devel libxml2-devel \
|
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \
|
||||||
libytnef-devel sqlite-devel webkitgtk4-devel
|
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
|
||||||
```
|
```
|
||||||
|
|
||||||
Installing dependencies on Ubuntu/Debian
|
Installing dependencies on Ubuntu/Debian
|
||||||
|
|
@ -107,8 +107,8 @@ sudo apt-get install meson build-essential valac \
|
||||||
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
|
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
|
||||||
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
|
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
|
||||||
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
|
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
|
||||||
libsqlite3-dev libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev \
|
libsqlite3-dev libstemmer-dev libunwind-dev \
|
||||||
libytnef0-dev
|
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
||||||
```
|
```
|
||||||
|
|
||||||
And for Ubuntu Messaging Menu integration:
|
And for Ubuntu Messaging Menu integration:
|
||||||
|
|
|
||||||
38
bindings/vapi/libstemmer.vapi
Normal file
38
bindings/vapi/libstemmer.vapi
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
/*-
|
||||||
|
* Copyright (c) 2010-2013 Giulio Paci <giuliopaci@gmail.com>
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
[CCode (cheader_filename = "libstemmer.h", lower_case_cprefix = "sb_", cprefix="sb_")]
|
||||||
|
namespace SnowBall {
|
||||||
|
[Compact]
|
||||||
|
[CCode (cname = "struct sb_stemmer", free_function="sb_stemmer_delete")]
|
||||||
|
public class Stemmer {
|
||||||
|
public unowned string stem(string word, int size);
|
||||||
|
public int length();
|
||||||
|
public Stemmer(string language, string encoding = "UTF_8");
|
||||||
|
[CCode (array_length = false, array_null_terminated = true)]
|
||||||
|
public static unowned string[] list();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -93,6 +93,7 @@ libmath = cc.find_library('m')
|
||||||
libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
|
libpeas = dependency('libpeas-1.0', version: '>= 1.24.0')
|
||||||
libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
|
libpeas_gtk = dependency('libpeas-gtk-1.0', version: '>= 1.24.0')
|
||||||
libsecret = dependency('libsecret-1', version: '>= 0.11')
|
libsecret = dependency('libsecret-1', version: '>= 0.11')
|
||||||
|
libstemmer_dep = cc.find_library('stemmer')
|
||||||
libsoup = dependency('libsoup-2.4', version: '>= 2.48')
|
libsoup = dependency('libsoup-2.4', version: '>= 2.48')
|
||||||
libunwind_dep = dependency(
|
libunwind_dep = dependency(
|
||||||
'libunwind', version: '>= 1.1', required: get_option('libunwind')
|
'libunwind', version: '>= 1.1', required: get_option('libunwind')
|
||||||
|
|
@ -122,6 +123,13 @@ if libunwind_dep.found()
|
||||||
)
|
)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
libstemmer = declare_dependency(
|
||||||
|
dependencies: [
|
||||||
|
valac.find_library('libstemmer', dirs: [vapi_dir]),
|
||||||
|
libstemmer_dep,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
# Optional dependencies
|
# Optional dependencies
|
||||||
appstream_util = find_program('appstream-util', required: false)
|
appstream_util = find_program('appstream-util', required: false)
|
||||||
desktop_file_validate = find_program('desktop-file-validate', required: false)
|
desktop_file_validate = find_program('desktop-file-validate', required: false)
|
||||||
|
|
|
||||||
|
|
@ -318,6 +318,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
|
||||||
// A list of all search terms, regardless of search op field name
|
// A list of all search terms, regardless of search op field name
|
||||||
private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
|
private Gee.ArrayList<Term> all = new Gee.ArrayList<Term>();
|
||||||
|
|
||||||
|
private SnowBall.Stemmer stemmer;
|
||||||
|
|
||||||
|
|
||||||
public async SearchQuery(Geary.Account owner,
|
public async SearchQuery(Geary.Account owner,
|
||||||
ImapDB.Account local,
|
ImapDB.Account local,
|
||||||
string query,
|
string query,
|
||||||
|
|
@ -325,6 +328,7 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
|
||||||
GLib.Cancellable? cancellable) {
|
GLib.Cancellable? cancellable) {
|
||||||
base(owner, query, strategy);
|
base(owner, query, strategy);
|
||||||
this.account = local;
|
this.account = local;
|
||||||
|
this.stemmer = new SnowBall.Stemmer(find_appropriate_search_stemmer());
|
||||||
|
|
||||||
switch (strategy) {
|
switch (strategy) {
|
||||||
case Strategy.EXACT:
|
case Strategy.EXACT:
|
||||||
|
|
@ -611,20 +615,19 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This method is used to convert an unquoted user-entered search terms into a stemmed search
|
* Converts unquoted search terms into a stemmed search term.
|
||||||
* term.
|
|
||||||
*
|
*
|
||||||
* Prior experience with the Unicode Snowball stemmer indicates it's too aggressive for our
|
* Prior experience with the Snowball stemmer indicates it is too
|
||||||
* tastes when coupled with prefix-matching of all unquoted terms (see
|
* aggressive for our tastes when coupled with prefix-matching of
|
||||||
* https://bugzilla.gnome.org/show_bug.cgi?id=713179) This method is part of a larger strategy
|
* all unquoted terms (see
|
||||||
* designed to dampen that aggressiveness without losing the benefits of stemming entirely.
|
* https://bugzilla.gnome.org/show_bug.cgi?id=713179).
|
||||||
*
|
*
|
||||||
* Database upgrade 23 removes the old Snowball-stemmed FTS table and replaces it with one
|
* This method is part of a larger strategy designed to dampen
|
||||||
* with no stemming (using only SQLite's "simple" tokenizer). It also creates a "magic" SQLite
|
* that aggressiveness without losing the benefits of stemming
|
||||||
* table called TokenizerTable which allows for uniform queries to the Snowball stemmer, which
|
* entirely: The database's FTS table uses no stemming, but
|
||||||
* is still installed in Geary. Thus, we are now in the position to search for the original
|
* libstemmer is used to generate stemmed search terms.
|
||||||
* term and its stemmed variant, then do post-search processing to strip results which are
|
* Post-search processing is then to strip results which are too
|
||||||
* too "greedy" due to prefix-matching the stemmed variant.
|
* "greedy" due to prefix-matching the stemmed variant.
|
||||||
*
|
*
|
||||||
* Some heuristics are in place simply to determine if stemming should occur:
|
* Some heuristics are in place simply to determine if stemming should occur:
|
||||||
*
|
*
|
||||||
|
|
@ -647,36 +650,9 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
|
||||||
if (term_length < this.min_term_length_for_stemming)
|
if (term_length < this.min_term_length_for_stemming)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
string? stemmed = null;
|
string? stemmed = this.stemmer.stem(term, term.length);
|
||||||
try {
|
|
||||||
yield this.account.db.exec_transaction_async(RO,
|
|
||||||
(cx, cancellable) => {
|
|
||||||
Db.Statement stmt = cx.prepare("""
|
|
||||||
SELECT token
|
|
||||||
FROM TokenizerTable
|
|
||||||
WHERE input=?
|
|
||||||
""");
|
|
||||||
stmt.bind_string(0, term);
|
|
||||||
|
|
||||||
// get stemmed string; if no result, fall through
|
|
||||||
Db.Result result = stmt.exec(cancellable);
|
|
||||||
if (!result.finished) {
|
|
||||||
stemmed = result.string_at(0);
|
|
||||||
} else {
|
|
||||||
debug("No stemmed term returned for \"%s\"", term);
|
|
||||||
}
|
|
||||||
return COMMIT;
|
|
||||||
}, cancellable
|
|
||||||
);
|
|
||||||
} catch (Error err) {
|
|
||||||
debug("Unable to query tokenizer table for stemmed term for \"%s\": %s", term, err.message);
|
|
||||||
|
|
||||||
// fall-through
|
|
||||||
}
|
|
||||||
|
|
||||||
if (String.is_empty(stemmed)) {
|
if (String.is_empty(stemmed)) {
|
||||||
debug("Empty stemmed term returned for \"%s\"", term);
|
debug("Empty stemmed term returned for \"%s\"", term);
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -693,8 +669,52 @@ private class Geary.ImapDB.SearchQuery : Geary.SearchQuery {
|
||||||
}
|
}
|
||||||
|
|
||||||
debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
|
debug("Search processing: term -> stem is \"%s\" -> \"%s\"", term, stemmed);
|
||||||
|
|
||||||
return stemmed;
|
return stemmed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private string find_appropriate_search_stemmer() {
|
||||||
|
// Unfortunately, the stemmer library only accepts the full language
|
||||||
|
// name for the stemming algorithm. This translates between the user's
|
||||||
|
// preferred language ISO 639-1 code and our available stemmers.
|
||||||
|
// FIXME: the available list here is determined by what's included in
|
||||||
|
// src/sqlite3-unicodesn/CMakeLists.txt. We should pass that list in
|
||||||
|
// instead of hardcoding it here.
|
||||||
|
foreach (string l in Intl.get_language_names()) {
|
||||||
|
switch (l) {
|
||||||
|
case "ar": return "arabic";
|
||||||
|
case "eu": return "basque";
|
||||||
|
case "ca": return "catalan";
|
||||||
|
case "da": return "danish";
|
||||||
|
case "nl": return "dutch";
|
||||||
|
case "en": return "english";
|
||||||
|
case "fi": return "finnish";
|
||||||
|
case "fr": return "french";
|
||||||
|
case "de": return "german";
|
||||||
|
case "el": return "greek";
|
||||||
|
case "hi": return "hindi";
|
||||||
|
case "hu": return "hungarian";
|
||||||
|
case "id": return "indonesian";
|
||||||
|
case "ga": return "irish";
|
||||||
|
case "it": return "italian";
|
||||||
|
case "lt": return "lithuanian";
|
||||||
|
case "ne": return "nepali";
|
||||||
|
case "no": return "norwegian";
|
||||||
|
case "pt": return "portuguese";
|
||||||
|
case "ro": return "romanian";
|
||||||
|
case "ru": return "russian";
|
||||||
|
case "sr": return "serbian";
|
||||||
|
case "es": return "spanish";
|
||||||
|
case "sv": return "swedish";
|
||||||
|
case "ta": return "tamil";
|
||||||
|
case "tr": return "turkish";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default to English because it seems to be on average the language
|
||||||
|
// most likely to be present in emails, regardless of the user's
|
||||||
|
// language setting. This is not an exact science, and search results
|
||||||
|
// should be ok either way in most cases.
|
||||||
|
return "english";
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -323,6 +323,7 @@ engine_dependencies = [
|
||||||
glib,
|
glib,
|
||||||
gmime,
|
gmime,
|
||||||
libmath,
|
libmath,
|
||||||
|
libstemmer,
|
||||||
libxml,
|
libxml,
|
||||||
posix,
|
posix,
|
||||||
sqlite
|
sqlite
|
||||||
|
|
|
||||||
|
|
@ -150,6 +150,7 @@ foreach dep : valadoc_dependencies
|
||||||
valadoc_dep_args += '--pkg'
|
valadoc_dep_args += '--pkg'
|
||||||
valadoc_dep_args += dep.name()
|
valadoc_dep_args += dep.name()
|
||||||
endforeach
|
endforeach
|
||||||
|
valadoc_dep_args += [ '--pkg', 'libstemmer' ]
|
||||||
valadoc_dep_args += [ '--pkg', 'posix' ]
|
valadoc_dep_args += [ '--pkg', 'posix' ]
|
||||||
|
|
||||||
valadoc_vapidir_args = []
|
valadoc_vapidir_args = []
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue