ImapDb.Database: Register new ICU-based tokeniser for FTS
The SQLite tokeniser does not deal with scripts that do not use spaces for word breaking (CJK, Thai, etc), thus searching in those languages does not work well. This adds a custom SQLite tokeniser based on ICU that breaks words for all languages supported by that library, and uses NFKC_Casefold normalisation to handle normalisation, case folding, and dropping of ignorable characters. Fixes #121
This commit is contained in:
parent
90711f234e
commit
7e38198287
7 changed files with 325 additions and 13 deletions
|
|
@ -26,7 +26,7 @@ variables:
|
|||
meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel
|
||||
glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel
|
||||
gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool
|
||||
libappstream-glib-devel libgee-devel libhandy1-devel
|
||||
libappstream-glib-devel libgee-devel libhandy1-devel libicu-devel
|
||||
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel
|
||||
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
|
||||
FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz
|
||||
|
|
@ -37,9 +37,9 @@ variables:
|
|||
itstool libappstream-glib-dev libenchant-2-dev libfolks-dev
|
||||
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev
|
||||
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev
|
||||
libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev
|
||||
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev
|
||||
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
||||
libhandy-1-dev libicu-dev libjson-glib-dev libmessaging-menu-dev
|
||||
libpeas-dev libsecret-1-dev libsqlite3-dev libstemmer-dev
|
||||
libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
||||
UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb
|
||||
|
||||
fedora:
|
||||
|
|
|
|||
|
|
@ -93,8 +93,9 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \
|
|||
gnome-online-accounts-devel gspell-devel gsound-devel \
|
||||
gtk3-devel iso-codes-devel itstool json-glib-devel \
|
||||
libappstream-glib-devel libgee-devel libhandy1-devel \
|
||||
libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \
|
||||
libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel
|
||||
libpeas-devel libsecret-devel libicu-devel libstemmer-devel \
|
||||
libunwind-devel libxml2-devel libytnef-devel sqlite-devel \
|
||||
webkitgtk4-devel
|
||||
```
|
||||
|
||||
Installing dependencies on Ubuntu/Debian
|
||||
|
|
@ -108,8 +109,8 @@ sudo apt-get install meson build-essential valac \
|
|||
libappstream-glib-dev libenchant-2-dev libfolks-dev \
|
||||
libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \
|
||||
libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \
|
||||
libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \
|
||||
libsqlite3-dev libstemmer-dev libunwind-dev \
|
||||
libjson-glib-dev libhandy-1-dev libicu-dev libpeas-dev \
|
||||
libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev \
|
||||
libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev
|
||||
```
|
||||
|
||||
|
|
|
|||
10
meson.build
10
meson.build
|
|
@ -85,6 +85,7 @@ goa = dependency('goa-1.0')
|
|||
gsound = dependency('gsound')
|
||||
gspell = dependency('gspell-1')
|
||||
gthread = dependency('gthread-2.0', version: '>=' + target_glib)
|
||||
icu_uc = dependency('icu-uc', version: '>=60')
|
||||
iso_codes = dependency('iso-codes')
|
||||
javascriptcoregtk = dependency('javascriptcoregtk-4.0', version: '>=' + target_webkit)
|
||||
json_glib = dependency('json-glib-1.0', version: '>= 1.0')
|
||||
|
|
@ -130,6 +131,15 @@ libstemmer = declare_dependency(
|
|||
],
|
||||
)
|
||||
|
||||
# Faux ICU dependency to prevent ICU being passed to valac as a
|
||||
# package by meson
|
||||
icu = declare_dependency(
|
||||
dependencies: [
|
||||
cc.find_library('icuuc'),
|
||||
cc.find_library('icudata'),
|
||||
],
|
||||
)
|
||||
|
||||
# Optional dependencies
|
||||
appstream_util = find_program('appstream-util', required: false)
|
||||
desktop_file_validate = find_program('desktop-file-validate', required: false)
|
||||
|
|
|
|||
|
|
@ -14,6 +14,6 @@ CREATE VIRTUAL TABLE MessageSearchTable USING fts5(
|
|||
bcc,
|
||||
flags,
|
||||
|
||||
tokenize="unicode61 remove_diacritics 2",
|
||||
tokenize="geary_tokeniser",
|
||||
prefix="2,4,6,8,10"
|
||||
)
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
[CCode (cname = "g_utf8_collate_key")]
|
||||
extern string utf8_collate_key(string data, ssize_t len);
|
||||
extern int sqlite3_register_fts5_tokeniser(Sqlite.Database db);
|
||||
extern int sqlite3_register_fts5_matches(Sqlite.Database db);
|
||||
extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db);
|
||||
|
||||
|
|
@ -630,8 +631,13 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
|
|||
sqlite3_register_legacy_tokenizer(cx.db);
|
||||
}
|
||||
|
||||
// Register custom `geary_matches()` FTS5 function to obtain
|
||||
// matching tokens from FTS queries.
|
||||
// Register custom FTS5 tokeniser that uses ICU to correctly
|
||||
// segment at both Latin and non-Latin (e.g. CJK, Thai) word
|
||||
// boundaries.
|
||||
sqlite3_register_fts5_tokeniser(cx.db);
|
||||
|
||||
// Register custom `geary_matches()` FTS5 function that
|
||||
// obtains matching tokens from FTS queries.
|
||||
sqlite3_register_fts5_matches(cx.db);
|
||||
|
||||
if (cx.db.create_function(
|
||||
|
|
|
|||
275
src/engine/imap-db/imap-db-fts5-tokeniser.c
Normal file
275
src/engine/imap-db/imap-db-fts5-tokeniser.c
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
/*
|
||||
* Copyright © 2020 Michael Gratton <mike@vee.net>
|
||||
*
|
||||
* This software is licensed under the GNU Lesser General Public License
|
||||
* (version 2.1 or later). See the COPYING file in this distribution.
|
||||
*/
|
||||
|
||||
#include <sqlite3ext.h>
|
||||
SQLITE_EXTENSION_INIT1
|
||||
|
||||
#include <glib.h>
|
||||
#include <gmodule.h>
|
||||
#include <unicode/ubrk.h>
|
||||
#include <unicode/unorm2.h>
|
||||
#include <unicode/ustring.h>
|
||||
#include "unicode/utf.h"
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
// Full text search tokeniser for SQLite. This exists since SQLite's
|
||||
// existing Unicode tokeniser doesn't work with languages that don't
|
||||
// use spaces as word boundaries.
|
||||
//
|
||||
// When generating tokens, the following process is applied to text using
|
||||
// the ICU library:
|
||||
//
|
||||
// 1. ICU NFKC_Casefold normalisation, handles normalisation, case
|
||||
// folding and removal of ignorable characters such as accents.
|
||||
//
|
||||
// 2. ICU word-boundary tokenisation, splits both on words at spaces
|
||||
// and other punctuation, and also using a dictionary lookup for
|
||||
// languages that do not use spaces (CJK, Thai, etc)
|
||||
//
|
||||
// Note: Since SQLite is single-threaded, it's safe to use single
|
||||
// instances of ICU services for all calls for a single tokeniser.
|
||||
|
||||
#define NORM_BUF_LEN 8
|
||||
#define TOKEN_BUF_LEN 8
|
||||
|
||||
// Per-tokeniser state shared across all tokenise calls. SQLite runs
// single-threaded per connection, so one instance of each ICU service
// is reused for every call (see note in the file header).
typedef struct {
    // Singleton object, threadsafe, does not need to be deleted.
    const UNormalizer2 * norm;

    // Stateful object, not threadsafe, must be deleted.
    UBreakIterator *iter;
} IcuTokeniser;
|
||||
|
||||
|
||||
static int icu_create(void *context,
|
||||
const char **args,
|
||||
int n_args,
|
||||
Fts5Tokenizer **ret) {
|
||||
const UNormalizer2 *norm;
|
||||
UBreakIterator *iter;
|
||||
IcuTokeniser *tokeniser;
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
|
||||
norm = unorm2_getNFKCCasefoldInstance(&err);
|
||||
if (U_FAILURE(err)) {
|
||||
g_warning("Error constructing ICU normaliser: %s", u_errorName(err));
|
||||
return SQLITE_ABORT;
|
||||
}
|
||||
|
||||
// The given locale doesn't matter here since it ICU doesn't
|
||||
// (currently) use different rules for different word breaking
|
||||
// languages that uses spaces as word boundaries, and uses
|
||||
// dictionary look-ups for CJK and other scripts that don't.
|
||||
iter = ubrk_open(UBRK_WORD, "en", NULL, 0, &err);
|
||||
if (U_FAILURE(err)) {
|
||||
g_warning("Error constructing ICU word-breaker: %s", u_errorName(err));
|
||||
ubrk_close(tokeniser->iter);
|
||||
return SQLITE_ABORT;
|
||||
}
|
||||
|
||||
tokeniser = g_new0(IcuTokeniser, 1);
|
||||
tokeniser->norm = norm;
|
||||
tokeniser->iter = iter;
|
||||
*ret = (Fts5Tokenizer *) tokeniser;
|
||||
|
||||
return SQLITE_OK;
|
||||
}
|
||||
|
||||
// FTS5 xDelete callback: releases resources owned by icu_create().
static void icu_delete(Fts5Tokenizer *fts5_tokeniser) {
    IcuTokeniser *self = (IcuTokeniser *) fts5_tokeniser;

    // Only the break iterator is owned; the normaliser is an
    // ICU-managed singleton and must not be closed.
    ubrk_close(self->iter);
    g_free(self);
}
|
||||
|
||||
static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser,
|
||||
void *context,
|
||||
int flags,
|
||||
const char *chars,
|
||||
int32_t chars_len,
|
||||
int (*token_callback)(void*, int, const char*, int, int, int)) {
|
||||
int ret = SQLITE_OK;
|
||||
IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser;
|
||||
UErrorCode err = U_ZERO_ERROR;
|
||||
|
||||
const UNormalizer2 *norm = tokeniser->norm;
|
||||
GArray *wide_chars = NULL;
|
||||
GArray *wide_offsets = NULL;
|
||||
UChar *wide_data = NULL;
|
||||
gsize wide_data_len_long = 0;
|
||||
int32_t wide_data_len = 0;
|
||||
|
||||
UChar norm_buf[NORM_BUF_LEN] = {0};
|
||||
|
||||
UBreakIterator *iter = tokeniser->iter;
|
||||
int32_t start_index, current_index = 0;
|
||||
char *token_buf = NULL;
|
||||
int32_t token_buf_len = NORM_BUF_LEN;
|
||||
|
||||
// Normalisation.
|
||||
//
|
||||
// SQLite needs the byte-index of tokens found in the chars, but
|
||||
// ICU doesn't support UTF-8-based normalisation. So convert UTF-8
|
||||
// input to UTF-16 char-by-char and record the byte offsets for
|
||||
// each, so that when converting back to UTF-8 the byte offsets
|
||||
// can be determined.
|
||||
|
||||
wide_chars = g_array_sized_new(FALSE, FALSE, sizeof(UChar), chars_len);
|
||||
wide_offsets = g_array_sized_new(FALSE, FALSE, sizeof(int32_t), chars_len);
|
||||
|
||||
for (int32_t byte_offset = 0; byte_offset < chars_len;) {
|
||||
UChar wide_char;
|
||||
int32_t norm_len;
|
||||
int32_t start_byte_offset = byte_offset;
|
||||
|
||||
U8_NEXT_OR_FFFD(chars, byte_offset, chars_len, wide_char);
|
||||
norm_len = unorm2_normalize(norm,
|
||||
&wide_char, 1,
|
||||
norm_buf, NORM_BUF_LEN,
|
||||
&err);
|
||||
if (U_FAILURE(err)) {
|
||||
g_warning("Token text normalisation failed");
|
||||
err = SQLITE_ABORT;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// NFKC may decompose a single character into multiple
|
||||
// characters, e.g. 'fi' into "fi", '…' into "...".
|
||||
for (int i = 0; i < norm_len; i++) {
|
||||
g_array_append_val(wide_chars, norm_buf[i]);
|
||||
g_array_append_val(wide_offsets, start_byte_offset);
|
||||
}
|
||||
}
|
||||
|
||||
// Word breaking.
|
||||
//
|
||||
// UTF-16 is passed to the tokeniser, hence its indexes are
|
||||
// character-based. Use the offset array to convert those back to
|
||||
// byte indexes for individual tokens.
|
||||
|
||||
wide_data = (UChar *) g_array_steal(wide_chars, &wide_data_len_long);
|
||||
wide_data_len = (int32_t) wide_data_len_long;
|
||||
|
||||
ubrk_setText(iter, wide_data, wide_data_len, &err);
|
||||
if (U_FAILURE(err)) {
|
||||
err = SQLITE_ABORT;
|
||||
g_warning("Setting word break iterator text failed");
|
||||
goto cleanup;
|
||||
}
|
||||
start_index = 0;
|
||||
current_index = ubrk_first(iter);
|
||||
token_buf = g_malloc0(sizeof(char) * token_buf_len);
|
||||
while (current_index != UBRK_DONE && ret == SQLITE_OK) {
|
||||
int32_t status = ubrk_getRuleStatus(iter);
|
||||
int32_t token_char_len = current_index - start_index;
|
||||
if (token_char_len > 0 &&
|
||||
!(status >= UBRK_WORD_NONE && status < UBRK_WORD_NONE_LIMIT) &&
|
||||
!(status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)) {
|
||||
int32_t token_byte_len = 0;
|
||||
int32_t token_byte_start = 0;
|
||||
int32_t token_byte_end = 0;
|
||||
|
||||
for (;;) {
|
||||
u_strToUTF8WithSub(token_buf, token_buf_len, &token_byte_len,
|
||||
wide_data + start_index, token_char_len,
|
||||
0xFFFD, NULL,
|
||||
&err);
|
||||
|
||||
if (U_SUCCESS(err)) {
|
||||
break;
|
||||
} else if (err == U_BUFFER_OVERFLOW_ERROR) {
|
||||
token_buf_len *= 2;
|
||||
token_buf = g_realloc(token_buf, sizeof(char) * token_buf_len);
|
||||
err = U_ZERO_ERROR;
|
||||
} else {
|
||||
err = SQLITE_ABORT;
|
||||
g_warning("Conversion to UTF-8 failed");
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
|
||||
token_byte_start = g_array_index(wide_offsets, int32_t, start_index);
|
||||
if (current_index < wide_data_len) {
|
||||
token_byte_end = g_array_index(wide_offsets, int32_t, current_index);
|
||||
} else {
|
||||
token_byte_end = chars_len;
|
||||
}
|
||||
|
||||
ret = token_callback(context,
|
||||
0,
|
||||
token_buf,
|
||||
token_byte_len,
|
||||
token_byte_start,
|
||||
token_byte_end);
|
||||
}
|
||||
|
||||
start_index = current_index;
|
||||
current_index = ubrk_next(iter);
|
||||
}
|
||||
|
||||
cleanup:
|
||||
g_free(wide_data);
|
||||
g_array_unref(wide_chars);
|
||||
g_array_unref(wide_offsets);
|
||||
g_free(token_buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Looks up the FTS5 extension API for `db`. FTS5 publishes its API
// struct via the `fts5()` scalar SQL function and a bound pointer
// (the documented discovery mechanism for SQLite >= 3.20). Returns
// NULL if FTS5 is unavailable.
static fts5_api *get_fts5_api(sqlite3 *db) {
    fts5_api *api = NULL;
    sqlite3_stmt *stmt = NULL;

    if (sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &stmt, 0) != SQLITE_OK) {
        return NULL;
    }

    sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL);
    sqlite3_step(stmt);
    sqlite3_finalize(stmt);

    // Still NULL if the fts5() function didn't fill it in.
    return api;
}
|
||||
|
||||
// FTS5 tokeniser method table; members map onto the xCreate, xDelete
// and xTokenize fields of fts5_tokenizer, in that order.
static const fts5_tokenizer icu_tokeniser = {
    icu_create,
    icu_delete,
    icu_tokenise
};
|
||||
|
||||
// Registers the ICU tokeniser with `db` under the name
// "geary_tokeniser" (as referenced by the MessageSearchTable schema).
// Returns TRUE on success, FALSE if FTS5 is unavailable or
// registration fails.
gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) {
    fts5_api *api = get_fts5_api(db);
    if (api == NULL) {
        return FALSE;
    }

    // The table is const; FTS5 takes a non-const pointer but does not
    // modify it.
    int rc = api->xCreateTokenizer(api,
                                   "geary_tokeniser",
                                   NULL,
                                   (fts5_tokenizer *) &icu_tokeniser,
                                   NULL);
    return (rc == SQLITE_OK) ? TRUE : FALSE;
}
|
||||
|
||||
// Entry point for external loadable library, required when using
|
||||
// command line SQLite tool. The name of this function must match the
|
||||
// name of the shared module.
|
||||
int sqlite3_gearytokeniser_init(sqlite3 *db,
|
||||
char **error_message,
|
||||
const sqlite3_api_routines *api) {
|
||||
g_info("Loading geary_tokeniser\n");
|
||||
SQLITE_EXTENSION_INIT2(api);
|
||||
return sqlite3_register_fts5_tokeniser(db) ? SQLITE_OK : SQLITE_ABORT;
|
||||
}
|
||||
|
|
@ -178,6 +178,7 @@ engine_vala_sources = files(
|
|||
'imap-db/imap-db-email-identifier.vala',
|
||||
'imap-db/imap-db-folder.vala',
|
||||
'imap-db/imap-db-fts5-matches.c',
|
||||
'imap-db/imap-db-fts5-tokeniser.c',
|
||||
'imap-db/imap-db-gc.vala',
|
||||
'imap-db/imap-db-message-row.vala',
|
||||
'imap-db/imap-db-sqlite.c',
|
||||
|
|
@ -324,6 +325,7 @@ engine_dependencies = [
|
|||
gio,
|
||||
glib,
|
||||
gmime,
|
||||
icu,
|
||||
libmath,
|
||||
libstemmer,
|
||||
libxml,
|
||||
|
|
@ -337,10 +339,17 @@ endif
|
|||
|
||||
engine_build_dir = meson.current_build_dir()
|
||||
|
||||
engine_c_args = geary_c_args
|
||||
engine_vala_args = geary_vala_args
|
||||
|
||||
# Suppress SQLite loadable module init code
|
||||
engine_c_args += [
|
||||
'-D', 'SQLITE_CORE',
|
||||
]
|
||||
|
||||
# Generate internal VAPI for unit testing. See Meson issue
|
||||
# https://github.com/mesonbuild/meson/issues/1781 for official
|
||||
# internal VAPI support.
|
||||
engine_vala_args = geary_vala_args
|
||||
engine_vala_args += [
|
||||
'--internal-header=@0@/geary-engine-internal.h'.format(engine_build_dir),
|
||||
'--internal-vapi=@0@/geary-engine-internal.vapi'.format(engine_build_dir)
|
||||
|
|
@ -364,7 +373,7 @@ engine_lib = static_library('geary-engine',
|
|||
dependencies: engine_dependencies,
|
||||
include_directories: config_h_dir,
|
||||
vala_args: engine_vala_args,
|
||||
c_args: geary_c_args,
|
||||
c_args: engine_c_args,
|
||||
)
|
||||
|
||||
# Dummy target to tell Meson about the internal VAPI given the
|
||||
|
|
@ -402,3 +411,14 @@ engine_internal_dep = declare_dependency(
|
|||
include_directories: include_directories('.'),
|
||||
sources: engine_internal_header_fixup
|
||||
)
|
||||
|
||||
# Compile a loadable library containing the custom tokeniser so SQLite
|
||||
# command line app can still be used.
|
||||
tokeniser_lib = shared_library('geary-tokeniser',
|
||||
files('imap-db/imap-db-fts5-tokeniser.c'),
|
||||
dependencies: [ glib, icu, sqlite ],
|
||||
c_args: [
|
||||
# Enable GLib structured logging
|
||||
'-DG_LOG_USE_STRUCTURED',
|
||||
],
|
||||
)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue