From 7e381982872d74b5b135c183ee9188586bb83645 Mon Sep 17 00:00:00 2001 From: Michael Gratton Date: Fri, 13 Nov 2020 08:41:08 +1100 Subject: [PATCH] ImapDb.Database: Register new ICU-based tokeniser for FTS The SQLite tokeniser does not deal with scripts that do not use spaces for word breaking (CJK, Thai, etc), thus searching in those languages does not work well. This adds a custom SQLite tokeniser based on ICU that breaks words for all languages supported by that library, and uses NFKC_Casefold normalisation to handle normalisation, case folding, and dropping of ignorable characters. Fixes #121 --- .gitlab-ci.yml | 8 +- BUILDING.md | 9 +- meson.build | 10 + sql/version-030.sql | 2 +- src/engine/imap-db/imap-db-database.vala | 10 +- src/engine/imap-db/imap-db-fts5-tokeniser.c | 275 ++++++++++++++++++++ src/engine/meson.build | 24 +- 7 files changed, 325 insertions(+), 13 deletions(-) create mode 100644 src/engine/imap-db/imap-db-fts5-tokeniser.c diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 80be0f43..2b02d93e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ variables: meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool - libappstream-glib-devel libgee-devel libhandy1-devel + libappstream-glib-devel libgee-devel libhandy1-devel libicu-devel libpeas-devel libsecret-devel libstemmer-devel libunwind-devel libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz @@ -37,9 +37,9 @@ variables: itstool libappstream-glib-dev libenchant-2-dev libfolks-dev libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev - libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev - libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev - libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev 
+ libhandy-1-dev libicu-dev libjson-glib-dev libmessaging-menu-dev + libpeas-dev libsecret-1-dev libsqlite3-dev libstemmer-dev + libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb fedora: diff --git a/BUILDING.md b/BUILDING.md index f8ca45ae..f63b1041 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -93,8 +93,9 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \ gnome-online-accounts-devel gspell-devel gsound-devel \ gtk3-devel iso-codes-devel itstool json-glib-devel \ libappstream-glib-devel libgee-devel libhandy1-devel \ - libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \ - libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel + libpeas-devel libsecret-devel libicu-devel libstemmer-devel \ + libunwind-devel libxml2-devel libytnef-devel sqlite-devel \ + webkitgtk4-devel ``` Installing dependencies on Ubuntu/Debian @@ -108,8 +109,8 @@ sudo apt-get install meson build-essential valac \ libappstream-glib-dev libenchant-2-dev libfolks-dev \ libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \ libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \ - libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \ - libsqlite3-dev libstemmer-dev libunwind-dev \ + libjson-glib-dev libhandy-1-dev libicu-dev libpeas-dev \ + libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev \ libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev ``` diff --git a/meson.build b/meson.build index ce279f55..32584995 100644 --- a/meson.build +++ b/meson.build @@ -85,6 +85,7 @@ goa = dependency('goa-1.0') gsound = dependency('gsound') gspell = dependency('gspell-1') gthread = dependency('gthread-2.0', version: '>=' + target_glib) +icu_uc = dependency('icu-uc', version: '>=60') iso_codes = dependency('iso-codes') javascriptcoregtk = dependency('javascriptcoregtk-4.0', version: '>=' + target_webkit) json_glib = dependency('json-glib-1.0', version: '>= 1.0') @@ -130,6 +131,15 @@ 
libstemmer = declare_dependency(
   ],
 )
 
+# Faux ICU dependency to prevent ICU being passed to valac as a
+# package by meson
+icu = declare_dependency(
+  dependencies: [
+    cc.find_library('icuuc'),
+    cc.find_library('icudata'),
+  ],
+)
+
 # Optional dependencies
 appstream_util = find_program('appstream-util', required: false)
 desktop_file_validate = find_program('desktop-file-validate', required: false)
diff --git a/sql/version-030.sql b/sql/version-030.sql
index 48af04df..4fbde30d 100644
--- a/sql/version-030.sql
+++ b/sql/version-030.sql
@@ -14,6 +14,6 @@ CREATE VIRTUAL TABLE MessageSearchTable USING fts5(
     bcc,
     flags,
-    tokenize="unicode61 remove_diacritics 2",
+    tokenize="geary_tokeniser",
     prefix="2,4,6,8,10"
 )
diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala
index 9365f876..45286f9b 100644
--- a/src/engine/imap-db/imap-db-database.vala
+++ b/src/engine/imap-db/imap-db-database.vala
@@ -7,6 +7,7 @@
 [CCode (cname = "g_utf8_collate_key")]
 extern string utf8_collate_key(string data, ssize_t len);
 
+extern int sqlite3_register_fts5_tokeniser(Sqlite.Database db);
 extern int sqlite3_register_fts5_matches(Sqlite.Database db);
 extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db);
 
@@ -630,8 +631,13 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
             sqlite3_register_legacy_tokenizer(cx.db);
         }
 
-        // Register custom `geary_matches()` FTS5 function to obtain
-        // matching tokens from FTS queries.
+        // Register custom FTS5 tokeniser that uses ICU to correctly
+        // segment at both Latin and non-Latin (e.g. CJK, Thai) word
+        // boundaries.
+        sqlite3_register_fts5_tokeniser(cx.db);
+
+        // Register custom `geary_matches()` FTS5 function that
+        // obtains matching tokens from FTS queries.
         sqlite3_register_fts5_matches(cx.db);
 
         if (cx.db.create_function(
diff --git a/src/engine/imap-db/imap-db-fts5-tokeniser.c b/src/engine/imap-db/imap-db-fts5-tokeniser.c
new file mode 100644
index 00000000..2991a56e
--- /dev/null
+++ b/src/engine/imap-db/imap-db-fts5-tokeniser.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2020 Michael Gratton <mike@vee.net>
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later). See the COPYING file in this distribution.
+ */
+
+#include <sqlite3ext.h>
+SQLITE_EXTENSION_INIT1
+
+#include <stdint.h>
+#include <glib.h>
+#include <unicode/ubrk.h>
+#include <unicode/unorm2.h>
+#include <unicode/ustring.h>
+#include "unicode/utf.h"
+#include "unicode/utypes.h"
+
+// Full text search tokeniser for SQLite. This exists since SQLite's
+// existing Unicode tokeniser doesn't work with languages that don't
+// use spaces as word boundaries.
+//
+// When generating tokens, the following process is applied to text
+// using the ICU library:
+//
+// 1. ICU NFKC_Casefold normalisation, handles normalisation, case
+//    folding and removal of ignorable characters such as accents.
+//
+// 2. ICU word-boundary tokenisation, splits both on words at spaces
+//    and other punctuation, and also using a dictionary lookup for
+//    languages that do not use spaces (CJK, Thai, etc)
+//
+// Note: Since SQLite is single-threaded, it's safe to use single
+// instances of ICU services for all calls for a single tokeniser.
+
+#define NORM_BUF_LEN 8
+#define TOKEN_BUF_LEN 8
+
+typedef struct {
+    // Singleton object, threadsafe, does not need to be deleted.
+    const UNormalizer2 * norm;
+
+    // Stateful object, not threadsafe, must be deleted.
+ UBreakIterator *iter; +} IcuTokeniser; + + +static int icu_create(void *context, + const char **args, + int n_args, + Fts5Tokenizer **ret) { + const UNormalizer2 *norm; + UBreakIterator *iter; + IcuTokeniser *tokeniser; + UErrorCode err = U_ZERO_ERROR; + + norm = unorm2_getNFKCCasefoldInstance(&err); + if (U_FAILURE(err)) { + g_warning("Error constructing ICU normaliser: %s", u_errorName(err)); + return SQLITE_ABORT; + } + + // The given locale doesn't matter here since it ICU doesn't + // (currently) use different rules for different word breaking + // languages that uses spaces as word boundaries, and uses + // dictionary look-ups for CJK and other scripts that don't. + iter = ubrk_open(UBRK_WORD, "en", NULL, 0, &err); + if (U_FAILURE(err)) { + g_warning("Error constructing ICU word-breaker: %s", u_errorName(err)); + ubrk_close(tokeniser->iter); + return SQLITE_ABORT; + } + + tokeniser = g_new0(IcuTokeniser, 1); + tokeniser->norm = norm; + tokeniser->iter = iter; + *ret = (Fts5Tokenizer *) tokeniser; + + return SQLITE_OK; +} + +static void icu_delete(Fts5Tokenizer *fts5_tokeniser) { + IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser; + + ubrk_close(tokeniser->iter); + g_free(tokeniser); +} + +static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser, + void *context, + int flags, + const char *chars, + int32_t chars_len, + int (*token_callback)(void*, int, const char*, int, int, int)) { + int ret = SQLITE_OK; + IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser; + UErrorCode err = U_ZERO_ERROR; + + const UNormalizer2 *norm = tokeniser->norm; + GArray *wide_chars = NULL; + GArray *wide_offsets = NULL; + UChar *wide_data = NULL; + gsize wide_data_len_long = 0; + int32_t wide_data_len = 0; + + UChar norm_buf[NORM_BUF_LEN] = {0}; + + UBreakIterator *iter = tokeniser->iter; + int32_t start_index, current_index = 0; + char *token_buf = NULL; + int32_t token_buf_len = NORM_BUF_LEN; + + // Normalisation. 
+ // + // SQLite needs the byte-index of tokens found in the chars, but + // ICU doesn't support UTF-8-based normalisation. So convert UTF-8 + // input to UTF-16 char-by-char and record the byte offsets for + // each, so that when converting back to UTF-8 the byte offsets + // can be determined. + + wide_chars = g_array_sized_new(FALSE, FALSE, sizeof(UChar), chars_len); + wide_offsets = g_array_sized_new(FALSE, FALSE, sizeof(int32_t), chars_len); + + for (int32_t byte_offset = 0; byte_offset < chars_len;) { + UChar wide_char; + int32_t norm_len; + int32_t start_byte_offset = byte_offset; + + U8_NEXT_OR_FFFD(chars, byte_offset, chars_len, wide_char); + norm_len = unorm2_normalize(norm, + &wide_char, 1, + norm_buf, NORM_BUF_LEN, + &err); + if (U_FAILURE(err)) { + g_warning("Token text normalisation failed"); + err = SQLITE_ABORT; + goto cleanup; + } + + // NFKC may decompose a single character into multiple + // characters, e.g. 'fi' into "fi", '…' into "...". + for (int i = 0; i < norm_len; i++) { + g_array_append_val(wide_chars, norm_buf[i]); + g_array_append_val(wide_offsets, start_byte_offset); + } + } + + // Word breaking. + // + // UTF-16 is passed to the tokeniser, hence its indexes are + // character-based. Use the offset array to convert those back to + // byte indexes for individual tokens. 
+ + wide_data = (UChar *) g_array_steal(wide_chars, &wide_data_len_long); + wide_data_len = (int32_t) wide_data_len_long; + + ubrk_setText(iter, wide_data, wide_data_len, &err); + if (U_FAILURE(err)) { + err = SQLITE_ABORT; + g_warning("Setting word break iterator text failed"); + goto cleanup; + } + start_index = 0; + current_index = ubrk_first(iter); + token_buf = g_malloc0(sizeof(char) * token_buf_len); + while (current_index != UBRK_DONE && ret == SQLITE_OK) { + int32_t status = ubrk_getRuleStatus(iter); + int32_t token_char_len = current_index - start_index; + if (token_char_len > 0 && + !(status >= UBRK_WORD_NONE && status < UBRK_WORD_NONE_LIMIT) && + !(status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)) { + int32_t token_byte_len = 0; + int32_t token_byte_start = 0; + int32_t token_byte_end = 0; + + for (;;) { + u_strToUTF8WithSub(token_buf, token_buf_len, &token_byte_len, + wide_data + start_index, token_char_len, + 0xFFFD, NULL, + &err); + + if (U_SUCCESS(err)) { + break; + } else if (err == U_BUFFER_OVERFLOW_ERROR) { + token_buf_len *= 2; + token_buf = g_realloc(token_buf, sizeof(char) * token_buf_len); + err = U_ZERO_ERROR; + } else { + err = SQLITE_ABORT; + g_warning("Conversion to UTF-8 failed"); + goto cleanup; + } + } + + token_byte_start = g_array_index(wide_offsets, int32_t, start_index); + if (current_index < wide_data_len) { + token_byte_end = g_array_index(wide_offsets, int32_t, current_index); + } else { + token_byte_end = chars_len; + } + + ret = token_callback(context, + 0, + token_buf, + token_byte_len, + token_byte_start, + token_byte_end); + } + + start_index = current_index; + current_index = ubrk_next(iter); + } + + cleanup: + g_free(wide_data); + g_array_unref(wide_chars); + g_array_unref(wide_offsets); + g_free(token_buf); + + return ret; +} + +static fts5_api *get_fts5_api(sqlite3 *db) { + int rc = SQLITE_OK; + sqlite3_stmt *stmt; + fts5_api *api = NULL; + + rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)", + -1, &stmt, 0); + 
if (rc != SQLITE_OK) { + return NULL; + } + + sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + + return api; +} + +static const fts5_tokenizer icu_tokeniser = { + icu_create, + icu_delete, + icu_tokenise +}; + +gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) { + fts5_api *api; + fts5_tokenizer *tokeniser = (fts5_tokenizer *) &icu_tokeniser; + int rc = SQLITE_OK; + + api = get_fts5_api(db); + if (!api) { + return FALSE; + } + + rc = api->xCreateTokenizer(api, + "geary_tokeniser", + NULL, + tokeniser, + NULL); + + return (rc == SQLITE_OK) ? TRUE : FALSE; +} + +// Entry point for external loadable library, required when using +// command line SQLite tool. The name of this function must match the +// name of the shared module. +int sqlite3_gearytokeniser_init(sqlite3 *db, + char **error_message, + const sqlite3_api_routines *api) { + g_info("Loading geary_tokeniser\n"); + SQLITE_EXTENSION_INIT2(api); + return sqlite3_register_fts5_tokeniser(db) ? SQLITE_OK : SQLITE_ABORT; +} diff --git a/src/engine/meson.build b/src/engine/meson.build index b3727861..ba9941ff 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -178,6 +178,7 @@ engine_vala_sources = files( 'imap-db/imap-db-email-identifier.vala', 'imap-db/imap-db-folder.vala', 'imap-db/imap-db-fts5-matches.c', + 'imap-db/imap-db-fts5-tokeniser.c', 'imap-db/imap-db-gc.vala', 'imap-db/imap-db-message-row.vala', 'imap-db/imap-db-sqlite.c', @@ -324,6 +325,7 @@ engine_dependencies = [ gio, glib, gmime, + icu, libmath, libstemmer, libxml, @@ -337,10 +339,17 @@ endif engine_build_dir = meson.current_build_dir() +engine_c_args = geary_c_args +engine_vala_args = geary_vala_args + +# Suppress SQLite loadable module init code +engine_c_args += [ + '-D', 'SQLITE_CORE', +] + # Generate internal VAPI for unit testing. See Meson issue # https://github.com/mesonbuild/meson/issues/1781 for official # internal VAPI support. 
-engine_vala_args = geary_vala_args engine_vala_args += [ '--internal-header=@0@/geary-engine-internal.h'.format(engine_build_dir), '--internal-vapi=@0@/geary-engine-internal.vapi'.format(engine_build_dir) @@ -364,7 +373,7 @@ engine_lib = static_library('geary-engine', dependencies: engine_dependencies, include_directories: config_h_dir, vala_args: engine_vala_args, - c_args: geary_c_args, + c_args: engine_c_args, ) # Dummy target to tell Meson about the internal VAPI given the @@ -402,3 +411,14 @@ engine_internal_dep = declare_dependency( include_directories: include_directories('.'), sources: engine_internal_header_fixup ) + +# Compile a loadable library containing the custom tokeniser so SQLite +# command line app can still be used. +tokeniser_lib = shared_library('geary-tokeniser', + files('imap-db/imap-db-fts5-tokeniser.c'), + dependencies: [ glib, icu, sqlite ], + c_args: [ + # Enable GLib structured logging + '-DG_LOG_USE_STRUCTURED', + ], +)