diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 80be0f43..2b02d93e 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ variables: meson vala desktop-file-utils enchant2-devel folks-devel gcr-devel glib2-devel gmime30-devel gnome-online-accounts-devel gspell-devel gsound-devel gtk3-devel iso-codes-devel json-glib-devel itstool - libappstream-glib-devel libgee-devel libhandy1-devel + libappstream-glib-devel libgee-devel libhandy1-devel libicu-devel libpeas-devel libsecret-devel libstemmer-devel libunwind-devel libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel FEDORA_TEST_DEPS: glibc-langpack-en gnutls-utils tar Xvfb xz @@ -37,9 +37,9 @@ variables: itstool libappstream-glib-dev libenchant-2-dev libfolks-dev libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev - libhandy-1-dev libjson-glib-dev libmessaging-menu-dev libpeas-dev - libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev - libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev + libhandy-1-dev libicu-dev libjson-glib-dev libmessaging-menu-dev + libpeas-dev libsecret-1-dev libsqlite3-dev libstemmer-dev + libunwind-dev libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev UBUNTU_TEST_DEPS: gnutls-bin librsvg2-common locales xauth xvfb fedora: diff --git a/BUILDING.md b/BUILDING.md index f8ca45ae..f63b1041 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -93,8 +93,9 @@ sudo dnf install meson vala desktop-file-utils enchant2-devel \ gnome-online-accounts-devel gspell-devel gsound-devel \ gtk3-devel iso-codes-devel itstool json-glib-devel \ libappstream-glib-devel libgee-devel libhandy1-devel \ - libpeas-devel libsecret-devel libstemmer-devel libunwind-devel \ - libxml2-devel libytnef-devel sqlite-devel webkitgtk4-devel + libpeas-devel libsecret-devel libicu-devel libstemmer-devel \ + libunwind-devel libxml2-devel libytnef-devel sqlite-devel \ + webkitgtk4-devel ``` Installing dependencies on Ubuntu/Debian @@ -108,8 +109,8 @@ sudo 
apt-get install meson build-essential valac \ libappstream-glib-dev libenchant-2-dev libfolks-dev \ libgcr-3-dev libgee-0.8-dev libglib2.0-dev libgmime-3.0-dev \ libgoa-1.0-dev libgspell-1-dev libgsound-dev libgtk-3-dev \ - libjson-glib-dev libhandy-1-dev libpeas-dev libsecret-1-dev \ - libsqlite3-dev libstemmer-dev libunwind-dev \ + libjson-glib-dev libhandy-1-dev libicu-dev libpeas-dev \ + libsecret-1-dev libsqlite3-dev libstemmer-dev libunwind-dev \ libwebkit2gtk-4.0-dev libxml2-dev libytnef0-dev ``` diff --git a/meson.build b/meson.build index ce279f55..32584995 100644 --- a/meson.build +++ b/meson.build @@ -85,6 +85,7 @@ goa = dependency('goa-1.0') gsound = dependency('gsound') gspell = dependency('gspell-1') gthread = dependency('gthread-2.0', version: '>=' + target_glib) +icu_uc = dependency('icu-uc', version: '>=60') iso_codes = dependency('iso-codes') javascriptcoregtk = dependency('javascriptcoregtk-4.0', version: '>=' + target_webkit) json_glib = dependency('json-glib-1.0', version: '>= 1.0') @@ -130,6 +131,15 @@ libstemmer = declare_dependency( ], ) +# Faux ICU dependency to prevent ICU being passed to valac as a +# package by meson +icu = declare_dependency( + dependencies: [ + cc.find_library('icuuc'), + cc.find_library('icudata'), + ], +) + # Optional dependencies appstream_util = find_program('appstream-util', required: false) desktop_file_validate = find_program('desktop-file-validate', required: false) diff --git a/sql/version-030.sql b/sql/version-030.sql index 48af04df..4fbde30d 100644 --- a/sql/version-030.sql +++ b/sql/version-030.sql @@ -14,6 +14,6 @@ CREATE VIRTUAL TABLE MessageSearchTable USING fts5( bcc, flags, - tokenize="unicode61 remove_diacritics 2", + tokenize="geary_tokeniser", prefix="2,4,6,8,10" ) diff --git a/src/engine/imap-db/imap-db-database.vala b/src/engine/imap-db/imap-db-database.vala index 9365f876..45286f9b 100644 --- a/src/engine/imap-db/imap-db-database.vala +++ b/src/engine/imap-db/imap-db-database.vala @@ -7,6 +7,7 @@ 
 [CCode (cname = "g_utf8_collate_key")]
 extern string utf8_collate_key(string data, ssize_t len);
+extern int sqlite3_register_fts5_tokeniser(Sqlite.Database db);
 extern int sqlite3_register_fts5_matches(Sqlite.Database db);
 extern int sqlite3_register_legacy_tokenizer(Sqlite.Database db);
@@ -630,8 +631,13 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
             sqlite3_register_legacy_tokenizer(cx.db);
         }
 
-        // Register custom `geary_matches()` FTS5 function to obtain
-        // matching tokens from FTS queries.
+        // Register custom FTS5 tokeniser that uses ICU to correctly
+        // segment at both Latin and non-Latin (e.g. CJK, Thai) word
+        // boundaries.
+        sqlite3_register_fts5_tokeniser(cx.db);
+
+        // Register custom `geary_matches()` FTS5 function that
+        // obtains matching tokens from FTS queries.
         sqlite3_register_fts5_matches(cx.db);
 
         if (cx.db.create_function(
diff --git a/src/engine/imap-db/imap-db-fts5-tokeniser.c b/src/engine/imap-db/imap-db-fts5-tokeniser.c
new file mode 100644
index 00000000..2991a56e
--- /dev/null
+++ b/src/engine/imap-db/imap-db-fts5-tokeniser.c
@@ -0,0 +1,275 @@
+/*
+ * Copyright © 2020 Michael Gratton
+ *
+ * This software is licensed under the GNU Lesser General Public License
+ * (version 2.1 or later). See the COPYING file in this distribution.
+ */
+
+#include <sqlite3ext.h>
+SQLITE_EXTENSION_INIT1
+
+#include <glib.h>
+#include <stdint.h>
+#include <unicode/ubrk.h>
+#include <unicode/unorm2.h>
+#include <unicode/ustring.h>
+#include "unicode/utf.h"
+#include "unicode/utypes.h"
+
+// Full text search tokeniser for SQLite. This exists since SQLite's
+// existing Unicode tokeniser doesn't work with languages that don't
+// use spaces as word boundaries.
+//
+// When generating tokens, the following process is applied to text
+// using the ICU library:
+//
+// 1. ICU NFKC_Casefold normalisation, handles normalisation, case
+//    folding and removal of ignorable characters such as accents.
+//
+// 2. ICU word-boundary tokenisation, splits words both at spaces
+//    and other punctuation, and also uses a dictionary lookup for
+//    languages that do not use spaces (CJK, Thai, etc)
+//
+// Note: Since SQLite is single-threaded, it's safe to use single
+// instances of ICU services for all calls for a single tokeniser.
+
+#define NORM_BUF_LEN 8
+#define TOKEN_BUF_LEN 8
+
+typedef struct {
+    // Singleton object, threadsafe, does not need to be deleted.
+    const UNormalizer2 * norm;
+
+    // Stateful object, not threadsafe, must be deleted.
+    UBreakIterator *iter;
+} IcuTokeniser;
+
+
+static int icu_create(void *context,
+                      const char **args,
+                      int n_args,
+                      Fts5Tokenizer **ret) {
+    const UNormalizer2 *norm;
+    UBreakIterator *iter;
+    IcuTokeniser *tokeniser;
+    UErrorCode err = U_ZERO_ERROR;
+
+    norm = unorm2_getNFKCCasefoldInstance(&err);
+    if (U_FAILURE(err)) {
+        g_warning("Error constructing ICU normaliser: %s", u_errorName(err));
+        return SQLITE_ABORT;
+    }
+
+    // The given locale doesn't matter here since ICU doesn't
+    // (currently) use different word-breaking rules for different
+    // languages that use spaces as word boundaries, and uses
+    // dictionary look-ups for CJK and other scripts that don't.
+    iter = ubrk_open(UBRK_WORD, "en", NULL, 0, &err);
+    if (U_FAILURE(err)) {
+        g_warning("Error constructing ICU word-breaker: %s", u_errorName(err));
+        /* nothing to clean up here: `tokeniser` is not yet allocated */
+        return SQLITE_ABORT;
+    }
+
+    tokeniser = g_new0(IcuTokeniser, 1);
+    tokeniser->norm = norm;
+    tokeniser->iter = iter;
+    *ret = (Fts5Tokenizer *) tokeniser;
+
+    return SQLITE_OK;
+}
+
+static void icu_delete(Fts5Tokenizer *fts5_tokeniser) {
+    IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser;
+
+    ubrk_close(tokeniser->iter);
+    g_free(tokeniser);
+}
+
+static int icu_tokenise(Fts5Tokenizer *fts5_tokeniser,
+                        void *context,
+                        int flags,
+                        const char *chars,
+                        int32_t chars_len,
+                        int (*token_callback)(void*, int, const char*, int, int, int)) {
+    int ret = SQLITE_OK;
+    IcuTokeniser *tokeniser = (IcuTokeniser *) fts5_tokeniser;
+    UErrorCode err = U_ZERO_ERROR;
+
+    const UNormalizer2 *norm = tokeniser->norm;
+    GArray *wide_chars = NULL;
+    GArray *wide_offsets = NULL;
+    UChar *wide_data = NULL;
+    gsize wide_data_len_long = 0;
+    int32_t wide_data_len = 0;
+
+    UChar norm_buf[NORM_BUF_LEN] = {0};
+
+    UBreakIterator *iter = tokeniser->iter;
+    int32_t start_index, current_index = 0;
+    char *token_buf = NULL;
+    int32_t token_buf_len = TOKEN_BUF_LEN;
+
+    // Normalisation.
+    //
+    // SQLite needs the byte-index of tokens found in the chars, but
+    // ICU doesn't support UTF-8-based normalisation. So convert UTF-8
+    // input to UTF-16 char-by-char and record the byte offsets for
+    // each, so that when converting back to UTF-8 the byte offsets
+    // can be determined.
+
+    wide_chars = g_array_sized_new(FALSE, FALSE, sizeof(UChar), chars_len);
+    wide_offsets = g_array_sized_new(FALSE, FALSE, sizeof(int32_t), chars_len);
+
+    for (int32_t byte_offset = 0; byte_offset < chars_len;) {
+        UChar wide_char;
+        int32_t norm_len;
+        int32_t start_byte_offset = byte_offset;
+
+        U8_NEXT_OR_FFFD(chars, byte_offset, chars_len, wide_char);
+        norm_len = unorm2_normalize(norm,
+                                    &wide_char, 1,
+                                    norm_buf, NORM_BUF_LEN,
+                                    &err);
+        if (U_FAILURE(err)) {
+            g_warning("Token text normalisation failed");
+            ret = SQLITE_ABORT;
+            goto cleanup;
+        }
+
+        // NFKC may decompose a single character into multiple
+        // characters, e.g. 'fi' into "fi", '…' into "...".
+        for (int i = 0; i < norm_len; i++) {
+            g_array_append_val(wide_chars, norm_buf[i]);
+            g_array_append_val(wide_offsets, start_byte_offset);
+        }
+    }
+
+    // Word breaking.
+    //
+    // UTF-16 is passed to the tokeniser, hence its indexes are
+    // character-based. Use the offset array to convert those back to
+    // byte indexes for individual tokens.
+
+    wide_data = (UChar *) g_array_steal(wide_chars, &wide_data_len_long);
+    wide_data_len = (int32_t) wide_data_len_long;
+
+    ubrk_setText(iter, wide_data, wide_data_len, &err);
+    if (U_FAILURE(err)) {
+        ret = SQLITE_ABORT;
+        g_warning("Setting word break iterator text failed");
+        goto cleanup;
+    }
+    start_index = 0;
+    current_index = ubrk_first(iter);
+    token_buf = g_malloc0(sizeof(char) * token_buf_len);
+    while (current_index != UBRK_DONE && ret == SQLITE_OK) {
+        int32_t status = ubrk_getRuleStatus(iter);
+        int32_t token_char_len = current_index - start_index;
+        if (token_char_len > 0 &&
+            !(status >= UBRK_WORD_NONE && status < UBRK_WORD_NONE_LIMIT) &&
+            !(status >= UBRK_WORD_NUMBER && status < UBRK_WORD_NUMBER_LIMIT)) {
+            int32_t token_byte_len = 0;
+            int32_t token_byte_start = 0;
+            int32_t token_byte_end = 0;
+
+            for (;;) {
+                u_strToUTF8WithSub(token_buf, token_buf_len, &token_byte_len,
+                                   wide_data + start_index, token_char_len,
+                                   0xFFFD, NULL,
+                                   &err);
+
+                if (U_SUCCESS(err)) {
+                    break;
+                } else if (err == U_BUFFER_OVERFLOW_ERROR) {
+                    token_buf_len *= 2;
+                    token_buf = g_realloc(token_buf, sizeof(char) * token_buf_len);
+                    err = U_ZERO_ERROR;
+                } else {
+                    ret = SQLITE_ABORT;
+                    g_warning("Conversion to UTF-8 failed");
+                    goto cleanup;
+                }
+            }
+
+            token_byte_start = g_array_index(wide_offsets, int32_t, start_index);
+            if (current_index < wide_data_len) {
+                token_byte_end = g_array_index(wide_offsets, int32_t, current_index);
+            } else {
+                token_byte_end = chars_len;
+            }
+
+            ret = token_callback(context,
+                                 0,
+                                 token_buf,
+                                 token_byte_len,
+                                 token_byte_start,
+                                 token_byte_end);
+        }
+
+        start_index = current_index;
+        current_index = ubrk_next(iter);
+    }
+
+ cleanup:
+    g_free(wide_data);
+    g_array_unref(wide_chars);
+    g_array_unref(wide_offsets);
+    g_free(token_buf);
+
+    return ret;
+}
+
+static fts5_api *get_fts5_api(sqlite3 *db) {
+    int rc = SQLITE_OK;
+    sqlite3_stmt *stmt;
+    fts5_api *api = NULL;
+
+    rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)",
+                            -1, &stmt, 0);
if (rc != SQLITE_OK) { + return NULL; + } + + sqlite3_bind_pointer(stmt, 1, (void*) &api, "fts5_api_ptr", NULL); + sqlite3_step(stmt); + sqlite3_finalize(stmt); + + return api; +} + +static const fts5_tokenizer icu_tokeniser = { + icu_create, + icu_delete, + icu_tokenise +}; + +gboolean sqlite3_register_fts5_tokeniser(sqlite3 *db) { + fts5_api *api; + fts5_tokenizer *tokeniser = (fts5_tokenizer *) &icu_tokeniser; + int rc = SQLITE_OK; + + api = get_fts5_api(db); + if (!api) { + return FALSE; + } + + rc = api->xCreateTokenizer(api, + "geary_tokeniser", + NULL, + tokeniser, + NULL); + + return (rc == SQLITE_OK) ? TRUE : FALSE; +} + +// Entry point for external loadable library, required when using +// command line SQLite tool. The name of this function must match the +// name of the shared module. +int sqlite3_gearytokeniser_init(sqlite3 *db, + char **error_message, + const sqlite3_api_routines *api) { + g_info("Loading geary_tokeniser\n"); + SQLITE_EXTENSION_INIT2(api); + return sqlite3_register_fts5_tokeniser(db) ? SQLITE_OK : SQLITE_ABORT; +} diff --git a/src/engine/meson.build b/src/engine/meson.build index b3727861..ba9941ff 100644 --- a/src/engine/meson.build +++ b/src/engine/meson.build @@ -178,6 +178,7 @@ engine_vala_sources = files( 'imap-db/imap-db-email-identifier.vala', 'imap-db/imap-db-folder.vala', 'imap-db/imap-db-fts5-matches.c', + 'imap-db/imap-db-fts5-tokeniser.c', 'imap-db/imap-db-gc.vala', 'imap-db/imap-db-message-row.vala', 'imap-db/imap-db-sqlite.c', @@ -324,6 +325,7 @@ engine_dependencies = [ gio, glib, gmime, + icu, libmath, libstemmer, libxml, @@ -337,10 +339,17 @@ endif engine_build_dir = meson.current_build_dir() +engine_c_args = geary_c_args +engine_vala_args = geary_vala_args + +# Suppress SQLite loadable module init code +engine_c_args += [ + '-D', 'SQLITE_CORE', +] + # Generate internal VAPI for unit testing. See Meson issue # https://github.com/mesonbuild/meson/issues/1781 for official # internal VAPI support. 
-engine_vala_args = geary_vala_args engine_vala_args += [ '--internal-header=@0@/geary-engine-internal.h'.format(engine_build_dir), '--internal-vapi=@0@/geary-engine-internal.vapi'.format(engine_build_dir) @@ -364,7 +373,7 @@ engine_lib = static_library('geary-engine', dependencies: engine_dependencies, include_directories: config_h_dir, vala_args: engine_vala_args, - c_args: geary_c_args, + c_args: engine_c_args, ) # Dummy target to tell Meson about the internal VAPI given the @@ -402,3 +411,14 @@ engine_internal_dep = declare_dependency( include_directories: include_directories('.'), sources: engine_internal_header_fixup ) + +# Compile a loadable library containing the custom tokeniser so SQLite +# command line app can still be used. +tokeniser_lib = shared_library('geary-tokeniser', + files('imap-db/imap-db-fts5-tokeniser.c'), + dependencies: [ glib, icu, sqlite ], + c_args: [ + # Enable GLib structured logging + '-DG_LOG_USE_STRUCTURED', + ], +)