Ensure Geary.ContactStoreImpl handles non english searches

Fix DB impl actually do UTF-8 case-insensitive search matching. Add
some unit tests.
This commit is contained in:
Michael Gratton 2019-06-22 14:11:10 +10:00
parent 788a06144f
commit 4e1aa32514
5 changed files with 147 additions and 35 deletions

View file

@ -98,19 +98,19 @@ internal class Geary.ContactStoreImpl : BaseObject, Geary.ContactStore {
GLib.Cancellable? cancellable)
throws GLib.Error {
Gee.Collection<Contact> contacts = new Gee.LinkedList<Contact>();
string normalised_query = query.make_valid().normalize().down();
string normalised_query = Geary.Db.normalise_case_insensitive_query(query);
if (!String.is_empty(normalised_query)) {
normalised_query = normalised_query + "%";
Db.Statement stmt = cx.prepare("""
SELECT * FROM ContactTable
WHERE highest_importance >= ? AND (
real_name LIKE ? COLLATE UTF8ICASE OR
normalized_email LIKE ? COLLATE UTF8ICASE
UTF8FOLD(real_name) LIKE ? OR
UTF8FOLD(email) LIKE ?
)
ORDER BY highest_importance DESC,
real_name IS NULL,
real_name COLLATE UTF8ICASE,
email COLLATE UTF8ICASE
real_name COLLATE UTF8COLL,
email COLLATE UTF8COLL
LIMIT ?
""");
stmt.bind_uint(0, min_importance);

View file

@ -81,6 +81,17 @@ public bool set_shared_cache_mode(bool enabled) {
return sqlite3_enable_shared_cache(enabled ? 1 : 0) == Sqlite.OK;
}
/** Standard transformation for case-insensitive string values. */
public inline string normalise_case_insensitive_query(string text) {
// This would be a place to do transliteration to improve query
// results, for example normalising `á` to `a`. The built-in GLib
// method `string.to_ascii()` does this but is too strong: It will
// convert e.g. CJK chars to `?`. The `string.tokenize_and_fold`
// function may work better but the calling interface is all
// wrong.
return text.normalize().casefold();
}
private void check_cancelled(string? method, Cancellable? cancellable) throws IOError {
if (cancellable != null && cancellable.is_cancelled())
throw new IOError.CANCELLED("%s cancelled", !String.is_empty(method) ? method : "Operation");

View file

@ -5,22 +5,45 @@
* (version 2.1 or later). See the COPYING file in this distribution.
*/
[CCode (cname = "g_utf8_casefold")]
extern string utf8_casefold(string data, ssize_t len);
[CCode (cname = "g_utf8_collate_key")]
extern string utf8_collate_key(string data, ssize_t len);
extern int sqlite3_unicodesn_register_tokenizer(Sqlite.Database db);
private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
/** Name of UTF-8 case-sensitive SQLite collation function name. */
public const string UTF8_CASE_INSENSITIVE_COLLATION = "UTF8ICASE";
private static int case_insensitive_collation(int a_len, void* a_bytes,
int b_len, void* b_bytes) {
string a_str = utf8_casefold((string) a_bytes, a_len).collate_key();
string b_str = utf8_casefold((string) b_bytes, b_len).collate_key();
return strcmp(a_str, b_str);
/** SQLite UTF-8 case-insensitive, transliterating function name. */
public const string UTF8_CASE_INSENSITIVE_FN = "UTF8FOLD";
/** SQLite UTF-8 collation name. */
public const string UTF8_COLLATE = "UTF8COLL";
private static void utf8_transliterate_fold(Sqlite.Context context,
Sqlite.Value[] values) {
string? text = values[0].to_text();
if (text != null) {
context.result_text(Geary.Db.normalise_case_insensitive_query(text));
} else {
context.result_value(values[0]);
}
}
private static int utf8_collate(int a_len, void* a_bytes,
int b_len, void* b_bytes) {
// Don't need to normalise, collate_key() will do it for us
string? a_str = null;
if (a_bytes != null) {
a_str = utf8_collate_key((string) a_bytes, a_len);
}
string? b_str = null;
if (b_bytes != null) {
b_str = utf8_collate_key((string) b_bytes, b_len);
}
return GLib.strcmp(a_str, b_str);
}
internal GLib.File attachments_path;
private const int OPEN_PUMP_EVENT_LOOP_MSEC = 100;
@ -598,14 +621,29 @@ private class Geary.ImapDB.Database : Geary.Db.VersionedDatabase {
cx.set_recursive_triggers(true);
cx.set_synchronous(Db.SynchronousMode.NORMAL);
sqlite3_unicodesn_register_tokenizer(cx.db);
if (cx.db.create_collation(
UTF8_CASE_INSENSITIVE_COLLATION,
if (cx.db.create_function(
UTF8_CASE_INSENSITIVE_FN,
1, // n args
Sqlite.UTF8,
Database.case_insensitive_collation
null,
Database.utf8_transliterate_fold,
null,
null
) != Sqlite.OK) {
throw new DatabaseError.GENERAL(
"Failed to register collation function %s",
UTF8_CASE_INSENSITIVE_COLLATION
"Failed to register function %s",
UTF8_CASE_INSENSITIVE_FN
);
}
if (cx.db.create_collation(
UTF8_COLLATE,
Sqlite.UTF8,
Database.utf8_collate
) != Sqlite.OK) {
throw new DatabaseError.GENERAL(
"Failed to register collation %s", UTF8_COLLATE
);
}
}

View file

@ -20,6 +20,8 @@ class Geary.ContactStoreImplTest : TestCase {
add_test("search_no_match", search_no_match);
add_test("search_email_match", search_email_match);
add_test("search_name_match", search_name_match);
add_test("search_utf8_latin_names", search_utf8_latin_names);
add_test("search_utf8_multi_byte_names", search_utf8_multi_byte_names);
add_test("update_new_contact", update_new_contact);
add_test("update_existing_contact", update_existing_contact);
}
@ -45,20 +47,20 @@ class Geary.ContactStoreImplTest : TestCase {
this.db.open.end(async_result());
this.db.exec("""
INSERT INTO ContactTable (
id,
normalized_email,
real_name,
email,
highest_importance
) VALUES (
1,
'test@example.com',
'Test Name',
'Test@example.com',
50
);
""");
INSERT INTO ContactTable (
id,
normalized_email,
real_name,
email,
highest_importance
) VALUES (
1,
'test@example.com',
'Test Name',
'Test@example.com',
50
);
""");
this.test_article = new ContactStoreImpl(this.db);
}
@ -152,6 +154,67 @@ INSERT INTO ContactTable (
assert_false(search_hit.flags.always_load_remote_images(), "Existing flags");
}
public void search_utf8_latin_names() throws GLib.Error {
this.db.exec("""
INSERT INTO ContactTable (
real_name,
email,
normalized_email,
highest_importance
) VALUES (
'Germán',
'latin@example.com',
'latin@example.com',
50
);
""");
test_article.search.begin(
"germá",
0,
10,
null,
(obj, ret) => { async_complete(ret); }
);
Gee.Collection<Contact> results = test_article.search.end(
async_result()
);
assert_int(1, results.size, "results.size");
Contact search_hit = Collection.get_first(results);
assert_string("Germán", search_hit.real_name, "Existing real_name");
}
public void search_utf8_multi_byte_names() throws GLib.Error {
this.db.exec("""
INSERT INTO ContactTable (
real_name,
email,
normalized_email,
highest_importance
) VALUES (
'1',
'cjk@example.com',
'cjk@example.com',
50
);
""");
test_article.search.begin(
"年収",
0,
10,
null,
(obj, ret) => { async_complete(ret); }
);
Gee.Collection<Contact> results = test_article.search.end(
async_result()
);
assert_int(1, results.size, "results.size");
Contact search_hit = Collection.get_first(results);
assert_string("年収1億円目指せ", search_hit.real_name, "Existing real_name");
}
public void update_new_contact() throws GLib.Error {
Contact not_persisted = new Contact(
"New@example.com",

View file

@ -147,10 +147,10 @@ class Geary.ImapDB.DatabaseTest : TestCase {
INSERT INTO Test (test_str) VALUES ('BB');
INSERT INTO Test (test_str) VALUES ('🤯');
""");
string[] expected = { "🤯", "BB", "B", "a" };
string[] expected = { "a", "BB", "B", "🤯" };
Db.Result result = db.query(
"SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8ICASE DESC"
"SELECT test_str FROM Test ORDER BY test_str COLLATE UTF8COLL DESC"
);
int i = 0;