Util.Email.SearchExpressionFactory: Use ICU for word breaking

Implement search query text word segmentation using ICU, so that
languages that don't use spaces for word delimiters are correctly
tokenised.
This commit is contained in:
Michael Gratton 2021-01-19 20:42:26 +11:00 committed by Michael James Gratton
parent 2f81fdf146
commit 642bf00e88
2 changed files with 62 additions and 33 deletions

View file

@ -1,6 +1,6 @@
/* /*
* Copyright 2016 Software Freedom Conservancy Inc. * Copyright 2016 Software Freedom Conservancy Inc.
* Copyright 2019 Michael Gratton <mike@vee.net> * Copyright 2019-2021 Michael Gratton <mike@vee.net>
* *
* This software is licensed under the GNU Lesser General Public License * This software is licensed under the GNU Lesser General Public License
* (version 2.1 or later). See the COPYING file in this distribution. * (version 2.1 or later). See the COPYING file in this distribution.
@ -350,6 +350,10 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
private class Tokeniser { private class Tokeniser {
[Flags]
private enum CharStatus { NONE, IN_WORD, END_WORD; }
// These characters are chosen for being commonly used to // These characters are chosen for being commonly used to
// continue a single word (such as extended last names, // continue a single word (such as extended last names,
// i.e. "Lars-Eric") or in terms commonly searched for in an // i.e. "Lars-Eric") or in terms commonly searched for in an
@ -365,7 +369,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
} }
public bool is_at_word { public bool is_at_word {
get { return (this.attrs[this.current_c].is_word_start == 1); } get { return CharStatus.IN_WORD in this.char_status[this.current_pos]; }
} }
public bool is_at_quote { public bool is_at_quote {
@ -380,30 +384,51 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
private int next_pos = 0; private int next_pos = 0;
private unichar c = 0; private unichar c = 0;
private int current_c = -1; private CharStatus[] char_status;
private Pango.LogAttr[] attrs;
public Tokeniser(string query, Pango.Language language) { public Tokeniser(string query) {
this.query = query; this.query = query;
// Break up search string into individual words and/or // Break up search string into individual words and/or
// operators. Can't simply break on space or non-alphanumeric // operators. Can't simply break on space or
// chars since some languages don't use spaces, so use Pango // non-alphanumeric chars since some languages don't use
// for its support for the Unicode UAX #29 word boundary spec. // spaces, so use ICU for its support for the Unicode UAX
this.attrs = new Pango.LogAttr[query.char_count() + 1]; // #29 word boundary spec and dictionary-based breaking
Pango.get_log_attrs( // for languages that do not use spaces for word breaks.
query, query.length, -1, language, this.attrs
this.char_status = new CharStatus[query.length + 1];
var icu_err = Icu.ErrorCode.ZERO_ERROR;
var icu_text = Icu.Text.open_utf8(null, this.query.data, ref icu_err);
var word_breaker = Icu.BreakIterator.open(
WORD, "en", null, -1, ref icu_err
); );
word_breaker.set_utext(icu_text, ref icu_err);
int32 prev_index = 0;
var current_index = word_breaker.first();
var status = 0;
while (current_index != Icu.BreakIterator.DONE) {
status = word_breaker.rule_status;
if (!(status >= Icu.BreakIterator.WordBreak.NONE &&
status < Icu.BreakIterator.WordBreak.NONE_LIMIT)) {
for (int i = prev_index; i < current_index; i++) {
this.char_status[i] |= IN_WORD;
}
this.char_status[current_index] |= END_WORD;
}
prev_index = current_index;
current_index = word_breaker.next();
}
consume_char(); consume_char();
} }
public void consume_char() { public void consume_char() {
var current_pos = this.next_pos; var current_pos = this.next_pos;
if (this.query.get_next_char(ref this.next_pos, out this.c)) { this.query.get_next_char(ref this.next_pos, out this.c);
this.current_c++;
}
this.current_pos = current_pos; this.current_pos = current_pos;
} }
@ -415,13 +440,11 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
public string consume_word() { public string consume_word() {
var start = this.current_pos; var start = this.current_pos;
// the attr.is_word_end value applies to the first char
// after the end of a word, so need to move one past the
// end of the current word to determine where it ends
consume_char(); consume_char();
while (this.has_next && while (this.has_next &&
this.c != OPERATOR_SEPARATOR &&
(this.c in CONTINUATION_CHARS || (this.c in CONTINUATION_CHARS ||
this.attrs[this.current_c].is_word_end != 1)) { !(CharStatus.END_WORD in this.char_status[this.current_pos]))) {
consume_char(); consume_char();
} }
return this.query.slice(start, this.current_pos); return this.query.slice(start, this.current_pos);
@ -446,10 +469,6 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
public Geary.AccountInformation account { get; private set; } public Geary.AccountInformation account { get; private set; }
public Pango.Language language {
get; set; default = Pango.Language.get_default();
}
// Maps of localised search operator names and values to their // Maps of localised search operator names and values to their
// internal forms // internal forms
private Gee.Map<string,FactoryContext> text_operators = private Gee.Map<string,FactoryContext> text_operators =
@ -470,7 +489,7 @@ public class Util.Email.SearchExpressionFactory : Geary.BaseObject {
/** Constructs a search expression from the given query string. */ /** Constructs a search expression from the given query string. */
public Gee.List<Geary.SearchQuery.Term> parse_query(string query) { public Gee.List<Geary.SearchQuery.Term> parse_query(string query) {
var operands = new Gee.LinkedList<Geary.SearchQuery.Term>(); var operands = new Gee.LinkedList<Geary.SearchQuery.Term>();
var tokens = new Tokeniser(query, this.language); var tokens = new Tokeniser(query);
while (tokens.has_next) { while (tokens.has_next) {
if (tokens.is_at_word) { if (tokens.is_at_word) {
Geary.SearchQuery.Term? op = null; Geary.SearchQuery.Term? op = null;

View file

@ -195,18 +195,28 @@ public class Util.Email.Test : TestCase {
this.config.get_search_strategy(), this.config.get_search_strategy(),
this.account this.account
); );
test_article.language = Pango.Language.from_string("th");
var multiple = test_article.parse_query("ภาษาไทย"); var thai = test_article.parse_query("ภาษาไทย");
assert_collection(multiple).size(2); assert_collection(thai).size(2);
assert_true(multiple[0] is Geary.SearchQuery.EmailTextTerm); assert_true(thai[0] is Geary.SearchQuery.EmailTextTerm);
assert_true(multiple[1] is Geary.SearchQuery.EmailTextTerm); assert_true(thai[1] is Geary.SearchQuery.EmailTextTerm);
assert_collection( assert_collection(
((Geary.SearchQuery.EmailTextTerm) multiple[0]).terms ((Geary.SearchQuery.EmailTextTerm) thai[0]).terms
).size(1).contains("ภาษา"); ).size(1).contains("ภาษา");
assert_collection( assert_collection(
((Geary.SearchQuery.EmailTextTerm) multiple[1]).terms ((Geary.SearchQuery.EmailTextTerm) thai[1]).terms
).size(1).contains("ไทย"); ).size(1).contains("ไทย");
var chinese = test_article.parse_query("男子去");
assert_collection(chinese).size(2);
assert_true(chinese[0] is Geary.SearchQuery.EmailTextTerm);
assert_true(chinese[1] is Geary.SearchQuery.EmailTextTerm);
assert_collection(
((Geary.SearchQuery.EmailTextTerm) chinese[0]).terms
).size(1).contains("男子");
assert_collection(
((Geary.SearchQuery.EmailTextTerm) chinese[1]).terms
).size(1).contains("去");
} }
public void multiple_search_terms() throws GLib.Error { public void multiple_search_terms() throws GLib.Error {
@ -277,10 +287,10 @@ public class Util.Email.Test : TestCase {
var simple_body = test_article.parse_query("body:hello"); var simple_body = test_article.parse_query("body:hello");
assert_collection(simple_body).size(1); assert_collection(simple_body).size(1);
assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm); assert_true(simple_body[0] is Geary.SearchQuery.EmailTextTerm, "type");
var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm; var text_body = simple_body[0] as Geary.SearchQuery.EmailTextTerm;
assert_true(text_body.target == BODY); assert_true(text_body.target == BODY, "target");
assert_true(text_body.matching_strategy == CONSERVATIVE); assert_true(text_body.matching_strategy == CONSERVATIVE, "strategy");
assert_collection(text_body.terms).size(1).contains("hello"); assert_collection(text_body.terms).size(1).contains("hello");
var simple_body_quoted = test_article.parse_query("body:\"hello\""); var simple_body_quoted = test_article.parse_query("body:\"hello\"");