Update email body normalization; fix #6837
This adds a basic, imperfect HTML -> text algorithm using libxml2. We use this to turn HTML email bodies into a searchable text corpus.
This commit is contained in:
parent
597a6fc9f8
commit
879f3a6b5d
5 changed files with 111 additions and 9 deletions
3
debian/control
vendored
3
debian/control
vendored
|
|
@ -10,6 +10,7 @@ Build-Depends: debhelper (>= 8),
|
|||
libnotify-dev (>=0.7.5),
|
||||
libcanberra-dev (>= 0.28),
|
||||
libwebkitgtk-3.0-dev (>= 1.10.0),
|
||||
libxml2-dev (>= 2.7.8),
|
||||
libsecret-1-dev (>= 0.11),
|
||||
libgmime-2.6-dev (>= 2.6.0),
|
||||
valac-0.18 (>= 0.17.4),
|
||||
|
|
@ -33,7 +34,7 @@ Depends: ${shlibs:Depends}, ${misc:Depends},
|
|||
libnotify4 (>= 0.7.5),
|
||||
libcanberra0 (>= 0.28),
|
||||
libwebkitgtk-3.0-0 (>= 1.10.0),
|
||||
libxml2 (>= 2.6.32),
|
||||
libxml2 (>= 2.7.8),
|
||||
libsqlite3-0 (>= 3.7.4),
|
||||
libsecret-1-0 (>= 0.11),
|
||||
libmessaging-menu0 (>= 12.10.2),
|
||||
|
|
|
|||
|
|
@ -362,11 +362,12 @@ pkg_check_modules(DEPS REQUIRED
|
|||
gmime-2.6>=2.6.0
|
||||
libsecret-1>=0.11
|
||||
webkitgtk-3.0>=1.10.0
|
||||
libxml-2.0>=2.7.8
|
||||
${EXTRA_CLIENT_PKG_CONFIG}
|
||||
)
|
||||
|
||||
set(ENGINE_PACKAGES
|
||||
glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3
|
||||
glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3 libxml-2.0
|
||||
)
|
||||
|
||||
set(CLIENT_PACKAGES
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ public class Geary.Engine : BaseObject {
|
|||
RFC822.init();
|
||||
ImapEngine.init();
|
||||
Imap.init();
|
||||
HTML.init();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -465,7 +465,7 @@ public class Geary.RFC822.Message : BaseObject {
|
|||
html = true;
|
||||
} catch (Error e) {
|
||||
try {
|
||||
body = get_text_body();
|
||||
body = get_text_body(false);
|
||||
} catch (Error e) {
|
||||
// Ignore.
|
||||
}
|
||||
|
|
@ -475,12 +475,8 @@ public class Geary.RFC822.Message : BaseObject {
|
|||
|
||||
// TODO: add bodies of attached emails.
|
||||
|
||||
if (html) {
|
||||
// FIXME: this is inadequate. For example, <br> needs to be turned
|
||||
// into at least one space character, not just omitted. Also, we
|
||||
// should also replace entities with the characters they represent.
|
||||
body = Geary.HTML.remove_html_tags(body);
|
||||
}
|
||||
if (html)
|
||||
body = Geary.HTML.html_to_text(body);
|
||||
|
||||
return body;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,72 @@
|
|||
|
||||
namespace Geary.HTML {
|
||||
|
||||
private int init_count = 0;
|
||||
private Gee.HashSet<string>? breaking_elements = null;
|
||||
|
||||
/**
|
||||
* Must be called before ''any'' call to the HTML namespace.
|
||||
*
|
||||
* This will be initialized by the Engine when it's opened; or call this
|
||||
* directly to use these functions earlier.
|
||||
*/
|
||||
public void init() {
    // Every call is counted, but only the very first one performs set-up.
    bool first_call = (init_count == 0);
    init_count++;

    if (first_call)
        init_breaking_elements();
}
|
||||
|
||||
// Fills breaking_elements with the names of the HTML elements that should
// interrupt the flow of text when HTML is flattened.  Names are hashed and
// compared with String.stri_hash / String.stri_equal (presumably
// case-insensitive -- confirm against Geary.String).
private void init_breaking_elements() {
    // Compiled from <https://en.wikipedia.org/wiki/HTML_element>: the block
    // elements plus a handful of others that get special treatment.
    // NOTE: this SHOULD be a const array, but
    // <https://bugzilla.gnome.org/show_bug.cgi?id=646970> prevents that.
    string[] tag_names = {
        // Elements treated as block-level.
        "address", "blockquote", "center",
        "dd", "dir", "div", "dl", "dt",
        "embed",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "hr",
        "li", "menu", "ol", "p", "pre",
        "table", "tbody", "td", "tfoot", "th", "thead", "tr",
        "ul",

        // Not block elements, but they still break up the text.
        "br", "img", "map", "object",

        // Oddities that are nonetheless assumed to break the flow.
        "caption", "noscript", "script",

        // Usable as either block or inline; always treated as breaking.
        "del", "ins",
    };

    // Build the set locally, then publish it in one assignment.
    Gee.HashSet<string> names = new Gee.HashSet<string>(String.stri_hash, String.stri_equal);
    foreach (string tag_name in tag_names)
        names.add(tag_name);

    breaking_elements = names;
}
|
||||
|
||||
// Escapes plain text for use in markup.  Null, empty, or text that fails
// string.validate() yields the empty string rather than an escaped result.
public inline string escape_markup(string? plain) {
    if (String.is_empty(plain))
        return "";

    if (!plain.validate())
        return "";

    return Markup.escape_text(plain);
}
|
||||
|
|
@ -69,4 +135,41 @@ public string remove_html_tags(string input) {
|
|||
return input;
|
||||
}
|
||||
|
||||
/** Does a very approximate conversion from HTML to text. This does more than
|
||||
* stripping tags -- it inserts line breaks where appropriate, decodes
|
||||
* entities, etc. The layout of the text is largely lost. This is primarily
|
||||
* useful for pulling out tokens for searching, not for presenting to the user.
|
||||
*/
|
||||
public string html_to_text(string html, string encoding = "UTF-8") {
    // Parse leniently and quietly, without network access.
    int parser_flags = Html.ParserOption.RECOVER
        | Html.ParserOption.NOERROR
        | Html.ParserOption.NOWARNING
        | Html.ParserOption.NOBLANKS
        | Html.ParserOption.NONET
        | Html.ParserOption.COMPACT;

    Html.Doc *doc = Html.Doc.read_doc(html, "", encoding, parser_flags);
    if (doc == null)
        return "";

    StringBuilder flattened = new StringBuilder();
    recurse_html_nodes_for_text(doc->get_root_element(), flattened);

    // The document is held through a raw pointer, so it is freed by hand.
    delete doc;

    return flattened.str;
}
|
||||
|
||||
/**
 * Appends the text found in //node//, its following siblings, and all of
 * their descendants to //text//, visiting nodes depth-first in document order.
 *
 * Text nodes contribute their content verbatim.  An element for which
 * element_needs_break() returns true is preceded by a newline and, when it has
 * children, followed by one as well, so that words on either side of the
 * element never run together (e.g. "<div>foo</div>bar" must not yield the
 * single token "foobar").
 *
 * @param node first node of the sibling chain to visit, or null for none
 * @param text accumulator that receives the extracted text
 */
private void recurse_html_nodes_for_text(Xml.Node? node, StringBuilder text) {
    // TODO: add alt text for things that have it?

    for (unowned Xml.Node? n = node; n != null; n = n.next) {
        bool breaks_flow = false;

        if (n.type == Xml.ElementType.TEXT_NODE) {
            // Guard against a content-less text node; appending null would
            // trigger a GLib critical.
            if (n.content != null)
                text.append(n.content);
        } else if (n.type == Xml.ElementType.ELEMENT_NODE) {
            breaks_flow = element_needs_break(n.name);
        }

        if (breaks_flow)
            text.append("\n");

        recurse_html_nodes_for_text(n.children, text);

        // Close the break after the element's own text too; childless elements
        // such as <br> already got their single break above.
        if (breaks_flow && n.children != null)
            text.append("\n");
    }
}
|
||||
|
||||
// Determines if the named element should break the flow of text.
|
||||
private bool element_needs_break(string element) {
    // The set is normally built by init(); build it on demand so that an
    // early caller does not dereference a null set.
    if (breaking_elements == null)
        init_breaking_elements();

    return element in breaking_elements;
}
|
||||
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue