Update email body normalization; fix #6837

This adds a basic, imperfect HTML -> text algorithm using libxml2.  We
use this to turn HTML email bodies into a searchable text corpus.
This commit is contained in:
Charles Lindsay 2013-05-21 17:07:16 -07:00
parent 597a6fc9f8
commit 879f3a6b5d
5 changed files with 111 additions and 9 deletions

3
debian/control vendored
View file

@ -10,6 +10,7 @@ Build-Depends: debhelper (>= 8),
libnotify-dev (>=0.7.5),
libcanberra-dev (>= 0.28),
libwebkitgtk-3.0-dev (>= 1.10.0),
libxml2-dev (>= 2.7.8),
libsecret-1-dev (>= 0.11),
libgmime-2.6-dev (>= 2.6.0),
valac-0.18 (>= 0.17.4),
@ -33,7 +34,7 @@ Depends: ${shlibs:Depends}, ${misc:Depends},
libnotify4 (>= 0.7.5),
libcanberra0 (>= 0.28),
libwebkitgtk-3.0-0 (>= 1.10.0),
libxml2 (>= 2.6.32),
libxml2 (>= 2.7.8),
libsqlite3-0 (>= 3.7.4),
libsecret-1-0 (>= 0.11),
libmessaging-menu0 (>= 12.10.2),

View file

@ -362,11 +362,12 @@ pkg_check_modules(DEPS REQUIRED
gmime-2.6>=2.6.0
libsecret-1>=0.11
webkitgtk-3.0>=1.10.0
libxml-2.0>=2.7.8
${EXTRA_CLIENT_PKG_CONFIG}
)
set(ENGINE_PACKAGES
glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3
glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3 libxml-2.0
)
set(CLIENT_PACKAGES

View file

@ -93,6 +93,7 @@ public class Geary.Engine : BaseObject {
RFC822.init();
ImapEngine.init();
Imap.init();
HTML.init();
}
/**

View file

@ -465,7 +465,7 @@ public class Geary.RFC822.Message : BaseObject {
html = true;
} catch (Error e) {
try {
body = get_text_body();
body = get_text_body(false);
} catch (Error e) {
// Ignore.
}
@ -475,12 +475,8 @@ public class Geary.RFC822.Message : BaseObject {
// TODO: add bodies of attached emails.
if (html) {
// FIXME: this is inadequate. For example, <br> needs to be turned
// into at least one space character, not just omitted. Also, we
// should also replace entities with the characters they represent.
body = Geary.HTML.remove_html_tags(body);
}
if (html)
body = Geary.HTML.html_to_text(body);
return body;
}

View file

@ -6,6 +6,72 @@
namespace Geary.HTML {
// Counts calls to init(); only the call that finds this at zero does any work.
private int init_count = 0;
// Names of elements that break the flow of text. Stays null until
// init_breaking_elements() has run.
private Gee.HashSet<string>? breaking_elements = null;
/**
 * One-time setup for the HTML namespace.
 *
 * This must run before ''any'' other function in the HTML namespace is used.
 * The Engine calls it when it is opened; call it directly if these functions
 * are needed earlier. Only the first call performs the setup; later calls just
 * bump the call counter.
 */
public void init() {
    bool first_call = (init_count == 0);
    init_count++;
    if (first_call)
        init_breaking_elements();
}
// Builds the set of element names that interrupt the flow of text and stores
// it in breaking_elements.
private void init_breaking_elements() {
    // The names below were collected from
    // <https://en.wikipedia.org/wiki/HTML_element>: the block elements plus a
    // few others that get special treatment.
    // NOTE: this SHOULD be a const list, but it can't be because of
    // <https://bugzilla.gnome.org/show_bug.cgi?id=646970>.
    string[] names = {
        "address",
        "blockquote",
        "br",        // not a block element, but still breaks up the text
        "caption",   // an oddity, but treated as breaking the flow
        "center",
        "dd",
        "del",       // may be block or inline; treated as breaking
        "dir",
        "div",
        "dl",
        "dt",
        "embed",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "hr",
        "img",       // not a block element, but still breaks up the text
        "ins",       // may be block or inline; treated as breaking
        "li",
        "map",       // not a block element, but still breaks up the text
        "menu",
        "noscript",  // an oddity, but treated as breaking the flow
        "object",    // not a block element, but still breaks up the text
        "ol",
        "p",
        "pre",
        "script",    // an oddity, but treated as breaking the flow
        "table",
        "tbody",
        "td",
        "tfoot",
        "th",
        "thead",
        "tr",
        "ul",
    };

    // Fill a local set first, then publish it. Hashing and equality come from
    // String.stri_hash / String.stri_equal (presumably case-insensitive, going
    // by their names -- confirm against the String helpers).
    Gee.HashSet<string> name_set = new Gee.HashSet<string>(String.stri_hash, String.stri_equal);
    for (int i = 0; i < names.length; i++)
        name_set.add(names[i]);
    breaking_elements = name_set;
}
/**
 * Escapes //plain// with Markup.escape_text().
 *
 * Returns the empty string instead when String.is_empty() reports //plain// as
 * empty, or when //plain// fails string.validate().
 */
public inline string escape_markup(string? plain) {
    // The emptiness check comes first so validate() is never called on input
    // that is_empty() has already rejected.
    if (String.is_empty(plain) || !plain.validate())
        return "";
    return Markup.escape_text(plain);
}
@ -69,4 +135,41 @@ public string remove_html_tags(string input) {
return input;
}
/**
 * Roughly converts an HTML document to plain text.
 *
 * This goes further than stripping tags: line breaks are inserted where
 * appropriate, entities are decoded, and so on. Most of the layout is lost,
 * so the result is mainly useful as a source of tokens for searching rather
 * than something to present to the user.
 *
 * @param html the HTML source to convert
 * @param encoding the encoding name handed to the libxml2 HTML parser
 * @return the collected text, or an empty string if no document was produced
 */
public string html_to_text(string html, string encoding = "UTF-8") {
    StringBuilder buffer = new StringBuilder();

    Html.Doc *parsed = Html.Doc.read_doc(html, "", encoding,
        Html.ParserOption.RECOVER
        | Html.ParserOption.NOERROR
        | Html.ParserOption.NOWARNING
        | Html.ParserOption.NOBLANKS
        | Html.ParserOption.NONET
        | Html.ParserOption.COMPACT);
    if (parsed == null)
        return buffer.str;

    recurse_html_nodes_for_text(parsed->get_root_element(), buffer);
    // The document is held through a raw pointer, so it is freed by hand.
    delete parsed;

    return buffer.str;
}
/**
 * Appends the text found in //node//, its following siblings and all of their
 * descendants to //text//, in document order.
 *
 * The content of text nodes is appended as-is. For every element that
 * element_needs_break() flags, a newline is written before its children; if
 * the element has children, another newline is written after them. The
 * closing newline keeps the element's last word from running into whatever
 * follows it (e.g. "<p>foo</p>bar" must not yield the single token "foobar").
 *
 * @param node first node to visit, or null for nothing to do
 * @param text builder that receives the extracted text
 */
private void recurse_html_nodes_for_text(Xml.Node? node, StringBuilder text) {
    // TODO: add alt text for things that have it?
    for (unowned Xml.Node? n = node; n != null; n = n.next) {
        bool breaks = false;
        if (n.type == Xml.ElementType.TEXT_NODE)
            text.append(n.content);
        else if (n.type == Xml.ElementType.ELEMENT_NODE && element_needs_break(n.name))
            breaks = true;

        if (breaks)
            text.append("\n");

        recurse_html_nodes_for_text(n.children, text);

        // Close the break only when there was content in between; childless
        // elements such as <br> or <hr> would otherwise get two newlines.
        if (breaks && n.children != null)
            text.append("\n");
    }
}
// Reports whether an element with the given name interrupts the flow of text,
// i.e. whether the name is a member of breaking_elements.
private bool element_needs_break(string element) {
    return element in breaking_elements;
}
}