Update email body normalization; fix #6837

This adds a basic, imperfect HTML -> text algorithm using libxml2. We use this to turn HTML email bodies into a searchable text corpus.
2013-05-21 17:07:16 -07:00 · 2013-05-21 17:07:16 -07:00 · 879f3a6b5d
commit 879f3a6b5d
parent 597a6fc9f8
5 changed files with 111 additions and 9 deletions
--- a/debian/control
+++ b/debian/control
@ -10,6 +10,7 @@ Build-Depends: debhelper (>= 8),
 libnotify-dev (>=0.7.5),
 libcanberra-dev (>= 0.28),
 libwebkitgtk-3.0-dev (>= 1.10.0),
+ libxml2-dev (>= 2.7.8),
 libsecret-1-dev (>= 0.11),
 libgmime-2.6-dev (>= 2.6.0),
 valac-0.18 (>= 0.17.4),
@ -33,7 +34,7 @@ Depends: ${shlibs:Depends}, ${misc:Depends},
 libnotify4 (>= 0.7.5),
 libcanberra0 (>= 0.28),
 libwebkitgtk-3.0-0 (>= 1.10.0),
- libxml2 (>= 2.6.32),
+ libxml2 (>= 2.7.8),
 libsqlite3-0 (>= 3.7.4),
 libsecret-1-0 (>= 0.11),
 libmessaging-menu0 (>= 12.10.2),
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -362,11 +362,12 @@ pkg_check_modules(DEPS REQUIRED
    gmime-2.6>=2.6.0
    libsecret-1>=0.11
    webkitgtk-3.0>=1.10.0
+    libxml-2.0>=2.7.8
    ${EXTRA_CLIENT_PKG_CONFIG}
 )

 set(ENGINE_PACKAGES
-    glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3
+    glib-2.0 gee-0.8 gio-2.0 gmime-2.6 unique-3.0 posix sqlite3 libxml-2.0
 )

 set(CLIENT_PACKAGES
--- a/src/engine/api/geary-engine.vala
+++ b/src/engine/api/geary-engine.vala
@ -93,6 +93,7 @@ public class Geary.Engine : BaseObject {
        RFC822.init();
        ImapEngine.init();
        Imap.init();
+        HTML.init();
    }
    
    /**
--- a/src/engine/rfc822/rfc822-message.vala
+++ b/src/engine/rfc822/rfc822-message.vala
@ -465,7 +465,7 @@ public class Geary.RFC822.Message : BaseObject {
            html = true;
        } catch (Error e) {
            try {
-                body = get_text_body();
+                body = get_text_body(false);
            } catch (Error e) {
                // Ignore.
            }
@ -475,12 +475,8 @@ public class Geary.RFC822.Message : BaseObject {
        
        // TODO: add bodies of attached emails.
        
-        if (html) {
-            // FIXME: this is inadequate.  For example, <br> needs to be turned
-            // into at least one space character, not just omitted.  Also, we
-            // should also replace entities with the characters they represent.
-            body = Geary.HTML.remove_html_tags(body);
-        }
+        if (html)
+            body = Geary.HTML.html_to_text(body);
        
        return body;
    }
--- a/src/engine/util/util-html.vala
+++ b/src/engine/util/util-html.vala
@ -6,6 +6,72 @@

 namespace Geary.HTML {

+// Number of times init() has been called so far.
+private int init_count = 0;
+// Names of the elements that break the flow of text; null until
+// init_breaking_elements() has run.
+private Gee.HashSet<string>? breaking_elements = null;
+
+/**
+ * Prepares the HTML namespace; call this before ''any'' other HTML function.
+ *
+ * The Engine does this when it's opened; call it directly to use these
+ * functions earlier.  Only the first call does any work, every later call
+ * just bumps the call counter.
+ */
+public void init() {
+    bool already_initialized = (init_count != 0);
+    init_count++;
+    
+    if (!already_initialized)
+        init_breaking_elements();
+}
+
+// Builds the set of element names that html_to_text() treats as breaking the
+// flow of text and stores it in breaking_elements.
+private void init_breaking_elements() {
+    // Names collected from <https://en.wikipedia.org/wiki/HTML_element>,
+    // grouped by the reason each one is in the set.
+    // NOTE: this SHOULD be a const list, but due to
+    // <https://bugzilla.gnome.org/show_bug.cgi?id=646970>, it can't be.
+    string[] names = {
+        // Block elements.
+        "address", "blockquote", "center", "dd", "dir", "div", "dl", "dt",
+        "embed", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "li", "menu",
+        "ol", "p", "pre", "table", "tbody", "td", "tfoot", "th", "thead",
+        "tr", "ul",
+        
+        // Not block elements, but still break up the text.
+        "br", "img", "map", "object",
+        
+        // Oddities that should nonetheless break the flow.
+        "caption", "noscript", "script",
+        
+        // Usable as either block or inline; we go for broke.
+        "del", "ins"
+    };
+    
+    // Fill a local set first and publish it once it is complete.  The hash
+    // and equality helpers are presumably case-insensitive -- confirm in
+    // Geary.String.
+    var lookup = new Gee.HashSet<string>(String.stri_hash, String.stri_equal);
+    foreach (string name in names)
+        lookup.add(name);
+    
+    breaking_elements = lookup;
+}
+
 public inline string escape_markup(string? plain) {
    return (!String.is_empty(plain) && plain.validate()) ? Markup.escape_text(plain) : "";
 }
@ -69,4 +135,41 @@ public string remove_html_tags(string input) {
    return input;
 }

+/**
+ * Converts HTML to plain text, very approximately.
+ *
+ * This is more than tag stripping: line breaks are inserted where appropriate
+ * and entities are decoded, but the layout of the text is largely lost.  It is
+ * meant for pulling out tokens for searching, not for presenting to the user.
+ *
+ * @param html the HTML markup to convert
+ * @param encoding character encoding handed to the parser for //html//
+ * @return the extracted text; empty when no document could be parsed
+ */
+public string html_to_text(string html, string encoding = "UTF-8") {
+    int options = Html.ParserOption.RECOVER | Html.ParserOption.NOERROR |
+        Html.ParserOption.NOWARNING | Html.ParserOption.NOBLANKS |
+        Html.ParserOption.NONET | Html.ParserOption.COMPACT;
+    
+    Html.Doc *doc = Html.Doc.read_doc(html, "", encoding, options);
+    if (doc == null)
+        return "";
+    
+    StringBuilder text = new StringBuilder();
+    recurse_html_nodes_for_text(doc->get_root_element(), text);
+    
+    // The document is held through a raw pointer, so free it by hand.
+    delete doc;
+    
+    return text.str;
+}
+
+// Appends the text found under node and all of its following siblings to
+// text.  The sibling chain is walked in a loop and each node's children are
+// visited recursively.  Text node content is appended as is; every element
+// for which element_needs_break() is true gets a newline both before and
+// after its subtree, so words on either side of it are never fused together.
+//
+// node: first node of the sibling chain to walk, or null for an empty chain.
+// text: buffer that receives the extracted text.
+private void recurse_html_nodes_for_text(Xml.Node? node, StringBuilder text) {
+    // TODO: add alt text for things that have it?
+    
+    for (unowned Xml.Node? n = node; n != null; n = n.next) {
+        bool breaks = n.type == Xml.ElementType.ELEMENT_NODE && element_needs_break(n.name);
+        
+        if (n.type == Xml.ElementType.TEXT_NODE) {
+            // StringBuilder.append() does not accept null, so skip text
+            // nodes that carry no content.
+            if (n.content != null)
+                text.append(n.content);
+        } else if (breaks) {
+            text.append("\n");
+        }
+        
+        recurse_html_nodes_for_text(n.children, text);
+        
+        // Break after the element as well; otherwise text that directly
+        // follows it (e.g. "<div>foo</div>bar") is glued to its last word
+        // and yields the bogus search token "foobar".
+        if (breaks)
+            text.append("\n");
+    }
+}
+
+// Tells whether the named element is one of the breaking_elements, i.e.
+// whether it should break the flow of text.  init() must have run first so
+// that the set exists.
+private bool element_needs_break(string element) {
+    return element in breaking_elements;
+}
+
 }