From deb0c415d0e6302ae192f7a079cbbada5f4335b5 Mon Sep 17 00:00:00 2001 From: Michael James Gratton Date: Sun, 18 Dec 2016 23:28:53 +1100 Subject: [PATCH] Fix HTML, CSS and JS leaking into conversation list preview. Bug 714317 When generating the preview, only the first 128 bytes of the first MIME part is fetched and used. If this part is text/html with a significant amount of embedded CSS, then there is a good chance the string passed to Geary.HTML::remove_html_tags() will be invalid, or be missing closing elements. Since that function uses regexes that require balanced tags to remove script and style blocks, then it was very possible that in these cases this method will miss removing these blocks. To solve this, remove_html_tags() is removed and its call sites are replaced by calls to Geary.HTML::html_to_text(), which has been tidyied up to produce more human-readable result. Add unit tests to cover new html_to_text functionality and its call sites. * src/engine/util/util-html.vala: Remove remove_html_tags(). Update html_to_text() to not just insert line breaks, but also insert spaces and alt text, and ignore tags like HEAD, SCRIPT and STYLE, as appropriate. Add an optional param to also allow skipping BLOCKQUOTE elements, which we don't want in the preview. --- src/engine/rfc822/rfc822-message-data.vala | 6 +- src/engine/rfc822/rfc822-message.vala | 4 +- src/engine/util/util-html.vala | 185 +++---- test/CMakeLists.txt | 2 + test/engine/rfc822-message-data-test.vala | 602 +++++++++++++++++++++ test/engine/util-html-test.vala | 76 +++ test/main.vala | 7 + 7 files changed, 780 insertions(+), 102 deletions(-) create mode 100644 test/engine/rfc822-message-data-test.vala create mode 100644 test/engine/util-html-test.vala diff --git a/src/engine/rfc822/rfc822-message-data.vala b/src/engine/rfc822/rfc822-message-data.vala index 0adb2528..6d80177a 100644 --- a/src/engine/rfc822/rfc822-message-data.vala +++ b/src/engine/rfc822/rfc822-message-data.vala @@ -390,14 +390,14 @@ public class Geary.RFC822.PreviewText : Geary.RFC822.Text { input_stream.write_to_stream(filter); uint8[] data = output.data; data += (uint8) '\0'; - + // Fix the preview up by removing HTML tags, redundant white space, common types of // message armor, text-based quotes, and various MIME fields. string preview_text = ""; - string original_text = is_html ? Geary.HTML.remove_html_tags((string) data) : (string) data; + string original_text = is_html ? Geary.HTML.html_to_text((string) data, false) : (string) data; string[] all_lines = original_text.split("\r\n"); bool in_header = false; // True after a header - + foreach(string line in all_lines) { if (in_header && line.has_prefix(" ") || line.has_prefix("\t")) { continue; // Skip "folded" (multi-line) headers. diff --git a/src/engine/rfc822/rfc822-message.vala b/src/engine/rfc822/rfc822-message.vala index 1d0dce10..217b445b 100644 --- a/src/engine/rfc822/rfc822-message.vala +++ b/src/engine/rfc822/rfc822-message.vala @@ -384,12 +384,12 @@ public class Geary.RFC822.Message : BaseObject { preview = get_plain_body(false, null); } catch (Error e) { try { - preview = Geary.HTML.remove_html_tags(get_html_body(null)); + preview = Geary.HTML.html_to_text(get_html_body(null), false); } catch (Error error) { debug("Could not generate message preview: %s\n and: %s", e.message, error.message); } } - + return Geary.String.safe_byte_substring((preview ?? "").chug(), Geary.Email.MAX_PREVIEW_BYTES); } diff --git a/src/engine/util/util-html.vala b/src/engine/util/util-html.vala index 4c9216d2..fdc3d1e4 100644 --- a/src/engine/util/util-html.vala +++ b/src/engine/util/util-html.vala @@ -7,7 +7,10 @@ namespace Geary.HTML { private int init_count = 0; -private Gee.HashSet? breaking_elements = null; +private Gee.HashSet? breaking_elements; +private Gee.HashSet? spacing_elements; +private Gee.HashSet? alt_text_elements; +private Gee.HashSet? ignored_elements; /** * Must be called before ''any'' call to the HTML namespace. @@ -18,58 +21,74 @@ private Gee.HashSet? breaking_elements = null; public void init() { if (init_count++ != 0) return; - - init_breaking_elements(); + + init_element_sets(); } -private void init_breaking_elements() { - // Organized from . This is a - // list of block elements and some others that get special treatment. - // NOTE: this SHOULD be a const list, but due to - // , it can't be. - string[] elements = { +private void init_element_sets() { + // Organized from , + // and some custom + // inference. + + // Block elements and some others that cause new lines to be + // inserted when converting to text. Not all block elements are + // included since some (e.g. lists) will have nested breaking + // children. + breaking_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + breaking_elements.add_all_array({ "address", "blockquote", "br", // [1] "caption", // [2] "center", - "dd", - "del", // [3] - "dir", "div", - "dl", "dt", "embed", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "img", // [1] - "ins", // [3] + "iframe", // [1] "li", "map", // [1] "menu", "noscript", // [2] "object", // [1] - "ol", "p", "pre", - "script", // [2] - "table", - "tbody", - "td", - "tfoot", - "th", - "thead", "tr", - "ul", - + // [1]: Not block elements, but still break up the text // [2]: Some of these are oddities, but I figure they should break flow - // [3]: Can be used as either block or inline; we go for broke - }; - - breaking_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); - foreach (string element in elements) - breaking_elements.add(element); + }); + + // Elements that cause spaces to be inserted afterwards when + // converting to text. + spacing_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + spacing_elements.add_all_array({ + "dt", + "dd", + "img", + "td", + "th", + }); + + // Elements that may have alt text + alt_text_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + alt_text_elements.add_all_array({ + "img", + }); + + // Elements that should not be included when converting to text + ignored_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + ignored_elements.add_all_array({ + "base", + "link", + "meta", + "head", + "script", + "style", + "template", + }); } public inline string escape_markup(string? plain) { @@ -88,89 +107,61 @@ public string preserve_whitespace(string? text) { return output; } -// Removes any text between < and >. Additionally, if input terminates in the middle of a tag, -// the tag will be removed. -// If the HTML is invalid, the original string will be returned. -public string remove_html_tags(string input) { - try { - string output = input; - - // Count the number of < and > characters. - unichar c; - uint64 less_than = 0; - uint64 greater_than = 0; - for (int i = 0; output.get_next_char (ref i, out c);) { - if (c == '<') - less_than++; - else if (c == '>') - greater_than++; - } - - if (less_than == greater_than + 1) { - output += ">"; // Append an extra > so our regex works. - greater_than++; - } - - if (less_than != greater_than) - return input; // Invalid HTML. - - // Removes script tags and everything between them. - // Based on regex here: http://stackoverflow.com/questions/116403/im-looking-for-a-regular-expression-to-remove-a-given-xhtml-tag-from-a-string - Regex script = new Regex("]*?>[\\s\\S]*?<\\/script>", RegexCompileFlags.CASELESS); - output = script.replace(output, -1, 0, ""); - - // Removes style tags and everything between them. - // Based on regex above. - Regex style = new Regex("]*?>[\\s\\S]*?<\\/style>", RegexCompileFlags.CASELESS); - output = style.replace(output, -1, 0, ""); - - // Removes remaining tags. - Regex tags = new Regex("<[^>]*>", RegexCompileFlags.CASELESS); - return tags.replace(output, -1, 0, ""); - } catch (Error e) { - debug("Error stripping HTML tags: %s", e.message); - } - - return input; -} - /** * Does a very approximate conversion from HTML to text. * - * This does more than stripping tags -- it inserts line breaks where appropriate, decodes - * entities, etc. The layout of the text is largely lost. This is primarily - * useful for pulling out tokens for searching, not for presenting to the user. + * This does more than stripping tags -- it inserts line breaks where + * appropriate, decodes entities, etc. Note the full string is parsed + * by libxml's HTML parser to create a DOM-like tree representation, + * which is then walked, so this function can be somewhat + * computationally expensive. */ -public string html_to_text(string html, string encoding = Geary.RFC822.UTF8_CHARSET) { +public string html_to_text(string html, + bool include_blockquotes = true, + string encoding = Geary.RFC822.UTF8_CHARSET) { Html.Doc *doc = Html.Doc.read_doc(html, "", encoding, Html.ParserOption.RECOVER | Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING | Html.ParserOption.NOBLANKS | Html.ParserOption.NONET | Html.ParserOption.COMPACT); - + StringBuilder text = new StringBuilder(); if (doc != null) { - recurse_html_nodes_for_text(doc->get_root_element(), text); + recurse_html_nodes_for_text(doc->get_root_element(), include_blockquotes, text); delete doc; } - + return text.str; } -private void recurse_html_nodes_for_text(Xml.Node? node, StringBuilder text) { - // TODO: add alt text for things that have it? - +private void recurse_html_nodes_for_text(Xml.Node? node, + bool include_blockquotes, + StringBuilder text) { for (unowned Xml.Node? n = node; n != null; n = n.next) { - if (n.type == Xml.ElementType.TEXT_NODE) + if (n.type == Xml.ElementType.TEXT_NODE) { text.append(n.content); - else if (n.type == Xml.ElementType.ELEMENT_NODE && element_needs_break(n.name)) - text.append("\n"); - - recurse_html_nodes_for_text(n.children, text); + } else if (n.type == Xml.ElementType.ELEMENT_NODE) { + string name = n.name; + if (include_blockquotes || name != "blockquote") { + if (name in alt_text_elements) { + string? alt_text = node.get_prop("alt"); + if (alt_text != null) { + text.append(alt_text); + } + } + + if (!(name in ignored_elements)) { + recurse_html_nodes_for_text(n.children, include_blockquotes, text); + } + + if (name in spacing_elements) { + text.append(" "); + } + + if (name in breaking_elements) { + text.append("\n"); + } + } + } } } -// Determines if the named element should break the flow of text. -private bool element_needs_break(string element) { - return breaking_elements.contains(element); -} - } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 40d227aa..7b39df73 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,8 @@ set(TEST_SRC testcase.vala # Taken as-is from libgee, courtesy Julien Peeters engine/rfc822-mailbox-address-test.vala + engine/rfc822-message-data-test.vala + engine/util-html-test.vala ) # Vala diff --git a/test/engine/rfc822-message-data-test.vala b/test/engine/rfc822-message-data-test.vala new file mode 100644 index 00000000..0c4ce5ab --- /dev/null +++ b/test/engine/rfc822-message-data-test.vala @@ -0,0 +1,602 @@ +/* + * Copyright 2016 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +class Geary.RFC822.MessageDataTest : Gee.TestCase { + + public MessageDataTest() { + base("Geary.RFC822.MessageDataTest"); + add_test("PreviewText.with_header", preview_text_with_header); + } + + public void preview_text_with_header() { + string part_headers = "Content-Type: text/html; charset=utf-8\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n"; + + PreviewText preview1 = new PreviewText.with_header( + new Geary.Memory.StringBuffer(HTML_BODY1_ENCODED), + new Geary.Memory.StringBuffer(part_headers) + ); + assert(preview1.buffer.to_string() == HTML_BODY1_EXPECTED); + + PreviewText preview2 = new PreviewText.with_header( + new Geary.Memory.StringBuffer(HTML_BODY2_ENCODED), + new Geary.Memory.StringBuffer(part_headers) + ); + assert(preview2.buffer.to_string() == HTML_BODY2_EXPECTED); + } + + public static string HTML_BODY1_ENCODED = """ + + + + +
+


Hi Kenneth,

We xxxxx xxxx xx xxx xxx xx xxxx x xxxxx= +xxx xxxxxxxx.=C2=A0



Thank you,

XXXXX= +X XXXXXX

You can reply directly to this message or click the fol= +lowing link:
https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623ab= +a

You can change your email preferences at:
https://app.foobar.com/xxxxxxxxxxx

"""; + + public static string HTML_BODY1_EXPECTED = "Hi Kenneth, We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx.  Thank you, XXXXXX XXXXXX You can reply directly to this message or click the following link: https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba You can change your email preferences at: https://app.foobar.com/xxxxxxxxxxx"; + + public static string HTML_BODY2_ENCODED = """ + + + + + + +
+
+ + + +
+ + + + + + +
+Buy It Now from US $1,750.00 to US $5,950.00. +
+
+ + + +
+ + + +
3D""<= +/td> +
+
+ + + +
+ + + + + + +
+

+Daccordi, Worldwide: 2 new matches today +

+
+
+ + + +
+ + + + + + +
+
+ + + + +
+ + + + + + + + + + + + + +
+ + + + + + +
+ + + + + +
+
+

+ +Daccordi 50th anniversary edition with... + +

+
+Buy it now: US $5,950.00 +
+100% positive feedback +
+
+ + + + +
+ + + + + + + + + + + + + +
+ + + + + + +
+ + + + + +
+
+

+ +Daccordi Griffe Campagnolo Croce D'Aune... + +

+
+Buy it now: US $1,750.00 +
+100% positive feedback +
+
+
+
+
+ + + +
+ + + + + + + +
+ + + + + + + + + +
+ + + + +
+ View all results +
+
3D""
3D""
+ + + + + + + + + +
+ + + + +
+Refine this search +
+
3D""
3D""
+
+ + + + +
+ Disable emails for this search<= +/a> +
+
+
 
+ + + +
+ + + + +
+
+
+ + +"""; + + public static string HTML_BODY2_EXPECTED = "Buy It Now from US $1,750.00 to US $5,950.00. eBay Daccordi, Worldwide: 2 new matches today Daccordi 50th anniversary edition with... Buy it now: US $5,950.00 100% positive feedback Daccordi Griffe Campagnolo Croce D'Aune... Buy it now: US $1,750.00 100% positive feedback View all results Refine this search Disable emails for this search   Email reference id: [#d9f42b5e860b4eabb98195c2888cba9e#] We don't check this mailbox, so please don't reply to this message. If you have a question, go to Help & Contact. ©2016 eBay Inc., eBay International AG Helvetiastrasse 15/17 - P.O. Box 133, 3000 Bern 6, Switzerland"; +} diff --git a/test/engine/util-html-test.vala b/test/engine/util-html-test.vala new file mode 100644 index 00000000..8bf060fd --- /dev/null +++ b/test/engine/util-html-test.vala @@ -0,0 +1,76 @@ +/* + * Copyright 2016 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +class Geary.HTML.UtilTest : Gee.TestCase { + + public UtilTest() { + base("Geary.HTML.Util"); + add_test("remove_html_tags", remove_html_tags); + } + + public void remove_html_tags() { + string blockquote_body = """
hello

there

"""; + + string style_complete = """"""; + + string style_truncated = """ + + + + +
+


Hi Kenneth,

We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx. +



Thank you,

XXXXX +X XXXXXX

You can reply directly to this message or click the following link:
https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba

You can change your email preferences at:
https://app.foobar.com/xxxxxxxxxxx

+"""; + + public static string HTML_BODY_COMPLETE_EXPECTED = """ + +Hi Kenneth, + + We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx. + + + + +Thank you, + +XXXXX +X XXXXXX + +You can reply directly to this message or click the following link: +https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba + +You can change your email preferences at: +https://app.foobar.com/xxxxxxxxxxx + +"""; + +} diff --git a/test/main.vala b/test/main.vala index 62ebdf21..e6228ccf 100644 --- a/test/main.vala +++ b/test/main.vala @@ -7,9 +7,16 @@ int main(string[] args) { Test.init(ref args); + + Geary.RFC822.init(); + Geary.HTML.init(); + TestSuite root = TestSuite.get_root(); + // Engine tests + root.add_suite(new Geary.HTML.UtilTest().get_suite()); root.add_suite(new Geary.RFC822.MailboxAddressTest().get_suite()); + root.add_suite(new Geary.RFC822.MessageDataTest().get_suite()); return Test.run(); }