diff --git a/src/engine/rfc822/rfc822-message-data.vala b/src/engine/rfc822/rfc822-message-data.vala index 0adb2528..6d80177a 100644 --- a/src/engine/rfc822/rfc822-message-data.vala +++ b/src/engine/rfc822/rfc822-message-data.vala @@ -390,14 +390,14 @@ public class Geary.RFC822.PreviewText : Geary.RFC822.Text { input_stream.write_to_stream(filter); uint8[] data = output.data; data += (uint8) '\0'; - + // Fix the preview up by removing HTML tags, redundant white space, common types of // message armor, text-based quotes, and various MIME fields. string preview_text = ""; - string original_text = is_html ? Geary.HTML.remove_html_tags((string) data) : (string) data; + string original_text = is_html ? Geary.HTML.html_to_text((string) data, false) : (string) data; string[] all_lines = original_text.split("\r\n"); bool in_header = false; // True after a header - + foreach(string line in all_lines) { if (in_header && line.has_prefix(" ") || line.has_prefix("\t")) { continue; // Skip "folded" (multi-line) headers. diff --git a/src/engine/rfc822/rfc822-message.vala b/src/engine/rfc822/rfc822-message.vala index 1d0dce10..217b445b 100644 --- a/src/engine/rfc822/rfc822-message.vala +++ b/src/engine/rfc822/rfc822-message.vala @@ -384,12 +384,12 @@ public class Geary.RFC822.Message : BaseObject { preview = get_plain_body(false, null); } catch (Error e) { try { - preview = Geary.HTML.remove_html_tags(get_html_body(null)); + preview = Geary.HTML.html_to_text(get_html_body(null), false); } catch (Error error) { debug("Could not generate message preview: %s\n and: %s", e.message, error.message); } } - + return Geary.String.safe_byte_substring((preview ?? "").chug(), Geary.Email.MAX_PREVIEW_BYTES); } diff --git a/src/engine/util/util-html.vala b/src/engine/util/util-html.vala index 4c9216d2..fdc3d1e4 100644 --- a/src/engine/util/util-html.vala +++ b/src/engine/util/util-html.vala @@ -7,7 +7,10 @@ namespace Geary.HTML { private int init_count = 0; -private Gee.HashSet? breaking_elements = null; +private Gee.HashSet? breaking_elements; +private Gee.HashSet? spacing_elements; +private Gee.HashSet? alt_text_elements; +private Gee.HashSet? ignored_elements; /** * Must be called before ''any'' call to the HTML namespace. @@ -18,58 +21,74 @@ private Gee.HashSet? breaking_elements = null; public void init() { if (init_count++ != 0) return; - - init_breaking_elements(); + + init_element_sets(); } -private void init_breaking_elements() { - // Organized from . This is a - // list of block elements and some others that get special treatment. - // NOTE: this SHOULD be a const list, but due to - // , it can't be. - string[] elements = { +private void init_element_sets() { + // Organized from , + // and some custom + // inference. + + // Block elements and some others that cause new lines to be + // inserted when converting to text. Not all block elements are + // included since some (e.g. lists) will have nested breaking + // children. + breaking_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + breaking_elements.add_all_array({ "address", "blockquote", "br", // [1] "caption", // [2] "center", - "dd", - "del", // [3] - "dir", "div", - "dl", "dt", "embed", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "img", // [1] - "ins", // [3] + "iframe", // [1] "li", "map", // [1] "menu", "noscript", // [2] "object", // [1] - "ol", "p", "pre", - "script", // [2] - "table", - "tbody", - "td", - "tfoot", - "th", - "thead", "tr", - "ul", - + // [1]: Not block elements, but still break up the text // [2]: Some of these are oddities, but I figure they should break flow - // [3]: Can be used as either block or inline; we go for broke - }; - - breaking_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); - foreach (string element in elements) - breaking_elements.add(element); + }); + + // Elements that cause spaces to be inserted afterwards when + // converting to text. + spacing_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + spacing_elements.add_all_array({ + "dt", + "dd", + "img", + "td", + "th", + }); + + // Elements that may have alt text + alt_text_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + alt_text_elements.add_all_array({ + "img", + }); + + // Elements that should not be included when converting to text + ignored_elements = new Gee.HashSet(Ascii.stri_hash, Ascii.stri_equal); + ignored_elements.add_all_array({ + "base", + "link", + "meta", + "head", + "script", + "style", + "template", + }); } public inline string escape_markup(string? plain) { @@ -88,89 +107,61 @@ public string preserve_whitespace(string? text) { return output; } -// Removes any text between < and >. Additionally, if input terminates in the middle of a tag, -// the tag will be removed. -// If the HTML is invalid, the original string will be returned. -public string remove_html_tags(string input) { - try { - string output = input; - - // Count the number of < and > characters. - unichar c; - uint64 less_than = 0; - uint64 greater_than = 0; - for (int i = 0; output.get_next_char (ref i, out c);) { - if (c == '<') - less_than++; - else if (c == '>') - greater_than++; - } - - if (less_than == greater_than + 1) { - output += ">"; // Append an extra > so our regex works. - greater_than++; - } - - if (less_than != greater_than) - return input; // Invalid HTML. - - // Removes script tags and everything between them. - // Based on regex here: http://stackoverflow.com/questions/116403/im-looking-for-a-regular-expression-to-remove-a-given-xhtml-tag-from-a-string - Regex script = new Regex("]*?>[\\s\\S]*?<\\/script>", RegexCompileFlags.CASELESS); - output = script.replace(output, -1, 0, ""); - - // Removes style tags and everything between them. - // Based on regex above. - Regex style = new Regex("]*?>[\\s\\S]*?<\\/style>", RegexCompileFlags.CASELESS); - output = style.replace(output, -1, 0, ""); - - // Removes remaining tags. - Regex tags = new Regex("<[^>]*>", RegexCompileFlags.CASELESS); - return tags.replace(output, -1, 0, ""); - } catch (Error e) { - debug("Error stripping HTML tags: %s", e.message); - } - - return input; -} - /** * Does a very approximate conversion from HTML to text. * - * This does more than stripping tags -- it inserts line breaks where appropriate, decodes - * entities, etc. The layout of the text is largely lost. This is primarily - * useful for pulling out tokens for searching, not for presenting to the user. + * This does more than stripping tags -- it inserts line breaks where + * appropriate, decodes entities, etc. Note the full string is parsed + * by libxml's HTML parser to create a DOM-like tree representation, + * which is then walked, so this function can be somewhat + * computationally expensive. */ -public string html_to_text(string html, string encoding = Geary.RFC822.UTF8_CHARSET) { +public string html_to_text(string html, + bool include_blockquotes = true, + string encoding = Geary.RFC822.UTF8_CHARSET) { Html.Doc *doc = Html.Doc.read_doc(html, "", encoding, Html.ParserOption.RECOVER | Html.ParserOption.NOERROR | Html.ParserOption.NOWARNING | Html.ParserOption.NOBLANKS | Html.ParserOption.NONET | Html.ParserOption.COMPACT); - + StringBuilder text = new StringBuilder(); if (doc != null) { - recurse_html_nodes_for_text(doc->get_root_element(), text); + recurse_html_nodes_for_text(doc->get_root_element(), include_blockquotes, text); delete doc; } - + return text.str; } -private void recurse_html_nodes_for_text(Xml.Node? node, StringBuilder text) { - // TODO: add alt text for things that have it? - +private void recurse_html_nodes_for_text(Xml.Node? node, + bool include_blockquotes, + StringBuilder text) { for (unowned Xml.Node? n = node; n != null; n = n.next) { - if (n.type == Xml.ElementType.TEXT_NODE) + if (n.type == Xml.ElementType.TEXT_NODE) { text.append(n.content); - else if (n.type == Xml.ElementType.ELEMENT_NODE && element_needs_break(n.name)) - text.append("\n"); - - recurse_html_nodes_for_text(n.children, text); + } else if (n.type == Xml.ElementType.ELEMENT_NODE) { + string name = n.name; + if (include_blockquotes || name != "blockquote") { + if (name in alt_text_elements) { + string? alt_text = node.get_prop("alt"); + if (alt_text != null) { + text.append(alt_text); + } + } + + if (!(name in ignored_elements)) { + recurse_html_nodes_for_text(n.children, include_blockquotes, text); + } + + if (name in spacing_elements) { + text.append(" "); + } + + if (name in breaking_elements) { + text.append("\n"); + } + } + } } } -// Determines if the named element should break the flow of text. -private bool element_needs_break(string element) { - return breaking_elements.contains(element); -} - } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 40d227aa..7b39df73 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,8 @@ set(TEST_SRC testcase.vala # Taken as-is from libgee, courtesy Julien Peeters engine/rfc822-mailbox-address-test.vala + engine/rfc822-message-data-test.vala + engine/util-html-test.vala ) # Vala diff --git a/test/engine/rfc822-message-data-test.vala b/test/engine/rfc822-message-data-test.vala new file mode 100644 index 00000000..0c4ce5ab --- /dev/null +++ b/test/engine/rfc822-message-data-test.vala @@ -0,0 +1,602 @@ +/* + * Copyright 2016 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +class Geary.RFC822.MessageDataTest : Gee.TestCase { + + public MessageDataTest() { + base("Geary.RFC822.MessageDataTest"); + add_test("PreviewText.with_header", preview_text_with_header); + } + + public void preview_text_with_header() { + string part_headers = "Content-Type: text/html; charset=utf-8\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n"; + + PreviewText preview1 = new PreviewText.with_header( + new Geary.Memory.StringBuffer(HTML_BODY1_ENCODED), + new Geary.Memory.StringBuffer(part_headers) + ); + assert(preview1.buffer.to_string() == HTML_BODY1_EXPECTED); + + PreviewText preview2 = new PreviewText.with_header( + new Geary.Memory.StringBuffer(HTML_BODY2_ENCODED), + new Geary.Memory.StringBuffer(part_headers) + ); + assert(preview2.buffer.to_string() == HTML_BODY2_EXPECTED); + } + + public static string HTML_BODY1_ENCODED = """ + + + + +
+


Hi Kenneth,

We xxxxx xxxx xx xxx xxx xx xxxx x xxxxx= +xxx xxxxxxxx.=C2=A0



Thank you,

XXXXX= +X XXXXXX

You can reply directly to this message or click the fol= +lowing link:
https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623ab= +a

You can change your email preferences at:
https://app.foobar.com/xxxxxxxxxxx

"""; + + public static string HTML_BODY1_EXPECTED = "Hi Kenneth, We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx.  Thank you, XXXXXX XXXXXX You can reply directly to this message or click the following link: https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba You can change your email preferences at: https://app.foobar.com/xxxxxxxxxxx"; + + public static string HTML_BODY2_ENCODED = """ + + + + + + +
+
+ + + +
+ + + + + + +
+Buy It Now from US $1,750.00 to US $5,950.00. +
+
+ + + +
+ + + +
3D""<= +/td> +
+
+ + + +
+ + + + + + +
+

+Daccordi, Worldwide: 2 new matches today +

+
+
+ + + +
+ + + + + + +
+
+ + + + +
+ + + + + + + + + + + + + +
+ + + + + + +
+ + + + + +
+
+

+ +Daccordi 50th anniversary edition with... + +

+
+Buy it now: US $5,950.00 +
+100% positive feedback +
+
+ + + + +
+ + + + + + + + + + + + + +
+ + + + + + +
+ + + + + +
+
+

+ +Daccordi Griffe Campagnolo Croce D'Aune... + +

+
+Buy it now: US $1,750.00 +
+100% positive feedback +
+
+
+
+
+ + + +
+ + + + + + + +
+ + + + + + + + + +
+ + + + +
+ View all results +
+
3D""
3D""
+ + + + + + + + + +
+ + + + +
+Refine this search +
+
3D""
3D""
+
+ + + + +
+ Disable emails for this search<= +/a> +
+
+
 
+ + + +
+ + + + +
+
+
+ + +"""; + + public static string HTML_BODY2_EXPECTED = "Buy It Now from US $1,750.00 to US $5,950.00. eBay Daccordi, Worldwide: 2 new matches today Daccordi 50th anniversary edition with... Buy it now: US $5,950.00 100% positive feedback Daccordi Griffe Campagnolo Croce D'Aune... Buy it now: US $1,750.00 100% positive feedback View all results Refine this search Disable emails for this search   Email reference id: [#d9f42b5e860b4eabb98195c2888cba9e#] We don't check this mailbox, so please don't reply to this message. If you have a question, go to Help & Contact. ©2016 eBay Inc., eBay International AG Helvetiastrasse 15/17 - P.O. Box 133, 3000 Bern 6, Switzerland"; +} diff --git a/test/engine/util-html-test.vala b/test/engine/util-html-test.vala new file mode 100644 index 00000000..8bf060fd --- /dev/null +++ b/test/engine/util-html-test.vala @@ -0,0 +1,76 @@ +/* + * Copyright 2016 Michael Gratton + * + * This software is licensed under the GNU Lesser General Public License + * (version 2.1 or later). See the COPYING file in this distribution. + */ + +class Geary.HTML.UtilTest : Gee.TestCase { + + public UtilTest() { + base("Geary.HTML.Util"); + add_test("remove_html_tags", remove_html_tags); + } + + public void remove_html_tags() { + string blockquote_body = """
hello

there

"""; + + string style_complete = """"""; + + string style_truncated = """ + + + + +
+


Hi Kenneth,

We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx. +



Thank you,

XXXXX +X XXXXXX

You can reply directly to this message or click the following link:
https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba

You can change your email preferences at:
https://app.foobar.com/xxxxxxxxxxx

+"""; + + public static string HTML_BODY_COMPLETE_EXPECTED = """ + +Hi Kenneth, + + We xxxxx xxxx xx xxx xxx xx xxxx x xxxxxxxx xxxxxxxx. + + + + +Thank you, + +XXXXX +X XXXXXX + +You can reply directly to this message or click the following link: +https://app.foobar.com/xxxxxxxxxxxxxxxx1641966deff6c48623aba + +You can change your email preferences at: +https://app.foobar.com/xxxxxxxxxxx + +"""; + +} diff --git a/test/main.vala b/test/main.vala index 62ebdf21..e6228ccf 100644 --- a/test/main.vala +++ b/test/main.vala @@ -7,9 +7,16 @@ int main(string[] args) { Test.init(ref args); + + Geary.RFC822.init(); + Geary.HTML.init(); + TestSuite root = TestSuite.get_root(); + // Engine tests + root.add_suite(new Geary.HTML.UtilTest().get_suite()); root.add_suite(new Geary.RFC822.MailboxAddressTest().get_suite()); + root.add_suite(new Geary.RFC822.MessageDataTest().get_suite()); return Test.run(); }