Merge pull request #383 from openzim/better_link_extraction

Decoding of HTML entities in links
openzim · Nov 15, 2023 · d406de4 · d406de4
2 parents b8a0a4c + a9d0e1c
commit d406de4
Show file tree

Hide file tree

Showing 3 changed files with 195 additions and 25 deletions.
diff --git a/src/tools.cpp b/src/tools.cpp
@@ -276,6 +276,54 @@ void stripTitleInvalidChars(std::string& str)
   replaceStringInPlace(str, "\u202C", "");
 }
 
+namespace // unnamed namespace: getHtmlEntity() is a helper local to this translation unit
+{
+
+const char* getHtmlEntity(const std::string& core) // Maps an entity name (the text between & and ;) to its replacement text.
+{ // Returns nullptr when the name is not one of the five entries in the table below.
+  static const std::map<std::string, const char*> t = { // built once; keys are compared exactly, so the lookup is case-sensitive
+    { "amp",  "&"  },
+    { "apos", "'"  },
+    { "quot", "\"" },
+    { "lt",   "<"  },
+    { "gt",   ">"  },
+  };
+
+  const auto it = t.find(core);
+  return it != t.end() ? it->second : nullptr; // nullptr tells the caller that the name is unknown
+}
+
+} // unnamed namespace
+
+std::string decodeHtmlEntities(const std::string& str) // Returns a copy of str in which the named entities known to getHtmlEntity() are replaced by their text; everything else is copied verbatim.
+{
+  const char* p = str.c_str(); // scan cursor; the loop below stops at the first NUL byte, so input past an embedded NUL is not processed
+  std::string result;
+  const char* start = nullptr; // points at the & of a pending entity candidate, or nullptr while outside one
+  for ( ; *p ; ++p ) {
+    if ( *p == '&' ) {
+      if ( start ) { // another & arrived before any ; so the previous candidate was not an entity
+        result.insert(result.end(), start, p); // emit the abandoned candidate unchanged
+      }
+      start = p; // begin a new candidate at this &
+    } else if ( !start ) {
+      result.push_back(*p); // ordinary character outside a candidate: copy it through
+    } else if ( *p == ';' ) { // candidate terminated; the characters since start were held back, not yet emitted
+      const char* d = getHtmlEntity(std::string(start+1, p)); // the name is the text strictly between & and ;
+      if ( d ) {
+        result += d; // known entity: emit its replacement (the output is not rescanned, so there is no double decoding)
+      } else {
+        result.insert(result.end(), start, p+1); // unknown name: keep the original text including & and ;
+      }
+      start = nullptr; // back to plain-text mode
+    }
+  }
+  if ( start ) { // input ended inside a candidate that never reached a ;
+    result.insert(result.end(), start, p); // emit the unterminated tail unchanged
+  }
+  return result;
+}
+
 std::vector<html_link> generic_getLinks(const std::string& page)
 {
     const char* p = page.c_str();
@@ -310,7 +358,7 @@ std::vector<html_link> generic_getLinks(const std::string& page)
         while(*p != delimiter)
             p++;
         const std::string link(linkStart, p);
-        links.push_back(html_link(attr, link));
+        links.push_back(html_link(attr, decodeHtmlEntities(link)));
         p += 1;
     }
     return links;
@@ -356,7 +404,6 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
     std::string output;
     output.reserve(baseUrl.size() + input.size() + 1);
 
-    bool in_query = false;
     bool check_rel = false;
     const char* p = input.c_str();
     if ( *(p) == '/') {
@@ -373,7 +420,7 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
     //URL Decoding.
     while (*p)
     {
-        if ( !in_query && check_rel ) {
+        if ( check_rel ) {
             if (strncmp(p, "../", 3) == 0) {
                 // We must go "up"
                 // Remove the '/' at the end of output.
@@ -394,9 +441,13 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
                 continue;
             }
         }
-        if ( *p == '#' || *p == '?')
-            // This is a beginning of the #anchor inside a page. No need to decode more
+
+        if ( *p == '#' || *p == '?') {
+            // For our purposes we can safely discard the query and/or fragment
+            // components of the URL
             break;
+        }
+
         if ( *p == '%')
         {
             char ch;
@@ -405,10 +456,7 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
             p += 3;
             continue;
         }
-        if ( *p == '?' ) {
-            // We are in the query, so don't try to interprete '/' as path separator
-            in_query = true;
-        }
+
         if ( *p == '/') {
             check_rel = true;
             if (output.empty()) {

diff --git a/src/tools.h b/src/tools.h
@@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base);
 //Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final.
 int adler32(const std::string& buf);
 
+std::string decodeHtmlEntities(const std::string& str);
+
 //Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it.
 //Converts the %20 to space.Essential for comparing URLs.
 std::string normalize_link(const std::string& input, const std::string& baseUrl);

diff --git a/test/tools-test.cpp b/test/tools-test.cpp
@@ -247,6 +247,9 @@ TEST(tools, normalize_link)
 
     ASSERT_EQ(normalize_link("a", ""), "a");
     ASSERT_EQ(normalize_link("./a", ""), "a");
+
+    // URI-decoding is performed
+    ASSERT_EQ(normalize_link("/%41%62c", "/"), "Abc");
 }
 
 TEST(tools, addler32)
@@ -257,32 +260,149 @@ TEST(tools, addler32)
     ASSERT_EQ(adler32(""), 1);
 }
 
+TEST(tools, decodeHtmlEntities) // Pins down the current behaviour of decodeHtmlEntities(), including its known limitations
+{
+    ASSERT_EQ(decodeHtmlEntities(""),   "");
+
+    // Supported HTML character references
+    ASSERT_EQ(decodeHtmlEntities("&amp;"),  "&");
+    ASSERT_EQ(decodeHtmlEntities("&apos;"), "'");
+    ASSERT_EQ(decodeHtmlEntities("&quot;"), "\"");
+    ASSERT_EQ(decodeHtmlEntities("&lt;"),   "<");
+    ASSERT_EQ(decodeHtmlEntities("&gt;"),   ">");
+
+    // All other HTML character references
+    // (https://html.spec.whatwg.org/multipage/syntax.html#character-references)
+    // are NOT currently supported
+    ASSERT_EQ(decodeHtmlEntities("&nbsp;"), "&nbsp;");
+
+    // Capitalized versions of supported ones do NOT work
+    ASSERT_EQ(decodeHtmlEntities("&AMP;"), "&AMP;");
+    ASSERT_EQ(decodeHtmlEntities("&aMP;"), "&aMP;");
+
+    // HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded
+    ASSERT_EQ(decodeHtmlEntities("&#65;"),  "&#65;" ); // should be "A"
+    ASSERT_EQ(decodeHtmlEntities("&#x41;"), "&#x41;"); // should be "A"
+
+    // Handling of "incomplete" entity
+    ASSERT_EQ(decodeHtmlEntities("&amp"), "&amp");
+
+    // No double decoding
+    ASSERT_EQ(decodeHtmlEntities("&amp;lt;"), "&lt;");
+
+    ASSERT_EQ(decodeHtmlEntities("&lt;&gt;"), "<>"); // two entities back to back
+
+    ASSERT_EQ(decodeHtmlEntities("1&lt;2"),   "1<2"); // entity surrounded by plain text
+
+    ASSERT_EQ(decodeHtmlEntities("3&5&gt;3/5"), "3&5>3/5"); // a bare & followed by another & before any ; is kept as is
+
+    ASSERT_EQ(
+        decodeHtmlEntities("Q&amp;A stands for &quot;Questions and answers&quot;"),
+        "Q&A stands for \"Questions and answers\""
+    );
+}
+
+std::string links2Str(const std::vector<html_link>& links) // Test helper: renders links as text, one "{ attribute, link }" entry per line, so a whole list can be checked with a single string comparison
+{
+    std::ostringstream oss;
+    const char* sep = ""; // empty before the first entry, "\n" afterwards, so the result has no leading or trailing newline
+    for ( const auto& l : links ) {
+        oss << sep << "{ " << l.attribute << ", " << l.link << " }";
+        sep = "\n";
+    }
+    return oss.str(); // empty string for an empty list
+}
+
+#define EXPECT_LINKS(html, expectedStr) \
+        ASSERT_EQ(links2Str(generic_getLinks(html)), expectedStr) // compares the links2Str() rendering of the links extracted from html with expectedStr; expands to ASSERT_EQ, not EXPECT_EQ, despite the macro name
+
 TEST(tools, getLinks)
 {
-    auto v = generic_getLinks("");
+    EXPECT_LINKS(
+      "",
+      ""
+    );
+
+    EXPECT_LINKS(
+      R"(<link href="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
+      "{ href, https://fonts.io/css?family=OpenSans }"
+    );
 
-    ASSERT_TRUE(v.empty());
+    EXPECT_LINKS(
+      R"(<link href='https://fonts.io/css?family=OpenSans' rel="stylesheet">)",
+      "{ href, https://fonts.io/css?family=OpenSans }"
+    );
 
-    std::string page1 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
-    auto v1 = generic_getLinks(page1);
+    EXPECT_LINKS(
+      R"(<link src="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
+      "{ src, https://fonts.io/css?family=OpenSans }"
+    );
 
-    ASSERT_TRUE(v1.size() == 1);
-    ASSERT_EQ(v1[0].attribute, "href");
-    ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans");
+    // URI-decoding is NOT performed on extracted links
+    // (that's normalize_link()'s job)
+    EXPECT_LINKS(
+      "<audio controls src ='/music/It&apos;s%20only%20love.ogg'></audio>",
+      "{ src, /music/It's%20only%20love.ogg }"
+    );
 
-    std::string page2 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
-    auto v2 = generic_getLinks(page2);
+    EXPECT_LINKS(
+      R"(<a href="/R&amp;D">Research and development</a>
+         blablabla
+         <a href="../syntax/&lt;script&gt;">&lt;script&gt;</a>
+         ...
+         <a href="/Presidents/Dwight_&quot;Ike&quot;_Eisenhower">#34</a>
+         <img src="https://example.com/getlogo?w=640&amp;h=480">
+      )",
+      "{ href, /R&D }"                                    "\n"
+      "{ href, ../syntax/<script> }"                      "\n"
+      "{ href, /Presidents/Dwight_\"Ike\"_Eisenhower }"   "\n"
+      "{ src, https://example.com/getlogo?w=640&h=480 }"
+    );
 
-    ASSERT_TRUE(v2.size() == 1);
-    ASSERT_EQ(v1[0].attribute, "href");
+    // Known issue - HTML is not parsed and therefore false links
+    //               may be returned
+    EXPECT_LINKS(
+      R"(
+<html>
+  <head>
+    <link src = "/css/stylesheet.css" rel="stylesheet">
+    <link rel="icon" href   =    '/favicon.ico'>
+  </head>
+  <body>
+    <img src="../img/welcome.png">
+    <!--
+      <a href="commented_out_link.htm"></a>
+      <img src="commented_out_image.png">
+    -->
+    <pre>
+      &lt;a href="not_a_link_in_example_code_block.htm"&gt;&lt;/a&gt;
+      &lt;img src="not_a_link_in_example_code_block.png"&gt;
+    </pre>
+    Powered by <a target="_blank" href="https://kiwix.org">Kiwix</a>.
+  </body>
+</html>
+)",
+      // links
+      "{ src, /css/stylesheet.css }"                      "\n"
+      "{ href, /favicon.ico }"                            "\n"
+      "{ src, ../img/welcome.png }"                       "\n"
+      "{ href, commented_out_link.htm }"                  "\n"
+      "{ src, commented_out_image.png }"                  "\n"
+      "{ href, not_a_link_in_example_code_block.htm }"    "\n"
+      "{ src, not_a_link_in_example_code_block.png }"     "\n"
+      "{ href, https://kiwix.org }"
+    );
 
-    std::string page3 = "<link src=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
-    auto v3 = generic_getLinks(page3);
+    // Despite HTML not being properly parsed, not every href or src followed
+    // by an equality sign (with optional whitespace in between) results in a
+    // link
+    EXPECT_LINKS(
+      "abcd href = qwerty src={123} xyz",
+      ""
+    );
 
-    ASSERT_TRUE(v3.size() == 1);
-    ASSERT_EQ(v3[0].attribute, "src");
-    ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans");
 }
+#undef EXPECT_LINKS
 
 TEST(tools, httpRedirectHtml)
 {