Skip to content

Commit

Permalink
Merge pull request #383 from openzim/better_link_extraction
Browse files Browse the repository at this point in the history
Decoding of HTML entities in links
  • Loading branch information
kelson42 authored Nov 15, 2023
2 parents b8a0a4c + a9d0e1c commit d406de4
Show file tree
Hide file tree
Showing 3 changed files with 195 additions and 25 deletions.
66 changes: 57 additions & 9 deletions src/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,54 @@ void stripTitleInvalidChars(std::string& str)
replaceStringInPlace(str, "\u202C", "");
}

namespace
{

// Maps the name of an HTML character reference (the text between '&' and
// ';', e.g. "amp") to its replacement text.
// Returns nullptr when the name is not one of the supported entities.
// The lookup is case-sensitive.
const char* getHtmlEntity(const std::string& core)
{
  static const std::map<std::string, const char*> t = {
    { "amp",  "&"  },
    { "apos", "'"  },
    { "quot", "\"" },
    { "lt",   "<"  },
    { "gt",   ">"  },
  };

  const auto it = t.find(core);
  return it != t.end() ? it->second : nullptr;
}

} // unnamed namespace

// Returns a copy of `str` in which every supported HTML character reference
// (see getHtmlEntity()) is replaced by the character it stands for.
//
// - Unsupported or malformed references (unknown name, missing ';') are
//   copied to the output verbatim.
// - Decoding is single-pass: "&amp;lt;" yields "&lt;", not "<".
// - The whole string is processed, including any bytes that follow an
//   embedded '\0' (scanning is bounded by str.size(), not by a NUL
//   terminator).
std::string decodeHtmlEntities(const std::string& str)
{
  const std::string::size_type npos = std::string::npos;

  std::string result;
  result.reserve(str.size()); // decoded text is never longer than the input

  // Position of the '&' that opens a not-yet-terminated candidate
  // reference, or npos when we are outside of any candidate.
  std::string::size_type ampPos = npos;

  for ( std::string::size_type i = 0; i < str.size(); ++i ) {
    const char c = str[i];
    if ( c == '&' ) {
      // A new '&' cancels the pending candidate: emit it unchanged and
      // start over from this '&'.
      if ( ampPos != npos ) {
        result.append(str, ampPos, i - ampPos);
      }
      ampPos = i;
    } else if ( ampPos == npos ) {
      // Ordinary character outside of any candidate reference.
      result.push_back(c);
    } else if ( c == ';' ) {
      // Candidate reference is complete: str[ampPos+1 .. i-1] is its name.
      const char* const decoded =
          getHtmlEntity(str.substr(ampPos + 1, i - ampPos - 1));
      if ( decoded ) {
        result += decoded;
      } else {
        // Unknown entity - keep the original text, including '&' and ';'.
        result.append(str, ampPos, i - ampPos + 1);
      }
      ampPos = npos;
    }
    // Otherwise the character belongs to the pending candidate; it is
    // emitted later, once the candidate is resolved or cancelled.
  }

  // Unterminated candidate at the end of input is emitted verbatim.
  if ( ampPos != npos ) {
    result.append(str, ampPos, npos);
  }
  return result;
}

std::vector<html_link> generic_getLinks(const std::string& page)
{
const char* p = page.c_str();
Expand Down Expand Up @@ -310,7 +358,7 @@ std::vector<html_link> generic_getLinks(const std::string& page)
while(*p != delimiter)
p++;
const std::string link(linkStart, p);
links.push_back(html_link(attr, link));
links.push_back(html_link(attr, decodeHtmlEntities(link)));
p += 1;
}
return links;
Expand Down Expand Up @@ -356,7 +404,6 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
std::string output;
output.reserve(baseUrl.size() + input.size() + 1);

bool in_query = false;
bool check_rel = false;
const char* p = input.c_str();
if ( *(p) == '/') {
Expand All @@ -373,7 +420,7 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
//URL Decoding.
while (*p)
{
if ( !in_query && check_rel ) {
if ( check_rel ) {
if (strncmp(p, "../", 3) == 0) {
// We must go "up"
// Remove the '/' at the end of output.
Expand All @@ -394,9 +441,13 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
continue;
}
}
if ( *p == '#' || *p == '?')
// This is a beginning of the #anchor inside a page. No need to decode more

if ( *p == '#' || *p == '?') {
// For our purposes we can safely discard the query and/or fragment
// components of the URL
break;
}

if ( *p == '%')
{
char ch;
Expand All @@ -405,10 +456,7 @@ std::string normalize_link(const std::string& input, const std::string& baseUrl)
p += 3;
continue;
}
if ( *p == '?' ) {
// We are in the query, so don't try to interprete '/' as path separator
in_query = true;
}

if ( *p == '/') {
check_rel = true;
if (output.empty()) {
Expand Down
2 changes: 2 additions & 0 deletions src/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base);
//Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final.
int adler32(const std::string& buf);

std::string decodeHtmlEntities(const std::string& str);

//Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it.
//Converts the %20 to space. Essential for comparing URLs.
std::string normalize_link(const std::string& input, const std::string& baseUrl);
Expand Down
152 changes: 136 additions & 16 deletions test/tools-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,9 @@ TEST(tools, normalize_link)

ASSERT_EQ(normalize_link("a", ""), "a");
ASSERT_EQ(normalize_link("./a", ""), "a");

// URI-decoding is performed
ASSERT_EQ(normalize_link("/%41%62c", "/"), "Abc");
}

TEST(tools, addler32)
Expand All @@ -257,32 +260,149 @@ TEST(tools, addler32)
ASSERT_EQ(adler32(""), 1);
}

TEST(tools, decodeHtmlEntities)
{
  // Table of (input, expected output) pairs, checked in order below.
  struct Case
  {
    const char* input;
    const char* expected;
  };

  const Case cases[] = {
    { "", "" },

    // Supported HTML character references
    { "&amp;",  "&"  },
    { "&apos;", "'"  },
    { "&quot;", "\"" },
    { "&lt;",   "<"  },
    { "&gt;",   ">"  },

    // All other HTML character references
    // (https://html.spec.whatwg.org/multipage/syntax.html#character-references)
    // are NOT currently supported
    { "&nbsp;", "&nbsp;" },

    // Capitalized versions of supported ones do NOT work
    { "&AMP;", "&AMP;" },
    { "&aMP;", "&aMP;" },

    // HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded
    { "&#65;",  "&#65;"  }, // should be "A"
    { "&#x41;", "&#x41;" }, // should be "A"

    // Handling of "incomplete" entity
    { "&amp", "&amp" },

    // No double decoding
    { "&amp;lt;", "&lt;" },

    // Entities adjacent to each other and to plain text
    { "&lt;&gt;", "<>" },
    { "1&lt;2", "1<2" },

    // A stray '&' followed by a real entity
    { "3&5&gt;3/5", "3&5>3/5" },

    { "Q&amp;A stands for &quot;Questions and answers&quot;",
      "Q&A stands for \"Questions and answers\"" },
  };

  for ( const Case& c : cases ) {
    ASSERT_EQ(decodeHtmlEntities(c.input), c.expected);
  }
}

// Renders a list of links as text, one "{ attribute, link }" entry per line
// (entries are separated by '\n'; there is no trailing newline).
// An empty list yields an empty string.
std::string links2Str(const std::vector<html_link>& links)
{
  std::ostringstream out;
  bool isFirst = true;
  for ( const auto& entry : links ) {
    if ( !isFirst ) {
      out << "\n";
    }
    isFirst = false;
    out << "{ " << entry.attribute << ", " << entry.link << " }";
  }
  return out.str();
}

// Checks that the links extracted from `html` by generic_getLinks(), once
// rendered with links2Str(), are equal to `expectedStr`.
// NOTE: despite the EXPECT_ prefix this expands to ASSERT_EQ.
#define EXPECT_LINKS(html, expectedStr) \
ASSERT_EQ(links2Str(generic_getLinks(html)), expectedStr)

TEST(tools, getLinks)
{
auto v = generic_getLinks("");
EXPECT_LINKS(
"",
""
);

EXPECT_LINKS(
R"(<link href="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
"{ href, https://fonts.io/css?family=OpenSans }"
);

ASSERT_TRUE(v.empty());
EXPECT_LINKS(
R"(<link href='https://fonts.io/css?family=OpenSans' rel="stylesheet">)",
"{ href, https://fonts.io/css?family=OpenSans }"
);

std::string page1 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v1 = generic_getLinks(page1);
EXPECT_LINKS(
R"(<link src="https://fonts.io/css?family=OpenSans" rel="stylesheet">)",
"{ src, https://fonts.io/css?family=OpenSans }"
);

ASSERT_TRUE(v1.size() == 1);
ASSERT_EQ(v1[0].attribute, "href");
ASSERT_EQ(v1[0].link, "https://fonts.goos.com/css?family=OpenSans");
// URI-decoding is NOT performed on extracted links
// (that's normalize_link()'s job)
EXPECT_LINKS(
"<audio controls src ='/music/It&apos;s%20only%20love.ogg'></audio>",
"{ src, /music/It's%20only%20love.ogg }"
);

std::string page2 = "<link href=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v2 = generic_getLinks(page2);
EXPECT_LINKS(
R"(<a href="/R&amp;D">Research and development</a>
blablabla
<a href="../syntax/&lt;script&gt;">&lt;script&gt;</a>
...
<a href="/Presidents/Dwight_&quot;Ike&quot;_Eisenhower">#34</a>
<img src="https://example.com/getlogo?w=640&amp;h=480">
)",
"{ href, /R&D }" "\n"
"{ href, ../syntax/<script> }" "\n"
"{ href, /Presidents/Dwight_\"Ike\"_Eisenhower }" "\n"
"{ src, https://example.com/getlogo?w=640&h=480 }"
);

ASSERT_TRUE(v2.size() == 1);
ASSERT_EQ(v1[0].attribute, "href");
// Known issue - HTML is not parsed and therefore false links
// may be returned
EXPECT_LINKS(
R"(
<html>
<head>
<link src = "/css/stylesheet.css" rel="stylesheet">
<link rel="icon" href = '/favicon.ico'>
</head>
<body>
<img src="../img/welcome.png">
<!--
<a href="commented_out_link.htm"></a>
<img src="commented_out_image.png">
-->
<pre>
&lt;a href="not_a_link_in_example_code_block.htm"&gt;&lt;/a&gt;
&lt;img src="not_a_link_in_example_code_block.png"&gt;
</pre>
Powered by <a target="_blank" href="https://kiwix.org">Kiwix</a>.
</body>
</html>
)",
// links
"{ src, /css/stylesheet.css }" "\n"
"{ href, /favicon.ico }" "\n"
"{ src, ../img/welcome.png }" "\n"
"{ href, commented_out_link.htm }" "\n"
"{ src, commented_out_image.png }" "\n"
"{ href, not_a_link_in_example_code_block.htm }" "\n"
"{ src, not_a_link_in_example_code_block.png }" "\n"
"{ href, https://kiwix.org }"
);

std::string page3 = "<link src=\"https://fonts.goos.com/css?family=OpenSans\" rel=\"stylesheet\">";
auto v3 = generic_getLinks(page3);
// Despite HTML not being properly parsed, not every href or src followed
// by an equality sign (with optional whitespace in between) results in a
// link
EXPECT_LINKS(
"abcd href = qwerty src={123} xyz",
""
);

ASSERT_TRUE(v3.size() == 1);
ASSERT_EQ(v3[0].attribute, "src");
ASSERT_EQ(v3[0].link, "https://fonts.goos.com/css?family=OpenSans");
}
#undef EXPECT_LINKS

TEST(tools, httpRedirectHtml)
{
Expand Down

0 comments on commit d406de4

Please sign in to comment.