Skip to content

Commit

Permalink
Enter decodeHtmlEntities()
Browse files Browse the repository at this point in the history
  • Loading branch information
veloman-yunkan committed Nov 14, 2023
1 parent df1d32b commit d4a0c13
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 0 deletions.
47 changes: 47 additions & 0 deletions src/tools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,53 @@ void stripTitleInvalidChars(std::string& str)
replaceStringInPlace(str, "\u202C", "");
}

namespace
{

// Looks up the named HTML character reference `core` (the text between the
// leading '&' and the trailing ';', both excluded).
// Returns the replacement text, or nullptr if the name is not supported.
// Only "amp", "quot", "lt" and "gt" are known; the match is case-sensitive.
const char* getHtmlEntity(const std::string& core)
{
  static const std::map<std::string, const char*> t = {
    { "amp",  "&"  },
    { "quot", "\"" },
    { "lt",   "<"  },
    { "gt",   ">"  },
  };

  const auto it = t.find(core);
  return it != t.end() ? it->second : nullptr;
}

} // unnamed namespace

// Returns a copy of `str` in which every supported HTML character reference
// (&amp; &quot; &lt; &gt;) is replaced by the character it denotes.
//
// Unsupported references (e.g. &apos; or &#65;) and incomplete ones (an '&'
// that is never closed by ';') are copied to the output unchanged. The input
// is scanned once, so the output of a replacement is never decoded again
// ("&amp;lt;" yields "&lt;", not "<").
//
// The whole string is processed, including any embedded '\0' bytes.
std::string decodeHtmlEntities(const std::string& str)
{
  std::string result;
  // Every supported replacement is shorter than its reference, so the output
  // never grows beyond the input size.
  result.reserve(str.size());

  // Iterate over the explicit [data, data+size) range rather than stopping at
  // the first '\0', which would truncate strings containing NUL bytes.
  const char* const end = str.data() + str.size();

  // Points at the '&' of the reference currently being collected, or is
  // nullptr while plain text is being copied.
  const char* start = nullptr;
  for ( const char* p = str.data(); p != end; ++p ) {
    if ( *p == '&' ) {
      if ( start ) {
        // A new '&' before the previous reference was closed: the pending
        // text was not a reference after all, so emit it verbatim.
        result.append(start, p);
      }
      start = p;
    } else if ( !start ) {
      result.push_back(*p);
    } else if ( *p == ';' ) {
      const char* d = getHtmlEntity(std::string(start+1, p));
      if ( d ) {
        result += d;
      } else {
        // Unknown reference: keep the original "&...;" text.
        result.append(start, p+1);
      }
      start = nullptr;
    }
  }
  if ( start ) {
    // Input ended inside an unterminated reference; emit it verbatim.
    result.append(start, end);
  }
  return result;
}

std::vector<html_link> generic_getLinks(const std::string& page)
{
const char* p = page.c_str();
Expand Down
2 changes: 2 additions & 0 deletions src/tools.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ bool isOutofBounds(const std::string& input, std::string base);
//Please note that the adler32 hash function has a high number of collisions, and that the hash match is not taken as final.
int adler32(const std::string& buf);

//Returns a copy of str with the HTML character references &amp; &quot; &lt; and &gt;
//replaced by the characters they stand for (& " < >).
//Unsupported or incomplete references (e.g. &apos; &#65; or "&amp" without the ';') are left as is.
std::string decodeHtmlEntities(const std::string& str);

//Removes extra spaces from URLs. Usually done by the browser, so web authors sometimes tend to ignore it.
//Converts the %20 to space. Essential for comparing URLs.
std::string normalize_link(const std::string& input, const std::string& baseUrl);
Expand Down
38 changes: 38 additions & 0 deletions test/tools-test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,44 @@ TEST(tools, addler32)
ASSERT_EQ(adler32(""), 1);
}

// Checks decodeHtmlEntities() on the named character references it supports
// and pins down its current behaviour for input it does not decode.
TEST(tools, decodeHtmlEntities)
{
// Empty input yields empty output
ASSERT_EQ(decodeHtmlEntities(""), "");

// Supported HTML character references
ASSERT_EQ(decodeHtmlEntities("&amp;"), "&");
ASSERT_EQ(decodeHtmlEntities("&quot;"), "\"");
ASSERT_EQ(decodeHtmlEntities("&lt;"), "<");
ASSERT_EQ(decodeHtmlEntities("&gt;"), ">");

// All other HTML character references
// (https://html.spec.whatwg.org/multipage/syntax.html#character-references)
// are NOT currently supported
ASSERT_EQ(decodeHtmlEntities("&apos;"), "&apos;"); // should be "'"

// Capitalized versions of supported ones do NOT work
ASSERT_EQ(decodeHtmlEntities("&AMP;"), "&AMP;");
ASSERT_EQ(decodeHtmlEntities("&aMP;"), "&aMP;");

// HTML entities of the form &#dd...; and/or &#xhh...; are NOT decoded
ASSERT_EQ(decodeHtmlEntities("&#65;"), "&#65;" ); // should be "A"
ASSERT_EQ(decodeHtmlEntities("&#x41;"), "&#x41;"); // should be "A"

// Handling of "incomplete" entity (no terminating ';'): passed through as is
ASSERT_EQ(decodeHtmlEntities("&amp"), "&amp");

// No double decoding: the "&" produced from "&amp;" must not combine with
// the following "lt;" into a second entity
ASSERT_EQ(decodeHtmlEntities("&amp;lt;"), "&lt;");

// Back-to-back entities
ASSERT_EQ(decodeHtmlEntities("&lt;&gt;"), "<>");

// Entity surrounded by plain text
ASSERT_EQ(decodeHtmlEntities("1&lt;2"), "1<2");

// Several entities mixed into a longer sentence
ASSERT_EQ(
decodeHtmlEntities("Q&amp;A stands for &quot;Questions and answers&quot;"),
"Q&A stands for \"Questions and answers\""
);
}

std::string links2Str(const std::vector<html_link>& links)
{
Expand Down

0 comments on commit d4a0c13

Please sign in to comment.