Skip to content

Commit

Permalink
FIX Use HTML5 library for better support of indexing page content
Browse files Browse the repository at this point in the history
  • Loading branch information
wilr committed Sep 20, 2021
1 parent 405ecb9 commit dc97808
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
5 changes: 3 additions & 2 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
"email": "will@fullscreen.io"
}],
"require": {
"symbiote/silverstripe-queuedjobs": "^4.0.0",
"symbiote/silverstripe-queuedjobs": "^4",
"algolia/algoliasearch-client-php": "^2",
"ramsey/uuid": "^4"
"ramsey/uuid": "^4",
"masterminds/html5": "^2.7"
},
"require-dev": {
"phpunit/phpunit": "^5.7",
Expand Down
36 changes: 26 additions & 10 deletions src/Service/AlgoliaPageCrawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

namespace Wilr\SilverStripe\Algolia\Service;

use DOMDocument;
use DOMXPath;
use Exception;
use Masterminds\HTML5;
use Psr\Log\LoggerInterface;
use SilverStripe\CMS\Controllers\ModelAsController;
use SilverStripe\CMS\Model\SiteTree;
Expand Down Expand Up @@ -33,12 +33,20 @@ class AlgoliaPageCrawler

/**
* Defines the xpath selector for the first element of content
* that should be indexed.
* that should be indexed. If blank, defaults to the `main` element
*
* @config
* @var string
*/
private static $content_xpath_selector = '//main';
private static $content_xpath_selector = '';

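/**
* Tag name of the element to index when `content_xpath_selector` is blank.
* The text of the first matching element in the rendered page is used.
*
* @config
*
* @var string
*/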
private static $content_element_tag = 'main';


public function __construct($item)
{
Expand All @@ -52,9 +60,11 @@ public function getMainContent(): string
}

$selector = $this->config()->get('content_xpath_selector');
$useXpath = true;

if (!$selector) {
return '';
$useXpath = false;
$selector = $this->config()->get('content_element_tag');
}

// Enable frontend themes in order to correctly render the elements as
Expand Down Expand Up @@ -83,21 +93,27 @@ public function getMainContent(): string
try {
/** @var DBHTMLText $page */
$page = $controller->render();

if ($page) {
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($page->forTemplate());
$xpath = new DOMXPath($dom);
$nodes = $xpath->query($selector);
$html5 = new HTML5();

$dom = $html5->loadHTML($page->forTemplate());

if ($useXpath) {
$xpath = new DOMXPath($dom);
$nodes = $xpath->query($selector);
} else {
$nodes = $dom->getElementsByTagName($selector);
}

if (isset($nodes[0])) {
$output = $nodes[0]->nodeValue;
$output = preg_replace('/\s+/', ' ', $nodes[0]->nodeValue);
}
}
} catch (Exception $e) {
Injector::inst()->create(LoggerInterface::class)->error($e);
}

Requirements::restore();
Config::unnest();

Expand Down

0 comments on commit dc97808

Please sign in to comment.