Skip to content

Commit

Permalink
feat: content matcher to index full-text-content
Browse files Browse the repository at this point in the history
  • Loading branch information
sitepark-veltrup committed Feb 15, 2024
1 parent abcd512 commit c40d02d
Show file tree
Hide file tree
Showing 11 changed files with 424 additions and 6 deletions.
15 changes: 14 additions & 1 deletion src/Console/Command/SolrIndexerBuilder.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,15 @@
use Atoolo\Resource\Loader\SiteKitLoader;
use Atoolo\Resource\Loader\SiteKitNavigationHierarchyLoader;
use Atoolo\Search\Console\Command\Io\IndexerProgressBar;
use Atoolo\Search\Service\Indexer\ContentCollector;
use Atoolo\Search\Service\Indexer\DocumentEnricher;
use Atoolo\Search\Service\Indexer\IndexDocument;
use Atoolo\Search\Service\Indexer\IndexingAborter;
use Atoolo\Search\Service\Indexer\LocationFinder;
use Atoolo\Search\Service\Indexer\SiteKit\ContentMatcher;
use Atoolo\Search\Service\Indexer\SiteKit\DefaultSchema2xDocumentEnricher;
use Atoolo\Search\Service\Indexer\SiteKit\HeadlineMatcher;
use Atoolo\Search\Service\Indexer\SiteKit\RichtTextMatcher;
use Atoolo\Search\Service\Indexer\SiteKit\SubDirTranslationSplitter;
use Atoolo\Search\Service\Indexer\SolrIndexer;
use Atoolo\Search\Service\SolrParameterClientFactory;
Expand Down Expand Up @@ -75,8 +79,17 @@ public function build(): SolrIndexer
$navigationLoader = new SiteKitNavigationHierarchyLoader(
$resourceLoader
);

/** @var iterable<ContentMatcher> $matcher */
$matcher = [
new HeadlineMatcher(),
new RichtTextMatcher(),
];
$contentCollector = new ContentCollector($matcher);

$schema21 = new DefaultSchema2xDocumentEnricher(
$navigationLoader
$navigationLoader,
$contentCollector
);

/** @var array<DocumentEnricher<IndexDocument>> $documentEnricherList */
Expand Down
62 changes: 62 additions & 0 deletions src/Service/Indexer/ContentCollector.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php

declare(strict_types=1);

namespace Atoolo\Search\Service\Indexer;

use Atoolo\Search\Service\Indexer\SiteKit\ContentMatcher;

/**
 * Walks a nested content array and gathers every text fragment that one of
 * the configured matchers extracts from its array nodes.
 */
class ContentCollector
{
    /**
     * @param iterable<ContentMatcher> $matchers
     */
    public function __construct(private readonly iterable $matchers)
    {
    }

    /**
     * Collects all matched fragments below $data and joins them with a
     * single space.
     *
     * @param array<mixed,mixed> $data
     */
    public function collect(array $data): string
    {
        return implode(' ', $this->walk([], $data));
    }

    /**
     * Depth-first traversal. For every array-valued entry the matchers are
     * asked first, then the entry itself is descended into, so fragments
     * come out in document order.
     *
     * @param string[] $path string keys leading to $data
     * @param array<mixed,mixed> $data
     * @return string[]
     */
    private function walk(array $path, array $data): array
    {
        $fragments = [];

        foreach ($data as $key => $node) {
            if (is_array($node)) {
                // Only string keys become path segments; list indexes
                // leave the path untouched.
                $nodePath = is_string($key) ? [...$path, $key] : $path;

                foreach ($this->matchers as $matcher) {
                    $matched = $matcher->match($nodePath, $node);
                    if (is_string($matched)) {
                        $fragments[] = $matched;
                    }
                }

                foreach ($this->walk($nodePath, $node) as $nested) {
                    $fragments[] = $nested;
                }
            }
        }

        return $fragments;
    }
}
14 changes: 14 additions & 0 deletions src/Service/Indexer/SiteKit/ContentMatcher.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<?php

declare(strict_types=1);

namespace Atoolo\Search\Service\Indexer\SiteKit;

/**
 * Strategy that extracts indexable text from a single array node of a
 * nested content structure.
 */
interface ContentMatcher
{
    /**
     * Inspects one array node and returns its text when the node is of a
     * kind this matcher handles.
     *
     * ContentCollector only keeps results that are strings; any other
     * result (the visible implementations return false) is ignored.
     *
     * @param string[] $path string keys that lead from the root to $value;
     *  ContentCollector does not add list indexes to the path
     * @param array<mixed, mixed> $value the array node to inspect
     * @return bool|string the extracted text, or a non-string value when
     *  the node does not match
     */
    public function match(array $path, array $value): bool|string;
}
109 changes: 104 additions & 5 deletions src/Service/Indexer/SiteKit/DefaultSchema2xDocumentEnricher.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,45 @@
use Atoolo\Resource\Loader\SiteKitNavigationHierarchyLoader;
use Atoolo\Resource\Resource;
use Atoolo\Search\Exception\DocumentEnrichingException;
use Atoolo\Search\Service\Indexer\ContentCollector;
use Atoolo\Search\Service\Indexer\DocumentEnricher;
use Atoolo\Search\Service\Indexer\IndexDocument;
use Atoolo\Search\Service\Indexer\IndexSchema2xDocument;
use DateTime;
use Exception;

/**
* @phpstan-type Phone array{
* countryCode?:string,
* areaCode?:string,
* localNumber?:string
* }
* @phpstan-type PhoneData array{phone:Phone}
* @phpstan-type PhoneList array<PhoneData>
* @phpstan-type Email array{email:string}
* @phpstan-type EmailList array<Email>
* @phpstan-type ContactData array{
* phoneList?:PhoneList,
* emailList:EmailList
* }
* @phpstan-type AddressData array{
* buildingName?:string,
* street?:string,
* postOfficeBoxData?: array{
* buildingName?:string
* }
* }
* @phpstan-type ContactPoint array{
* contactData?:ContactData,
* addressData?:AddressData
* }
* @implements DocumentEnricher<IndexSchema2xDocument>
*/
class DefaultSchema2xDocumentEnricher implements DocumentEnricher
{
public function __construct(
private readonly SiteKitNavigationHierarchyLoader $navigationLoader
private readonly SiteKitNavigationHierarchyLoader $navigationLoader,
private readonly ContentCollector $contentCollector
) {
}

Expand Down Expand Up @@ -149,7 +175,7 @@ public function enrichDocument(
if ($siteGroupId !== 0) {
$sites[] = (string)$siteGroupId;
}
$doc->sp_site = array_unique($sites, SORT_STRING);
$doc->sp_site = array_unique($sites);
} catch (Exception $e) {
throw new DocumentEnrichingException(
$resource->getLocation(),
Expand Down Expand Up @@ -226,9 +252,6 @@ public function enrichDocument(
'text/html; charset=UTF-8'
);
$doc->meta_content_type = $contentType;
$doc->content = $resource->getData()->getString(
'searchindexdata.content'
);

$accessType = $resource->getData()->getString('init.access.type');

Expand All @@ -252,9 +275,85 @@ public function enrichDocument(

$doc->sp_source = ['internal'];

return $this->enrichContent($resource, $doc);
}

/**
 * Builds the full-text `content` field of the document.
 *
 * Three sources are concatenated: the pre-rendered
 * `searchindexdata.content` string, everything the content collector
 * extracts from the `content` array, and the flattened
 * `metadata.contactPoint`. Runs of whitespace are then collapsed to a
 * single space and the result is trimmed.
 *
 * @param IndexSchema2xDocument $doc
 * @return IndexSchema2xDocument
 */
private function enrichContent(
    Resource $resource,
    IndexDocument $doc,
): IndexDocument {

    $content = [];
    $content[] = $resource->getData()->getString(
        'searchindexdata.content'
    );

    $content[] = $this->contentCollector->collect(
        $resource->getData()->getArray('content')
    );

    /** @var ContactPoint $contactPoint */
    $contactPoint = $resource->getData()->getArray('metadata.contactPoint');
    $content[] = $this->contactPointToContent($contactPoint);

    $rawContent = implode(' ', $content);

    // The `u` modifier makes PCRE treat the subject as UTF-8, so a
    // multi-byte character is never mistaken for whitespace.
    $cleanContent = preg_replace('/\s+/u', ' ', $rawContent);
    if ($cleanContent === null) {
        // preg_replace() yields null on failure, e.g. when the subject is
        // not valid UTF-8. Fall back to byte-wise matching, and as a last
        // resort to the raw text, rather than indexing an empty document.
        $cleanContent = preg_replace('/\s+/', ' ', $rawContent)
            ?? $rawContent;
    }

    $doc->content = trim($cleanContent);

    return $doc;
}

/**
 * Flattens a contact point into a space separated string so that phone
 * numbers, e-mail addresses and address parts become searchable.
 *
 * For phones the country code is added as `+<code>` and the area code
 * both with and without a leading `0`; each of these is added only once
 * even when several phone entries share it. Local numbers are always
 * added. Missing values contribute an empty string, so the result may
 * contain consecutive spaces.
 *
 * @param ContactPoint $contactPoint
 * @return string empty string for an empty contact point
 */
private function contactPointToContent(array $contactPoint): string
{
    if (empty($contactPoint)) {
        return '';
    }

    $content = [];
    foreach (($contactPoint['contactData']['phoneList'] ?? []) as $phone) {
        $countryCode = $phone['phone']['countryCode'] ?? '';
        if (!empty($countryCode)) {
            // The list stores the prefixed form, so the duplicate check
            // has to look for the prefixed form as well.
            $prefixedCountryCode = '+' . $countryCode;
            if (!in_array($prefixedCountryCode, $content, true)) {
                $content[] = $prefixedCountryCode;
            }
        }
        $areaCode = $phone['phone']['areaCode'] ?? '';
        if (!empty($areaCode) && !in_array($areaCode, $content, true)) {
            $content[] = $areaCode;
            $content[] = '0' . $areaCode;
        }
        $content[] = ($phone['phone']['localNumber'] ?? '');
    }
    foreach (($contactPoint['contactData']['emailList'] ?? []) as $email) {
        // Tolerate entries without an address instead of raising an
        // undefined-index warning.
        $content[] = ($email['email'] ?? '');
    }

    if (isset($contactPoint['addressData'])) {
        $addressData = $contactPoint['addressData'];
        $content[] = ($addressData['street'] ?? '');
        $content[] = ($addressData['buildingName'] ?? '');
        $content[] = (
            $addressData['postOfficeBoxData']['buildingName'] ??
            ''
        );
    }

    return implode(' ', $content);
}

private function idWithoutSignature(string $id): int
{
$s = substr($id, -11);
Expand Down
29 changes: 29 additions & 0 deletions src/Service/Indexer/SiteKit/HeadlineMatcher.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<?php

declare(strict_types=1);

namespace Atoolo\Search\Service\Indexer\SiteKit;

/**
 * Extracts the `headline` of nodes that are reached through a path ending
 * in `items` followed by `model`.
 */
class HeadlineMatcher implements ContentMatcher
{
    /**
     * @inheritDoc
     */
    public function match(array $path, array $value): bool|string
    {
        $depth = count($path);

        // Only the last two path segments decide whether the node is a
        // model below an item list.
        $isItemModel = $depth >= 2
            && $path[$depth - 2] === 'items'
            && $path[$depth - 1] === 'model';
        if (!$isItemModel) {
            return false;
        }

        $candidate = $value['headline'] ?? null;
        if (!is_string($candidate)) {
            return false;
        }

        return $candidate;
    }
}
51 changes: 51 additions & 0 deletions src/Service/Indexer/SiteKit/QuoteSectionMatcher.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?php

declare(strict_types=1);

namespace Atoolo\Search\Service\Indexer\SiteKit;

/**
 * Extracts quote text and citation from nodes of type `quote` that sit
 * directly below an `items` path segment.
 *
 * @phpstan-type Model array{quote?:string, citation?:string}
 */
class QuoteSectionMatcher implements ContentMatcher
{
    /**
     * @inheritDoc
     */
    public function match(array $path, array $value): bool|string
    {
        $depth = count($path);
        if ($depth === 0 || $path[$depth - 1] !== 'items') {
            return false;
        }

        $isQuote = ($value['type'] ?? '') === 'quote';
        if (!$isQuote) {
            return false;
        }

        $model = $value['model'] ?? false;
        if (!is_array($model)) {
            return false;
        }

        /** @var Model $model */

        // A missing field counts as an empty string, so a string is
        // returned even when the model carries neither field.
        $parts = [];
        foreach (['quote', 'citation'] as $field) {
            $text = $model[$field] ?? '';
            if (is_string($text)) {
                $parts[] = $text;
            }
        }

        return implode(' ', $parts);
    }
}
22 changes: 22 additions & 0 deletions src/Service/Indexer/SiteKit/RichtTextMatcher.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?php

declare(strict_types=1);

namespace Atoolo\Search\Service\Indexer\SiteKit;

/**
 * Extracts the plain text of nodes whose `modelType` is `html.richText`.
 */
class RichtTextMatcher implements ContentMatcher
{
    /**
     * Opening or closing tags of elements that end a line of text in
     * rendered HTML.
     */
    private const BLOCK_TAG_PATTERN =
        '#</?(?:address|article|aside|blockquote|br|dd|div|dl|dt'
        . '|figcaption|figure|footer|h[1-6]|header|hr|li|ol|p|pre'
        . '|section|table|tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>#i';

    /**
     * Returns the text of a rich-text node with all markup removed.
     *
     * strip_tags() removes tags without leaving a separator, so
     * `<p>foo</p><p>bar</p>` would become `foobar`. A space is therefore
     * inserted in front of every block-level tag before stripping. Inline
     * tags are left alone so that words such as `H<sub>2</sub>O` stay in
     * one piece. Whitespace is collapsed and trimmed afterwards.
     *
     * @inheritDoc
     */
    public function match(array $path, array $value): bool|string
    {
        $modelType = $value['modelType'] ?? false;
        if ($modelType !== 'html.richText') {
            return false;
        }

        $text = $value['text'] ?? false;
        if (!is_string($text)) {
            return false;
        }

        // preg_replace() yields null on failure; keep the input then.
        $separated = preg_replace(self::BLOCK_TAG_PATTERN, ' $0', $text)
            ?? $text;
        $plain = strip_tags($separated);
        $normalized = preg_replace('/\s+/u', ' ', $plain) ?? $plain;

        return trim($normalized);
    }
}
Loading

0 comments on commit c40d02d

Please sign in to comment.