diff --git a/src/Console/Command/SolrIndexerBuilder.php b/src/Console/Command/SolrIndexerBuilder.php index a88f16b..bd6920d 100644 --- a/src/Console/Command/SolrIndexerBuilder.php +++ b/src/Console/Command/SolrIndexerBuilder.php @@ -8,11 +8,15 @@ use Atoolo\Resource\Loader\SiteKitLoader; use Atoolo\Resource\Loader\SiteKitNavigationHierarchyLoader; use Atoolo\Search\Console\Command\Io\IndexerProgressBar; +use Atoolo\Search\Service\Indexer\ContentCollector; use Atoolo\Search\Service\Indexer\DocumentEnricher; use Atoolo\Search\Service\Indexer\IndexDocument; use Atoolo\Search\Service\Indexer\IndexingAborter; use Atoolo\Search\Service\Indexer\LocationFinder; +use Atoolo\Search\Service\Indexer\SiteKit\ContentMatcher; use Atoolo\Search\Service\Indexer\SiteKit\DefaultSchema2xDocumentEnricher; +use Atoolo\Search\Service\Indexer\SiteKit\HeadlineMatcher; +use Atoolo\Search\Service\Indexer\SiteKit\RichtTextMatcher; use Atoolo\Search\Service\Indexer\SiteKit\SubDirTranslationSplitter; use Atoolo\Search\Service\Indexer\SolrIndexer; use Atoolo\Search\Service\SolrParameterClientFactory; @@ -75,8 +79,17 @@ public function build(): SolrIndexer $navigationLoader = new SiteKitNavigationHierarchyLoader( $resourceLoader ); + + /** @var iterable $matcher */ + $matcher = [ + new HeadlineMatcher(), + new RichtTextMatcher(), + ]; + $contentCollector = new ContentCollector($matcher); + $schema21 = new DefaultSchema2xDocumentEnricher( - $navigationLoader + $navigationLoader, + $contentCollector ); /** @var array> $documentEnricherList */ diff --git a/src/Service/Indexer/ContentCollector.php b/src/Service/Indexer/ContentCollector.php new file mode 100644 index 0000000..372abef --- /dev/null +++ b/src/Service/Indexer/ContentCollector.php @@ -0,0 +1,62 @@ + $matchers + */ + public function __construct(private readonly iterable $matchers) + { + } + + /** + * @param array $data + */ + public function collect(array $data): string + { + $content = $this->walk([], $data); + return implode(' ', $content); + } + + /** + * @param string[] $path + * @param array $data + * @return string[] + */ + private function walk(array $path, array $data): array + { + $contentCollections = []; + foreach ($data as $key => $value) { + if (!is_array($value)) { + continue; + } + + if (is_string($key)) { + $path[] = $key; + } + + $matcherContent = []; + foreach ($this->matchers as $matcher) { + $content = $matcher->match($path, $value); + if (!is_string($content)) { + continue; + } + $matcherContent[] = $content; + } + $contentCollections[] = $matcherContent; + $contentCollections[] = $this->walk($path, $value); + + if (is_string($key)) { + array_pop($path); + } + } + + return array_merge([], ...$contentCollections); + } +} diff --git a/src/Service/Indexer/SiteKit/ContentMatcher.php b/src/Service/Indexer/SiteKit/ContentMatcher.php new file mode 100644 index 0000000..823ddb7 --- /dev/null +++ b/src/Service/Indexer/SiteKit/ContentMatcher.php @@ -0,0 +1,14 @@ + $value + */ + public function match(array $path, array $value): bool|string; +} diff --git a/src/Service/Indexer/SiteKit/DefaultSchema2xDocumentEnricher.php b/src/Service/Indexer/SiteKit/DefaultSchema2xDocumentEnricher.php index 7a92733..72505cd 100644 --- a/src/Service/Indexer/SiteKit/DefaultSchema2xDocumentEnricher.php +++ b/src/Service/Indexer/SiteKit/DefaultSchema2xDocumentEnricher.php @@ -7,6 +7,7 @@ use Atoolo\Resource\Loader\SiteKitNavigationHierarchyLoader; use Atoolo\Resource\Resource; use Atoolo\Search\Exception\DocumentEnrichingException; +use Atoolo\Search\Service\Indexer\ContentCollector; use Atoolo\Search\Service\Indexer\DocumentEnricher; use Atoolo\Search\Service\Indexer\IndexDocument; use Atoolo\Search\Service\Indexer\IndexSchema2xDocument; @@ -14,12 +15,37 @@ use Exception; /** + * @phpstan-type Phone array{ + * countryCode?:string, + * areaCode?:string, + * localNumber?:string + * } + * @phpstan-type PhoneData array{phone:Phone} + * @phpstan-type PhoneList array + * @phpstan-type Email array{email:string} + * @phpstan-type EmailList array + * @phpstan-type ContactData array{ + * phoneList?:PhoneList, + * emailList:EmailList + * } + * @phpstan-type AddressData array{ + * buildingName?:string, + * street?:string, + * postOfficeBoxData?: array{ + * buildingName?:string + * } + * } + * @phpstan-type ContactPoint array{ + * contactData?:ContactData, + * addressData?:AddressData + * } * @implements DocumentEnricher */ class DefaultSchema2xDocumentEnricher implements DocumentEnricher { public function __construct( - private readonly SiteKitNavigationHierarchyLoader $navigationLoader + private readonly SiteKitNavigationHierarchyLoader $navigationLoader, + private readonly ContentCollector $contentCollector ) { } @@ -149,7 +175,7 @@ public function enrichDocument( if ($siteGroupId !== 0) { $sites[] = (string)$siteGroupId; } - $doc->sp_site = array_unique($sites, SORT_STRING); + $doc->sp_site = array_unique($sites); } catch (Exception $e) { throw new DocumentEnrichingException( $resource->getLocation(), @@ -226,9 +252,6 @@ public function enrichDocument( 'text/html; charset=UTF-8' ); $doc->meta_content_type = $contentType; - $doc->content = $resource->getData()->getString( - 'searchindexdata.content' - ); $accessType = $resource->getData()->getString('init.access.type'); @@ -252,9 +275,85 @@ public function enrichDocument( $doc->sp_source = ['internal']; + return $this->enrichContent($resource, $doc); + } + + /** + * @param IndexSchema2xDocument $doc + * @return IndexSchema2xDocument + */ + private function enrichContent( + Resource $resource, + IndexDocument $doc, + ): IndexDocument { + + $content = []; + $content[] = $resource->getData()->getString( + 'searchindexdata.content' + ); + + $content[] = $this->contentCollector->collect( + $resource->getData()->getArray('content') + ); + + /** @var ContactPoint $contactPoint */ + $contactPoint = $resource->getData()->getArray('metadata.contactPoint'); + $content[] = $this->contactPointToContent($contactPoint); + + $cleanContent = preg_replace( + '/\s+/', + ' ', + implode(' ', $content) + ); + + $doc->content = trim($cleanContent ?? ''); + return $doc; } + /** + * @param ContactPoint $contactPoint + * @return string + */ + private function contactPointToContent(array $contactPoint): string + { + if (empty($contactPoint)) { + return ''; + } + + $content = []; + foreach (($contactPoint['contactData']['phoneList'] ?? []) as $phone) { + $countryCode = $phone['phone']['countryCode'] ?? ''; + if ( + !empty($countryCode) && + !in_array($countryCode, $content, true) + ) { + $content[] = '+' . $countryCode; + } + $areaCode = $phone['phone']['areaCode'] ?? ''; + if (!empty($areaCode) && !in_array($areaCode, $content, true)) { + $content[] = $areaCode; + $content[] = '0' . $areaCode; + } + $content[] = ($phone['phone']['localNumber'] ?? ''); + } + foreach (($contactPoint['contactData']['emailList'] ?? []) as $email) { + $content[] = $email['email']; + } + + if (isset($contactPoint['addressData'])) { + $addressData = $contactPoint['addressData']; + $content[] = ($addressData['street'] ?? ''); + $content[] = ($addressData['buildingName'] ?? ''); + $content[] = ( + $addressData['postOfficeBoxData']['buildingName'] ?? + '' + ); + } + + return implode(' ', $content); + } + private function idWithoutSignature(string $id): int { $s = substr($id, -11); diff --git a/src/Service/Indexer/SiteKit/HeadlineMatcher.php b/src/Service/Indexer/SiteKit/HeadlineMatcher.php new file mode 100644 index 0000000..26e2466 --- /dev/null +++ b/src/Service/Indexer/SiteKit/HeadlineMatcher.php @@ -0,0 +1,29 @@ + [ + [ + "model" => [ + "richText" => [ + "normalized" => true, + "modelType" => "html.richText", + "text" => "

Ein Text

" + ] + ] + ] + ] + ]; + $content = $collector->collect($data); + + $this->assertEquals('

Ein Text

', $content, 'unexpected content'); + } +} diff --git a/test/Service/Indexer/SiteKit/HeadlineMatcherTest.php b/test/Service/Indexer/SiteKit/HeadlineMatcherTest.php new file mode 100644 index 0000000..6e1632c --- /dev/null +++ b/test/Service/Indexer/SiteKit/HeadlineMatcherTest.php @@ -0,0 +1,24 @@ + "Überschrift" + ]; + + $content = $matcher->match(['items', 'model'], $value); + + $this->assertEquals('Überschrift', $content, 'unexpected headline'); + } +} diff --git a/test/Service/Indexer/SiteKit/QuoteSectionMatcherTest.php b/test/Service/Indexer/SiteKit/QuoteSectionMatcherTest.php new file mode 100644 index 0000000..b6882cf --- /dev/null +++ b/test/Service/Indexer/SiteKit/QuoteSectionMatcherTest.php @@ -0,0 +1,32 @@ + "quote", + "model" => [ + "quote" => "Quote-Text", + "citation" => "Citation" + ] + ]; + + $content = $matcher->match(['items'], $value); + + $this->assertEquals( + 'Quote-Text Citation', + $content, + 'unexpected quote text' + ); + } +} diff --git a/test/Service/Indexer/SiteKit/RichtTextMatcherTest.php b/test/Service/Indexer/SiteKit/RichtTextMatcherTest.php new file mode 100644 index 0000000..5d98bf9 --- /dev/null +++ b/test/Service/Indexer/SiteKit/RichtTextMatcherTest.php @@ -0,0 +1,26 @@ + true, + "modelType" => "html.richText", + "text" => "

Ein Text

" + ]; + + $content = $matcher->match([], $value); + + $this->assertEquals('Ein Text', $content, 'unexpected content'); + } +}