From bbe073bf4fde2dec8ddb11a849e39e3453e233c7 Mon Sep 17 00:00:00 2001
From: Dmytro Sydorenko
Date: Thu, 21 Apr 2022 17:04:21 +0200
Subject: [PATCH 1/3] Updated to support Laravel 9.

---
 composer.json                   |   8 +--
 src/Commands/SitemapCommand.php |  25 +++++---
 src/Handlers/StatsHandler.php   | 107 ++++++++++++++++++++++++++++++++
 3 files changed, 128 insertions(+), 12 deletions(-)
 create mode 100644 src/Handlers/StatsHandler.php

diff --git a/composer.json b/composer.json
index 2994608..ad1680b 100644
--- a/composer.json
+++ b/composer.json
@@ -23,11 +23,11 @@
     ],
     "require": {
         "php": "^7.3|^8.0",
-        "laravel/framework": "^6.20.12|^7.30.3|^8.4",
-        "guzzlehttp/guzzle": "^7.0",
-        "vdb/php-spider": "^v0.5.2",
+        "laravel/framework": "^6.20.12||^7.30.3||^8.4||^9.2",
+        "guzzlehttp/guzzle": "^7.2",
+        "vdb/php-spider": "^v0.6.3",
         "nesbot/carbon": "^2.41",
-        "spatie/robots-txt": "^1.0"
+        "spatie/robots-txt": "^1.0||^2.0"
     },
     "require-dev": {
         "symfony/thanks": "^1.0"
diff --git a/src/Commands/SitemapCommand.php b/src/Commands/SitemapCommand.php
index b58252d..9904748 100644
--- a/src/Commands/SitemapCommand.php
+++ b/src/Commands/SitemapCommand.php
@@ -4,17 +4,24 @@
 
 use Exception;
 use DOMDocument;
-use SimpleXMLElement;
 use Carbon\Carbon;
-use Illuminate\Console\Command;
-use Symfony\Component\EventDispatcher\Event;
+use SimpleXMLElement;
+use VDB\Spider\Spider;
 use Spatie\Robots\Robots;
+use Illuminate\Console\Command;
 use VDB\Spider\Event\SpiderEvents;
-use VDB\Spider\StatsHandler;
-use VDB\Spider\Spider;
-use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
+use Symfony\Component\EventDispatcher\Event;
+use VDB\Spider\QueueManager\InMemoryQueueManager;
+use VDB\Spider\QueueManager\QueueManagerInterface;
 use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
+use VDB\Spider\Discoverer\XPathExpressionDiscoverer;
+use BringYourOwnIdeas\LaravelSitemap\Handlers\StatsHandler;
 
+/**
+ * Class SitemapCommand
+ *
+ * @package BringYourOwnIdeas\LaravelSitemap\Commands
+ */
 class SitemapCommand extends Command
 {
     /**
@@ -64,7 +71,7 @@ protected function crawlWebsite($url)
 
         // Add a URI discoverer. Without it, the spider does nothing.
         // In this case, we want <a> tags and the canonical link
-        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]"));
+        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@rel=\"canonical\"]//a"));
         $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter([$url], true));
 
         // Set limits
@@ -83,7 +90,9 @@ function (Event $event) {
         // Add a listener to collect stats to the Spider and the QueueManager.
         // There are more components that dispatch events you can use.
         $statsHandler = new StatsHandler();
-        $spider->getQueueManager()->getDispatcher()->addSubscriber($statsHandler);
+        /** @var QueueManagerInterface|InMemoryQueueManager $queueManager */
+        $queueManager = $spider->getQueueManager();
+        $queueManager->getDispatcher()->addSubscriber($statsHandler);
         $spider->getDispatcher()->addSubscriber($statsHandler);
 
         // Execute crawl
diff --git a/src/Handlers/StatsHandler.php b/src/Handlers/StatsHandler.php
new file mode 100644
index 0000000..ebdcefb
--- /dev/null
+++ b/src/Handlers/StatsHandler.php
@@ -0,0 +1,107 @@
+<?php
+
+namespace BringYourOwnIdeas\LaravelSitemap\Handlers;
+
+use VDB\Spider\Event\SpiderEvents;
+use Symfony\Component\EventDispatcher\EventSubscriberInterface;
+use Symfony\Component\EventDispatcher\GenericEvent;
+
+/**
+ * Class StatsHandler
+ *
+ * @package BringYourOwnIdeas\LaravelSitemap\Handlers
+ */
+class StatsHandler implements EventSubscriberInterface
+{
+    /** @var string */
+    protected $spiderId;
+
+    protected $persisted = array();
+
+    protected $queued = array();
+
+    protected $filtered = array();
+
+    protected $failed = array();
+
+    public static function getSubscribedEvents(): array
+    {
+        return array(
+            SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH => 'addToFiltered',
+            SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH => 'addToFiltered',
+            SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE => 'addToQueued',
+            SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED => 'addToPersisted',
+            SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST => 'addToFailed'
+        );
+    }
+
+    public function addToQueued(GenericEvent $event)
+    {
+        $this->queued[] = $event->getArgument('uri');
+    }
+
+    public function addToPersisted(GenericEvent $event)
+    {
+        $this->persisted[] = $event->getArgument('uri');
+    }
+
+    public function addToFiltered(GenericEvent $event)
+    {
+        $this->filtered[] = $event->getArgument('uri');
+    }
+
+    public function addToFailed(GenericEvent $event)
+    {
+        $this->failed[$event->getArgument('uri')->toString()] = $event->getArgument('message');
+    }
+
+    /**
+     * @return UriInterface[]
+     */
+    public function getQueued(): array
+    {
+        return $this->queued;
+    }
+
+    /**
+     * @return UriInterface[]
+     */
+    public function getPersisted(): array
+    {
+        return $this->persisted;
+    }
+
+    /**
+     * @return FilterableInterface[]
+     */
+    public function getFiltered(): array
+    {
+        return $this->filtered;
+    }
+
+    /**
+     * @return array of form array($uriString, $reason)
+     */
+    public function getFailed(): array
+    {
+        return $this->failed;
+    }
+
+    public function toString(): string
+    {
+        $spiderId = $this->getSpiderId();
+        $queued = $this->getQueued();
+        $filtered = $this->getFiltered();
+        $failed = $this->getFailed();
+
+        $string = '';
+
+        $string .= "\n\nSPIDER ID: " . $spiderId;
+        $string .= "\n ENQUEUED: " . count($queued);
+        $string .= "\n SKIPPED: " . count($filtered);
+        $string .= "\n FAILED: " . count($failed);
+
+        return $string;
+    }
+}
From 1b4452f80c1f1ad945ceaa724eaa6a732a0e7a15 Mon Sep 17 00:00:00 2001
From: Dmytro Sydorenko
Date: Sun, 13 Nov 2022 18:58:05 +0100
Subject: [PATCH 2/3] Changed back string for XPathExpressionDiscoverer().

---
 src/Commands/SitemapCommand.php | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Commands/SitemapCommand.php b/src/Commands/SitemapCommand.php
index 9904748..bcc0f26 100644
--- a/src/Commands/SitemapCommand.php
+++ b/src/Commands/SitemapCommand.php
@@ -71,7 +71,7 @@ protected function crawlWebsite($url)
 
         // Add a URI discoverer. Without it, the spider does nothing.
         // In this case, we want <a> tags and the canonical link
-        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//div[@rel=\"canonical\"]//a"));
+        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]"));
         $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter([$url], true));
 
         // Set limits
From 36d102a3b251c0a0af6c9cee73bc89a97022ac4f Mon Sep 17 00:00:00 2001
From: Dmytro Sydorenko
Date: Tue, 22 Oct 2024 10:39:01 +0200
Subject: [PATCH 3/3] Updated to support Laravel 10.x and 11.x versions.
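
Laravel 10/11 pull in Symfony 6 components, where the old
Symfony\Component\EventDispatcher\Event class no longer exists, so the
USER_STOPPED listener below takes no argument, handle() now returns an
int exit code, and StatsHandler gains property and return types. The
stats wiring itself is unchanged; for reference, a minimal sketch of
collecting and printing crawl stats (illustrative only; assumes
$spider is an already configured VDB\Spider\Spider instance):

    // Subscribe the handler on both the queue manager's and the
    // spider's dispatcher, as done in SitemapCommand.
    $statsHandler = new StatsHandler();

    /** @var QueueManagerInterface|InMemoryQueueManager $queueManager */
    $queueManager = $spider->getQueueManager();
    $queueManager->getDispatcher()->addSubscriber($statsHandler);
    $spider->getDispatcher()->addSubscriber($statsHandler);

    // Crawl, then print the ENQUEUED/SKIPPED/FAILED summary.
    $spider->crawl();
    echo $statsHandler->toString();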
---
 composer.json                   |  6 +++---
 src/Commands/SitemapCommand.php | 31 ++++++++++++++++++-------------
 src/Handlers/StatsHandler.php   | 27 ++++++++++++++-----------
 3 files changed, 37 insertions(+), 27 deletions(-)

diff --git a/composer.json b/composer.json
index ad1680b..d5c30cd 100644
--- a/composer.json
+++ b/composer.json
@@ -22,10 +22,10 @@
         }
     ],
     "require": {
-        "php": "^7.3|^8.0",
-        "laravel/framework": "^6.20.12||^7.30.3||^8.4||^9.2",
+        "php": ">=8.1",
+        "laravel/framework": "^9.2||^10.0||^11.0",
         "guzzlehttp/guzzle": "^7.2",
-        "vdb/php-spider": "^v0.6.3",
+        "vdb/php-spider": "^v0.7.2",
         "nesbot/carbon": "^2.41",
         "spatie/robots-txt": "^1.0||^2.0"
     },
diff --git a/src/Commands/SitemapCommand.php b/src/Commands/SitemapCommand.php
index bcc0f26..4560569 100644
--- a/src/Commands/SitemapCommand.php
+++ b/src/Commands/SitemapCommand.php
@@ -10,7 +10,6 @@
 use Spatie\Robots\Robots;
 use Illuminate\Console\Command;
 use VDB\Spider\Event\SpiderEvents;
-use Symfony\Component\EventDispatcher\Event;
 use VDB\Spider\QueueManager\InMemoryQueueManager;
 use VDB\Spider\QueueManager\QueueManagerInterface;
 use VDB\Spider\Filter\Prefetch\AllowedHostsFilter;
@@ -37,9 +36,9 @@ class SitemapCommand extends Command
     /**
      * Generate the sitemap
      *
-     * @return void
+     * @return int
      */
-    public function handle()
+    public function handle(): int
     {
         // Crawl the site
         $this->info('Starting site crawl...');
@@ -51,6 +50,8 @@ public function handle()
 
         // Signal completion
         $this->info('Sitemap generation completed.');
+
+        return Command::SUCCESS;
     }
 
     /**
@@ -59,7 +60,7 @@ public function handle()
      * @param string $url
      * @return array $resources
      */
-    protected function crawlWebsite($url)
+    protected function crawlWebsite(string $url): array
     {
         // Load the robots.txt from the site.
         $robots_url = $url . '/robots.txt';
@@ -71,7 +72,7 @@ protected function crawlWebsite($url)
 
         // Add a URI discoverer. Without it, the spider does nothing.
         // In this case, we want <a> tags and the canonical link
-        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]"));
+        $spider->getDiscovererSet()->set(new XPathExpressionDiscoverer("//a|//link[@rel=\"canonical\"]//a"));
         $spider->getDiscovererSet()->addFilter(new AllowedHostsFilter([$url], true));
 
         // Set limits
@@ -81,8 +82,8 @@
         // Let's add something to enable us to stop the script
         $spider->getDispatcher()->addListener(
             SpiderEvents::SPIDER_CRAWL_USER_STOPPED,
-            function (Event $event) {
-                consoleOutput()->error("Crawl aborted.");
+            function () {
+                echo "Crawl aborted.";
                 exit();
             }
         );
@@ -104,8 +105,8 @@ function (Event $event) {
         $this->comment("Failed: " . count($statsHandler->getFailed()));
         $this->comment("Persisted: " . count($statsHandler->getPersisted()));
 
-        // Finally we could do some processing on the downloaded resources
-        // In this example, we will echo the title of all resources
+        // Finally, we could do some processing on the downloaded resources
+        // In this example we will echo the title of all resources
         $this->comment("\nResources:");
         $resources = [];
         foreach ($spider->getDownloader()->getPersistenceHandler() as $resource) {
@@ -116,7 +117,10 @@ function (Event $event) {
             // Check for <meta name="robots" content="noindex">
             $noindex = false;
             if ($resource->getCrawler()->filterXpath('//meta[@name="robots"]')->count() > 0) {
-                $noindex = (strpos($resource->getCrawler()->filterXpath('//meta[@name="robots"]')->attr('content'), 'noindex') !== false);
+                $noindex = (str_contains(
+                    $resource->getCrawler()->filterXpath('//meta[@name="robots"]')->attr('content'),
+                    'noindex'
+                ));
 
                 $this->info(sprintf(" - Skipping %s (on-page no-index)", $url));
             }
@@ -160,10 +164,11 @@ function (Event $event) {
     /**
      * Write the sitemap as a file.
      *
-     * @param array $resources
+     * @param array $resources
+     *
      * @return void
     **/
-    protected function writeSitemap($resources)
+    protected function writeSitemap(array $resources): void
     {
         // Prepare XML
         $urlset = new SimpleXMLElement('<?xml version="1.0" encoding="UTF-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"/>');
@@ -190,7 +195,7 @@ protected function writeSitemap($resources)
         $dom->loadXML($urlset->asXML());
         $dom->formatOutput = true;
 
-        // Write file
+        // Write a file
         try {
             file_put_contents(public_path() . '/sitemap.xml', $dom->saveXML());
         } catch (Exception $exception) {
diff --git a/src/Handlers/StatsHandler.php b/src/Handlers/StatsHandler.php
index ebdcefb..e1fd904 100644
--- a/src/Handlers/StatsHandler.php
+++ b/src/Handlers/StatsHandler.php
@@ -15,43 +15,48 @@ class StatsHandler implements EventSubscriberInterface
 {
     /** @var string */
-    protected $spiderId;
+    protected string $spiderId;
 
-    protected $persisted = array();
+    protected array $persisted = [];
 
-    protected $queued = array();
+    protected array $queued = [];
 
-    protected $filtered = array();
+    protected array $filtered = [];
 
-    protected $failed = array();
+    protected array $failed = [];
 
     public static function getSubscribedEvents(): array
     {
-        return array(
+        return [
             SpiderEvents::SPIDER_CRAWL_FILTER_POSTFETCH => 'addToFiltered',
             SpiderEvents::SPIDER_CRAWL_FILTER_PREFETCH => 'addToFiltered',
             SpiderEvents::SPIDER_CRAWL_POST_ENQUEUE => 'addToQueued',
             SpiderEvents::SPIDER_CRAWL_RESOURCE_PERSISTED => 'addToPersisted',
             SpiderEvents::SPIDER_CRAWL_ERROR_REQUEST => 'addToFailed'
-        );
+        ];
     }
 
+    private function getSpiderId(): string
+    {
+        return $this->spiderId;
+    }
+
-    public function addToQueued(GenericEvent $event)
+    public function addToQueued(GenericEvent $event): void
     {
         $this->queued[] = $event->getArgument('uri');
     }
 
-    public function addToPersisted(GenericEvent $event)
+    public function addToPersisted(GenericEvent $event): void
     {
         $this->persisted[] = $event->getArgument('uri');
     }
 
-    public function addToFiltered(GenericEvent $event)
+    public function addToFiltered(GenericEvent $event): void
     {
         $this->filtered[] = $event->getArgument('uri');
     }
 
-    public function addToFailed(GenericEvent $event)
+    public function addToFailed(GenericEvent $event): void
     {
         $this->failed[$event->getArgument('uri')->toString()] = $event->getArgument('message');
     }