From ef1372847a4c54ea74b16ea7a37aa0a8ceb6cc9b Mon Sep 17 00:00:00 2001
From: Manuel Reich
Date: Mon, 9 Jan 2023 15:34:11 +0100
Subject: [PATCH] Allow setting URL and exclude patterns as command-line flags

---
 README.md | 27 ++++++++++++++++++++++++---
 main.go   | 49 +++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 8ac2dbb..e00ce37 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,30 @@
 # colly-linkcheck
 
-Linkcheker that crawls ems.press and checks for dead links.
+Linkchecker that crawls a webpage and checks for dead links.
 
-Right now the url and the ignore patterns are hardcoded.
-Maybe we change that later so that can be customized.
+The crawler checks external links, but only parses the content of pages with the same host as the provided start URL.
+
+## usage (command line)
+
+```bash
+# crawl the complete site
+go run main.go --url "https://ems.press"
+
+# exclude path patterns, e.g. all paths starting with /journals
+go run main.go --url "https://ems.press" --exclude "^\/journals.*"
+
+# exclude multiple path patterns, but re-include some sub-patterns:
+go run main.go \
+  --url "https://ems.press" \
+  \
+  --exclude "^\/journals\/.*\/articles.*" \
+  --exclude "^\/journals\/.*\/issues.*" \
+  --exclude "^\/books\/.*\/.*" \
+  \
+  --include "^\/journals\/msl\/articles.*" \
+  --include "^\/journals\/msl\/issues.*" \
+  --include "^\/books\/esiam.*"
+```
 
 ## use as github action
 
diff --git a/main.go b/main.go
index 2f67ec6..d5e3eb5 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,9 @@
 package main
 
 import (
+	"flag"
 	"fmt"
+	"net/url"
 	"os"
 	"regexp"
 
@@ -20,20 +22,46 @@ func matchAny(urlPath string, patterns []string) bool {
 	return false
 }
 
+type flagList []string
+
+func (list *flagList) String() string {
+	return fmt.Sprint(*list)
+}
+
+func (list *flagList) Set(value string) error {
+	*list = append(*list, value)
+	return nil
+}
+
 func main() {
 	exitCode := 0
 	defer func() { os.Exit(exitCode) }()
 
+	var excludePatterns flagList
+	var includePatterns flagList
+
+	urlString := flag.String("url", "REQUIRED", "the URL to start crawling from")
+	flag.Var(&excludePatterns, "exclude", "list of regex patterns of URL paths to exclude (flag can be repeated)")
+	flag.Var(&includePatterns, "include", "list of regex patterns. This can be used to re-include a subset of URLs that were excluded via a broad `exclude` pattern")
+	flag.Parse()
+
+	startUrl, urlParseError := url.Parse(*urlString)
+	if *urlString == "REQUIRED" || urlParseError != nil {
+		fmt.Println("invalid start URL provided (", *urlString, ")")
+		exitCode = 2
+		return
+	}
+
 	collector := colly.NewCollector(colly.Async())
 	collector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8})
 
-	// Find and visit all links on ems.press pages
+	// Find and visit all links on pages with the same host as startUrl
 	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
 		url := element.Request.URL
-		if url.Host == "ems.press" {
+		if url.Host == startUrl.Host {
 			element.Request.Ctx.Put(element.Attr("href"), url.String())
 			element.Request.Visit(element.Attr("href"))
 		}
 	})
 
@@ -44,20 +72,9 @@ func main() {
 		request.Abort()
 	}
 
-	exclude := []string{
-		"^\\/journals\\/.*\\/articles.*",
-		"^\\/journals\\/.*\\/issues.*",
-		"^\\/books\\/.*\\/.*",
-	}
-	include := []string{
-		"^\\/journals\\/msl\\/articles.*",
-		"^\\/journals\\/msl\\/issues.*",
-		"^\\/books\\/esiam.*",
-	}
-
 	urlPath := request.URL.Path
-	matchedExclude := matchAny(urlPath, exclude)
-	matchedInclude := matchAny(urlPath, include)
+	matchedExclude := matchAny(urlPath, excludePatterns)
+	matchedInclude := matchAny(urlPath, includePatterns)
 
 	if matchedExclude && !matchedInclude {
 		request.Abort()
@@ -83,6 +100,6 @@ func main() {
 	)
 	})
 
-	collector.Visit("https://ems.press/")
+	collector.Visit(startUrl.String())
 	collector.Wait()
 }
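
Note for reviewers: the precedence rule the patch wires up (a path is skipped only when it matches an exclude pattern and no include pattern) can be sanity-checked in isolation. Below is a minimal standalone sketch, not part of the patch; `matchAny`'s body is only partially visible in the diff, so the loop here is an assumed reconstruction from its signature and call sites, and the example paths are made up for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

// matchAny reports whether urlPath matches at least one of the patterns
// (assumed reconstruction of matchAny from main.go).
func matchAny(urlPath string, patterns []string) bool {
	for _, pattern := range patterns {
		if matched, _ := regexp.MatchString(pattern, urlPath); matched {
			return true
		}
	}
	return false
}

func main() {
	exclude := []string{"^\\/journals\\/.*\\/articles.*"}
	include := []string{"^\\/journals\\/msl\\/articles.*"}

	for _, path := range []string{
		"/journals/foo/articles/1", // excluded, not re-included -> skipped
		"/journals/msl/articles/2", // excluded but re-included  -> crawled
		"/about",                   // matches no pattern        -> crawled
	} {
		skipped := matchAny(path, exclude) && !matchAny(path, include)
		fmt.Println(path, "skipped:", skipped)
	}
}
```

Running the sketch prints `skipped: true` only for the first path, which matches the behavior the README's third usage example relies on.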