From ef1372847a4c54ea74b16ea7a37aa0a8ceb6cc9b Mon Sep 17 00:00:00 2001
From: Manuel Reich
Date: Mon, 9 Jan 2023 15:34:11 +0100
Subject: [PATCH] Allow setting URL and exclude patterns as command-line flags

---
 README.md | 27 ++++++++++++++++++++++++---
 main.go   | 49 +++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/README.md b/README.md
index 8ac2dbb..e00ce37 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,30 @@
 # colly-linkcheck
 
-Linkcheker that crawls ems.press and checks for dead links.
+Linkchecker that crawls a webpage and checks for dead links.
 
-Right now the url and the ignore patterns are hardcoded.
-Maybe we change that later so that can be customized.
+The crawler checks external links, but only parses the content of pages with the same host as the provided start URL.
+
+## usage (command line)
+
+```bash
+# crawl the complete site
+go run main.go --url "https://ems.press"
+
+# exclude path patterns, e.g. all paths starting with /journals
+go run main.go --url "https://ems.press" --exclude "^\/journals.*"
+
+# exclude multiple path patterns, but re-include some sub-patterns:
+go run main.go \
+  --url "https://ems.press" \
+  \
+  --exclude "^\/journals\/.*\/articles.*" \
+  --exclude "^\/journals\/.*\/issues.*" \
+  --exclude "^\/books\/.*\/.*" \
+  \
+  --include "^\/journals\/msl\/articles.*" \
+  --include "^\/journals\/msl\/issues.*" \
+  --include "^\/books\/esiam.*"
+```
 
 ## use as github action
 
diff --git a/main.go b/main.go
index 2f67ec6..d5e3eb5 100644
--- a/main.go
+++ b/main.go
@@ -1,7 +1,9 @@
 package main
 
 import (
+	"flag"
 	"fmt"
+	"net/url"
 	"os"
 	"regexp"
 
@@ -20,20 +22,46 @@ func matchAny(urlPath string, patterns []string) bool {
 	return false
 }
 
+type flagList []string
+
+func (list *flagList) String() string {
+	return fmt.Sprint(*list)
+}
+
+func (list *flagList) Set(value string) error {
+	*list = append(*list, value)
+	return nil
+}
+
 func main() {
 	exitCode := 0
 	defer func() { os.Exit(exitCode) }()
 
+	var excludePatterns flagList
+	var includePatterns flagList
+
+	urlString := flag.String("url", "REQUIRED", "the URL to start crawling from")
+	flag.Var(&excludePatterns, "exclude", "list of regex patterns of URL paths to exclude (flag can be repeated)")
+	flag.Var(&includePatterns, "include", "list of regex patterns. This can be used to re-include a subset of URLs that were excluded via a broad `exclude` pattern")
+	flag.Parse()
+
+	startUrl, urlParseError := url.Parse(*urlString)
+	if *urlString == "REQUIRED" || urlParseError != nil {
+		fmt.Println("invalid start URL provided (", *urlString, ")")
+		exitCode = 2
+		return
+	}
+
 	collector := colly.NewCollector(colly.Async())
 	collector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 8})
 
-	// Find and visit all links on ems.press pages
+	// Find and visit all links on pages with the same host as startUrl
 	collector.OnHTML("a[href]", func(element *colly.HTMLElement) {
 		url := element.Request.URL
-		if url.Host == "ems.press" {
+		if url.Host == startUrl.Host {
 			element.Request.Ctx.Put(element.Attr("href"), url.String())
 			element.Request.Visit(element.Attr("href"))
 		}
 	})
 
@@ -44,20 +72,9 @@ func main() {
 		request.Abort()
 	}
 
-	exclude := []string{
-		"^\\/journals\\/.*\\/articles.*",
-		"^\\/journals\\/.*\\/issues.*",
-		"^\\/books\\/.*\\/.*",
-	}
-	include := []string{
-		"^\\/journals\\/msl\\/articles.*",
-		"^\\/journals\\/msl\\/issues.*",
-		"^\\/books\\/esiam.*",
-	}
-
 	urlPath := request.URL.Path
-	matchedExclude := matchAny(urlPath, exclude)
-	matchedInclude := matchAny(urlPath, include)
+	matchedExclude := matchAny(urlPath, excludePatterns)
+	matchedInclude := matchAny(urlPath, includePatterns)
 
 	if matchedExclude && !matchedInclude {
 		request.Abort()
@@ -83,6 +100,6 @@ func main() {
 	)
 	})
 
-	collector.Visit("https://ems.press/")
+	collector.Visit(startUrl.String())
 	collector.Wait()
 }
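
Note for reviewers: the precedence rule the patch wires up (a path is skipped only when it matches an exclude pattern and no include pattern) can be sanity-checked in isolation. Below is a minimal standalone sketch, not part of the patch; `matchAny`'s body is only partially visible in the diff, so the loop here is an assumed reconstruction from its signature and call sites, and the example paths are made up for illustration.

```go
package main

import (
	"fmt"
	"regexp"
)

// matchAny reports whether urlPath matches at least one of the patterns
// (assumed reconstruction of matchAny from main.go).
func matchAny(urlPath string, patterns []string) bool {
	for _, pattern := range patterns {
		if matched, _ := regexp.MatchString(pattern, urlPath); matched {
			return true
		}
	}
	return false
}

func main() {
	exclude := []string{"^\\/journals\\/.*\\/articles.*"}
	include := []string{"^\\/journals\\/msl\\/articles.*"}

	for _, path := range []string{
		"/journals/foo/articles/1", // excluded, not re-included -> skipped
		"/journals/msl/articles/2", // excluded but re-included  -> crawled
		"/about",                   // matches no pattern        -> crawled
	} {
		skipped := matchAny(path, exclude) && !matchAny(path, include)
		fmt.Println(path, "skipped:", skipped)
	}
}
```

Running the sketch prints `skipped: true` only for the first path, which matches the behavior the README's third usage example relies on.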