Extracts microdata, Crawls XML sitemaps, JSON output file
ricardoaat committed May 4, 2018
1 parent a7347cc commit 66582ac
Showing 5 changed files with 237 additions and 169 deletions.
174 changes: 28 additions & 146 deletions Main.go
@@ -1,17 +1,12 @@
package main

import (
"errors"
"flag"
"fmt"
"net/url"
"os"
"regexp"
"strings"
"time"

microdata "github.com/bioschemas/bioschemas-gocrawlit/getmicrodata"
"github.com/gocolly/colly"
"github.com/bioschemas/bioschemas-gocrawlit/crawler"
"github.com/rifflock/lfshook"
log "github.com/sirupsen/logrus"
)
@@ -50,7 +45,10 @@ func main() {

d := flag.Bool("d", false, "Sets up the log level to debug")
v := flag.Bool("v", false, "Returns the binary version and built date info")
q := flag.Bool("q", false, "Skip queries on the URL.")
u := flag.String("u", "", "Url to crawl and extract markup")
m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
p := flag.Bool("p", false, "Stay on current path.")

flag.Parse()

@@ -60,155 +58,39 @@ func main() {
log.Info(fmt.Sprintf("Version: %s Build Date: %s", version, buildDate))

if !*v {
if err := crawl(*u); err != nil {
log.Error(err)
}
//test(*u)
}
}

func test(u string) {
html := `
`

url, err := url.Parse(u)
if err != nil {
log.Error("Error parsing URL")
}
log.Info("Parsed host ", url.Host)

p := microdata.NewParser(strings.NewReader(html), url)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
}

json, err := data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
}

out := fmt.Sprintf("%s", json)

fmt.Println(out)

}

func crawl(u string) error {
log.Info("URL to crawl ", u)
log.Info("URL to crawl ", *u)

if u == "" {
log.Error("Empty URL")
return errors.New("The URL must not be empty")
}
baseURL, err := url.Parse(u)
if err != nil {
return err
}
log.Info("Parsed host ", baseURL.Host)

fn := fmt.Sprintf("%s_schema.yaml", baseURL.Host)
fout, err := os.Create(fn)
if err != nil {
log.Error("Fail to create file. Check your file path and permissions")
return err
}
defer fout.Close()

cacheDir := fmt.Sprintf(".bioschemas_gocrawlit_cache/%s_cache", baseURL.Host)

c := colly.NewCollector(

colly.AllowedDomains(baseURL.Host, fmt.Sprintf("www.%s", baseURL.Host)),
colly.MaxDepth(2),
//colly.Async(true),
// Cache responses to prevent multiple download of pages
// even if the collector is restarted
colly.CacheDir(cacheDir),

// Visit only root url and urls
colly.URLFilters(
regexp.MustCompile(u),
),
)

// Parallelism can be controlled also by spawning fixed
// number of go routines.
//c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

// Set error handler
c.OnError(func(r *colly.Response, err error) {
log.Error("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})

c.OnHTML(`script[type="application/ld+json"]`, func(e *colly.HTMLElement) {
log.Warn("Script found ", e.Request.URL)
log.Info(e.Text)
fout.WriteString(fmt.Sprintf(`%s\n%s`, e.Request.URL, e.Text))
})

c.OnHTML(`html`, func(e *colly.HTMLElement) {
child := e.DOM.Find(`[itemtype^='http://schema.org']`)

if child.Length() > 0 {
log.Warn("Found itemtype bioschemas")
html, err := e.DOM.Html()
if err != nil {
log.Error("Error getting HTML")
}

fout.WriteString(fmt.Sprintf("\n%s - itemtype %s\n", e.Request.URL, e.Attr("itemtype")))

json, err := extractMicrodata(html, baseURL)

if err != nil {
log.Error("Error calling extractMicrodata ", err)
return
}
fout.Write(json)
if *u == "" {
log.Error("Empty URL")
}

time.Sleep(1 * time.Second)
})

c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")

log.WithFields(log.Fields{
"Text": e.Text,
"Link": link,
}).Debug("Link found")

c.Visit(e.Request.AbsoluteURL(link))
})
baseURL, err := url.Parse(*u)
if err != nil {
log.Error("Error parsing URL ", err)
}

// Before making a request print "Visiting ..."
c.OnRequest(func(r *colly.Request) {
r.Headers.Add("Accept", "text/html")
log.Info("Visiting ", r.URL.String())
})
filter := ""
if *p {
filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
}

c.Visit(u)
var ad []string
ad = append(ad, baseURL.Host)
ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

return nil
}
c := crawler.Crawler{
BaseURL: baseURL,
SkipQueries: *q,
MaxDepth: *m,
AllowedDomains: ad,
Filter: filter,
}

func extractMicrodata(html string, baseURL *url.URL) ([]byte, error) {
var json []byte
c.Init()

p := microdata.NewParser(strings.NewReader(html), baseURL)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
return json, err
}
c.Start()

json, err = data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
return json, err
c.ToJSONfile()
}

return json, nil

}
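
For readability, here is a sketch of how the resulting Main.go presumably reads after this commit, assembled from the added lines in the hunks above: the old colly-based `crawl`, `test` and `extractMicrodata` functions are gone and the work is delegated to the new `crawler` package. The `crawler.Crawler` fields and methods (`BaseURL`, `SkipQueries`, `MaxDepth`, `AllowedDomains`, `Filter`, `Init`, `Start`, `ToJSONfile`) are taken from the diff; the debug-level switch on `-d`, the `version`/`buildDate` variables and the omission of the `lfshook` log-file hook are assumptions, since those parts fall outside the hunks shown here.

```go
// Hypothetical reconstruction of Main.go after this commit (sketch, not the
// committed file). Parts not visible in the hunks above are assumptions.
package main

import (
	"flag"
	"fmt"
	"net/url"

	"github.com/bioschemas/bioschemas-gocrawlit/crawler"
	log "github.com/sirupsen/logrus"
)

// Assumed package-level build metadata, set elsewhere (e.g. via ldflags).
var (
	version   string
	buildDate string
)

func main() {
	d := flag.Bool("d", false, "Sets up the log level to debug")
	v := flag.Bool("v", false, "Returns the binary version and built date info")
	q := flag.Bool("q", false, "Skip queries on the URL.")
	u := flag.String("u", "", "Url to crawl and extract markup")
	m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
	p := flag.Bool("p", false, "Stay on current path.")

	flag.Parse()

	// Assumed handling of -d; the actual log setup is not shown in this diff.
	if *d {
		log.SetLevel(log.DebugLevel)
	}

	log.Info(fmt.Sprintf("Version: %s Build Date: %s", version, buildDate))

	if !*v {
		log.Info("URL to crawl ", *u)

		if *u == "" {
			log.Error("Empty URL")
		}

		baseURL, err := url.Parse(*u)
		if err != nil {
			log.Error("Error parsing URL ", err)
		}

		// With -p, restrict the crawl to the starting scheme://host/path prefix.
		filter := ""
		if *p {
			filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
		}

		// Allow the bare host and its www. variant.
		var ad []string
		ad = append(ad, baseURL.Host)
		ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

		c := crawler.Crawler{
			BaseURL:        baseURL,
			SkipQueries:    *q,
			MaxDepth:       *m,
			AllowedDomains: ad,
			Filter:         filter,
		}

		c.Init()
		c.Start()
		c.ToJSONfile()
	}
}
```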
42 changes: 27 additions & 15 deletions README.md
@@ -1,28 +1,31 @@
# BIOSCHEMAS.ORG GO CRAWL IT!

Minimal web crawler. Extracts microdata and JSON-LD metadata with schema.org.
Minimal crawler and extractor of microdata and JSON-LD metadata.

### ToDo

- [x] Crawl website
- [x] URL by command line parameters
- [x] Extracts JSON-LD
- [x] Extracts microdata
- [ ] JSON-LD schema.org check
- [ ] Better file output

## How to use it:

Run it like this:
Usage examples:
```
./bioschemas-gocrawlit -u "https://identifiers.org"
./bioschemas-gocrawlit -p -u "https://www.ebi.ac.uk/biosamples/samples"
./bioschemas-gocrawlit -q -u https://tess.elixir-europe.org/sitemaps/events.xml
./bioschemas-gocrawlit -u http://159.149.160.88/pscan_chip_dev/
```

## Help
A folder "bioschemas_gocrawlit_cache" will be created in the current working directory; it caches crawled pages so they are not downloaded more than once. It is safe to delete this folder.

### Output

Scraped data will be stored in a JSON file named ```<website_host>_schema.json``` in the current working directory.
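
For illustration, a minimal sketch of how that file name relates to the host of the start URL, using the standard `net/url` parser; whether the crawler keeps a leading `www.` in the host is not shown in this commit:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// The README states the output file is named "<website_host>_schema.json".
	// Hypothetical start URL; the real host handling lives in the crawler package.
	u, err := url.Parse("https://www.ebi.ac.uk/biosamples/samples")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s_schema.json\n", u.Host) // www.ebi.ac.uk_schema.json
}
```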

Use the -h parameter to get information about the available options.

```./bioschemas-gocrawlit -h```
### Available commands

- **-p**: Stay on the current path, i.e. when crawling a page such as ```https://www.ebi.ac.uk/biosamples/samples```, restrict the crawl to URLs under that path instead of the whole website (e.g. ```https://www.ebi.ac.uk```); see the sketch after this list.
- **-m**: Maximum recursion depth of visited URLs. By default the recursion depth is unlimited. (The crawler does not revisit URLs.)
- **-u**: URL to start crawling from.
- **-q**: Remove the query section from the URLs found.
- **-h**: Print help and exit.
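
As referenced in the `-p` entry above, a minimal sketch of how that path restriction presumably behaves, based on the filter expression built in Main.go from the start URL's scheme, host and path (the dots are left unescaped there as well; how the `crawler` package applies the filter is not shown in this commit):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// With -p -u "https://www.ebi.ac.uk/biosamples/samples", Main.go builds a
	// prefix filter anchored at the start of the URL: ^scheme://host/path.
	filter := regexp.MustCompile(`^https://www.ebi.ac.uk/biosamples/samples`)

	// URLs under the starting path still match and would be crawled.
	fmt.Println(filter.MatchString("https://www.ebi.ac.uk/biosamples/samples/some-sample")) // true
	// URLs elsewhere on the site do not match and would be skipped.
	fmt.Println(filter.MatchString("https://www.ebi.ac.uk/ols/ontologies")) // false
}
```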


## Building binaries
@@ -33,6 +36,15 @@ To create a binary for your current OS use:
To create binaries for Windows, macOS and Linux, use:
```make build-all```

The binaries would be found on build/ path.
The binaries will be placed under the build/ directory.


## ToDo

- [x] Crawl website
- [x] URL by command line parameters
- [x] Extracts JSON-LD
- [x] Extracts microdata
- [x] Better file output
- [x] Sitemap.xml Crawl option

