Extracts microdata, Crawls XML sitemaps, JSON output file
ricardoaat committed May 4, 2018
1 parent a7347cc commit 66582ac
Showing 5 changed files with 237 additions and 169 deletions.
174 changes: 28 additions & 146 deletions Main.go
@@ -1,17 +1,12 @@
package main

import (
"errors"
"flag"
"fmt"
"net/url"
"os"
"regexp"
"strings"
"time"

microdata "github.com/bioschemas/bioschemas-gocrawlit/getmicrodata"
"github.com/gocolly/colly"
"github.com/bioschemas/bioschemas-gocrawlit/crawler"
"github.com/rifflock/lfshook"
log "github.com/sirupsen/logrus"
)
@@ -50,7 +45,10 @@ func main() {

d := flag.Bool("d", false, "Sets up the log level to debug")
v := flag.Bool("v", false, "Returns the binary version and built date info")
q := flag.Bool("q", false, "Skip queries on the URL.")
u := flag.String("u", "", "Url to crawl and extract markup")
m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
p := flag.Bool("p", false, "Stay on current path.")

flag.Parse()

@@ -60,155 +58,39 @@ func main() {
log.Info(fmt.Sprintf("Version: %s Build Date: %s", version, buildDate))

if !*v {
if err := crawl(*u); err != nil {
log.Error(err)
}
//test(*u)
}
}

func test(u string) {
html := `
`

url, err := url.Parse(u)
if err != nil {
log.Error("Error parsing URL")
}
log.Info("Parsed host ", url.Host)

p := microdata.NewParser(strings.NewReader(html), url)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
}

json, err := data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
}

out := fmt.Sprintf("%s", json)

fmt.Println(out)

}

func crawl(u string) error {
log.Info("URL to crawl ", u)
log.Info("URL to crawl ", *u)

if u == "" {
log.Error("Empty URL")
return errors.New("The URL must not be empty")
}
baseURL, err := url.Parse(u)
if err != nil {
return err
}
log.Info("Parsed host ", baseURL.Host)

fn := fmt.Sprintf("%s_schema.yaml", baseURL.Host)
fout, err := os.Create(fn)
if err != nil {
log.Error("Fail to create file. Check your file path and permissions")
return err
}
defer fout.Close()

cacheDir := fmt.Sprintf(".bioschemas_gocrawlit_cache/%s_cache", baseURL.Host)

c := colly.NewCollector(

colly.AllowedDomains(baseURL.Host, fmt.Sprintf("www.%s", baseURL.Host)),
colly.MaxDepth(2),
//colly.Async(true),
// Cache responses to prevent multiple download of pages
// even if the collector is restarted
colly.CacheDir(cacheDir),

// Visit only root url and urls
colly.URLFilters(
regexp.MustCompile(u),
),
)

// Parallelism can be controlled also by spawning fixed
// number of go routines.
//c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

// Set error handler
c.OnError(func(r *colly.Response, err error) {
log.Error("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
})

c.OnHTML(`script[type="application/ld+json"]`, func(e *colly.HTMLElement) {
log.Warn("Script found ", e.Request.URL)
log.Info(e.Text)
fout.WriteString(fmt.Sprintf(`%s\n%s`, e.Request.URL, e.Text))
})

c.OnHTML(`html`, func(e *colly.HTMLElement) {
child := e.DOM.Find(`[itemtype^='http://schema.org']`)

if child.Length() > 0 {
log.Warn("Found itemtype bioschemas")
html, err := e.DOM.Html()
if err != nil {
log.Error("Error getting HTML")
}

fout.WriteString(fmt.Sprintf("\n%s - itemtype %s\n", e.Request.URL, e.Attr("itemtype")))

json, err := extractMicrodata(html, baseURL)

if err != nil {
log.Error("Error calling extractMicrodata ", err)
return
}
fout.Write(json)
if *u == "" {
log.Error("Empty URL")
}

time.Sleep(1 * time.Second)
})

c.OnHTML("a[href]", func(e *colly.HTMLElement) {
link := e.Attr("href")

log.WithFields(log.Fields{
"Text": e.Text,
"Link": link,
}).Debug("Link found")

c.Visit(e.Request.AbsoluteURL(link))
})
baseURL, err := url.Parse(*u)
if err != nil {
log.Error("Error parsing URL ", err)
}

// Before making a request print "Visiting ..."
c.OnRequest(func(r *colly.Request) {
r.Headers.Add("Accept", "text/html")
log.Info("Visiting ", r.URL.String())
})
filter := ""
if *p {
filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
}

c.Visit(u)
var ad []string
ad = append(ad, baseURL.Host)
ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

return nil
}
c := crawler.Crawler{
BaseURL: baseURL,
SkipQueries: *q,
MaxDepth: *m,
AllowedDomains: ad,
Filter: filter,
}

func extractMicrodata(html string, baseURL *url.URL) ([]byte, error) {
var json []byte
c.Init()

p := microdata.NewParser(strings.NewReader(html), baseURL)
data, err := p.Parse()
if err != nil {
log.Error("Error parsing microdata from HTML ", html)
return json, err
}
c.Start()

json, err = data.JSON()
if err != nil {
log.Error("Error getting JSON from microdata HTML ")
return json, err
c.ToJSONfile()
}

return json, nil

}
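
For readability, here is a sketch of how the resulting Main.go presumably reads after this commit, assembled from the added lines in the hunks above: the old colly-based `crawl`, `test` and `extractMicrodata` functions are gone and the work is delegated to the new `crawler` package. The `crawler.Crawler` fields and methods (`BaseURL`, `SkipQueries`, `MaxDepth`, `AllowedDomains`, `Filter`, `Init`, `Start`, `ToJSONfile`) are taken from the diff; the debug-level switch on `-d`, the `version`/`buildDate` variables and the omission of the `lfshook` log-file hook are assumptions, since those parts fall outside the hunks shown here.

```go
// Hypothetical reconstruction of Main.go after this commit (sketch, not the
// committed file). Parts not visible in the hunks above are assumptions.
package main

import (
	"flag"
	"fmt"
	"net/url"

	"github.com/bioschemas/bioschemas-gocrawlit/crawler"
	log "github.com/sirupsen/logrus"
)

// Assumed package-level build metadata, set elsewhere (e.g. via ldflags).
var (
	version   string
	buildDate string
)

func main() {
	d := flag.Bool("d", false, "Sets up the log level to debug")
	v := flag.Bool("v", false, "Returns the binary version and built date info")
	q := flag.Bool("q", false, "Skip queries on the URL.")
	u := flag.String("u", "", "Url to crawl and extract markup")
	m := flag.Int("maxdepth", 0, "Max number of recursion depth of visited URLs")
	p := flag.Bool("p", false, "Stay on current path.")

	flag.Parse()

	// Assumed handling of -d; the actual log setup is not shown in this diff.
	if *d {
		log.SetLevel(log.DebugLevel)
	}

	log.Info(fmt.Sprintf("Version: %s Build Date: %s", version, buildDate))

	if !*v {
		log.Info("URL to crawl ", *u)

		if *u == "" {
			log.Error("Empty URL")
		}

		baseURL, err := url.Parse(*u)
		if err != nil {
			log.Error("Error parsing URL ", err)
		}

		// With -p, restrict the crawl to the starting scheme://host/path prefix.
		filter := ""
		if *p {
			filter = fmt.Sprintf(`^%s://%s%s`, baseURL.Scheme, baseURL.Host, baseURL.Path)
		}

		// Allow the bare host and its www. variant.
		var ad []string
		ad = append(ad, baseURL.Host)
		ad = append(ad, fmt.Sprintf("www.%s", baseURL.Host))

		c := crawler.Crawler{
			BaseURL:        baseURL,
			SkipQueries:    *q,
			MaxDepth:       *m,
			AllowedDomains: ad,
			Filter:         filter,
		}

		c.Init()
		c.Start()
		c.ToJSONfile()
	}
}
```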
42 changes: 27 additions & 15 deletions README.md
@@ -1,28 +1,31 @@
# BIOSCHEMAS.ORG GO CRAWL IT!

Minimal web crawler. Extracts microdata and JSON-LD metadata with schema.org.
Minimal crawler and extractor of microdata and JSON-LD metadata.

### ToDo

- [x] Crawl website
- [x] URL by command line parameters
- [x] Extracts JSON-LD
- [x] Extracts microdata
- [ ] JSON-LD schema.org check
- [ ] Better file output

## How to use it:

Run it like this:
Usage examples:
```
./bioschemas-gocrawlit -u "https://identifiers.org"
./bioschemas-gocrawlit -p -u "https://www.ebi.ac.uk/biosamples/samples"
./bioschemas-gocrawlit -q -u https://tess.elixir-europe.org/sitemaps/events.xml
./bioschemas-gocrawlit -u http://159.149.160.88/pscan_chip_dev/
```

## Help
A folder "bioschemas_gocrawlit_cache" will be created in the current working directory; it caches crawled pages so they are not downloaded more than once. It is safe to delete this folder.

### Output

Scraped data will be stored in a JSON file named ```<website_host>_schema.json``` in the current working directory.
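
For illustration, a minimal sketch of how that file name relates to the host of the start URL, using the standard `net/url` parser; whether the crawler keeps a leading `www.` in the host is not shown in this commit:

```go
package main

import (
	"fmt"
	"net/url"
)

func main() {
	// The README states the output file is named "<website_host>_schema.json".
	// Hypothetical start URL; the real host handling lives in the crawler package.
	u, err := url.Parse("https://www.ebi.ac.uk/biosamples/samples")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s_schema.json\n", u.Host) // www.ebi.ac.uk_schema.json
}
```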

Use the -h parameter to get information about the available options.

```./bioschemas-gocrawlit -h```
### Available commands

- **-p**: Stay on the current path, i.e. when crawling a page such as ```https://www.ebi.ac.uk/biosamples/samples```, restrict the crawl to URLs under that path instead of the whole website (e.g. ```https://www.ebi.ac.uk```); see the sketch after this list.
- **-m**: Maximum recursion depth of visited URLs. By default the recursion depth is unlimited. (The crawler does not revisit URLs.)
- **-u**: URL to start crawling from.
- **-q**: Remove the query section from the URLs found.
- **-h**: Print help and exit.
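
As referenced in the `-p` entry above, a minimal sketch of how that path restriction presumably behaves, based on the filter expression built in Main.go from the start URL's scheme, host and path (the dots are left unescaped there as well; how the `crawler` package applies the filter is not shown in this commit):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// With -p -u "https://www.ebi.ac.uk/biosamples/samples", Main.go builds a
	// prefix filter anchored at the start of the URL: ^scheme://host/path.
	filter := regexp.MustCompile(`^https://www.ebi.ac.uk/biosamples/samples`)

	// URLs under the starting path still match and would be crawled.
	fmt.Println(filter.MatchString("https://www.ebi.ac.uk/biosamples/samples/some-sample")) // true
	// URLs elsewhere on the site do not match and would be skipped.
	fmt.Println(filter.MatchString("https://www.ebi.ac.uk/ols/ontologies")) // false
}
```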


## Building binaries
@@ -33,6 +36,15 @@ To create a binary for your current OS use:
To create binaries for Windows, macOS and Linux, use:
```make build-all```

The binaries would be found on build/ path.
The binaries will be placed under the build/ directory.


## ToDo

- [x] Crawl website
- [x] URL by command line parameters
- [x] Extracts JSON-LD
- [x] Extracts microdata
- [x] Better file output
- [x] Sitemap.xml Crawl option

