Skip to content

Commit

Permalink
AcceptContentTypes per source, Fix isValidJson issue with arrays (gle…
Browse files Browse the repository at this point in the history
…anerio#206)

* accept content types gleanerio#192.
Prep for json profiles gleanerio#163



* accept content types gleanerio#192.
Prep for json profiles gleanerio#163
gleanerio#128. For Validation just unmarshal an interface
  • Loading branch information
valentinedwv authored Aug 31, 2023
1 parent 6a4d814 commit 256a703
Show file tree
Hide file tree
Showing 7 changed files with 217 additions and 34 deletions.
45 changes: 45 additions & 0 deletions configs/gleaner_opencore
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
context:
cache: true
contextmaps:
- file: ./assets/schemaorg-current-https.jsonld
prefix: https://schema.org/
- file: ./assets/schemaorg-current-http.jsonld
prefix: http://schema.org/
gleaner:
mill: true
runid: runX
summon: true
millers:
graph: true
minio:
address: oss.geocodes-dev.earthcube.org
port: 443
ssl: true
accesskey: worldsbestaccesskey
secretkey: worldsbestsecretkey
bucket: opencore
sources:
- sourcetype: sitemap
name: opencoredata
logo: https://opencoredata.org/img/logo22small.png
url: http://opencoredata.org/sitemap.xml
headless: false
pid: https://www.re3data.org/repository/r3d100012874
propername: opencoredata
domain: https://opencoredata.org/
active: true
credentialsfile: ""
other: {}
headlesswait: 0
delay: 1
identifierpath: ""
apipagelimit: 0
identifiertype: identifiersha
fixcontextoption: 0
acceptcontenttype: "application/ld+json"
summoner:
after: ""
delay: null
headless: http://127.0.0.1:9222
mode: full
threads: 15
58 changes: 44 additions & 14 deletions internal/summoner/acquire/acquire.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,17 @@ func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, runSt
wg.Wait()
}

func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, error) {
func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, string, string, error) {
bucketName, err := configTypes.GetBucketName(v1)
if err != nil {
return bucketName, 0, 0, 0, err
return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err
}

var mcfg configTypes.Summoner
mcfg, err = configTypes.ReadSummmonerConfig(v1.Sub("summoner"))

if err != nil {
return bucketName, 0, 0, 0, err
return bucketName, 0, 0, 0, configTypes.AccceptContentType, "", err
}
// Set default thread counts and global delay
tc := mcfg.Threads
Expand All @@ -74,9 +74,11 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, err
// look for a domain specific override crawl delay
sources, err := configTypes.GetSources(v1)
source, err := configTypes.GetSourceByName(sources, sourceName)
acceptContent := source.AcceptContentType
jsonProfile := source.JsonProfile
hw := source.HeadlessWait
if err != nil {
return bucketName, tc, delay, hw, err
return bucketName, tc, delay, hw, acceptContent, jsonProfile, err
}

if source.Delay != 0 && source.Delay > delay {
Expand All @@ -85,13 +87,14 @@ func getConfig(v1 *viper.Viper, sourceName string) (string, int, int64, int, err
log.Info("Crawl delay set to ", delay, " for ", sourceName)
}
log.Info("Thread count ", tc, " delay ", delay)
return bucketName, tc, delay, hw, nil

return bucketName, tc, delay, hw, acceptContent, jsonProfile, nil
}

func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName string,
wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, headlessWait, err := getConfig(v1, sourceName)
bucketName, tc, delay, headlessWait, acceptContent, jsonProfile, err := getConfig(v1, sourceName)
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down Expand Up @@ -133,7 +136,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
log.Error(i, err, urlloc)
}
req.Header.Set("User-Agent", EarthCubeAgent)
req.Header.Set("Accept", "application/ld+json, text/html")
req.Header.Set("Accept", acceptContent)

resp, err := client.Do(req)
if err != nil {
Expand All @@ -145,8 +148,31 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
}
defer resp.Body.Close()

jsonlds, err := FindJSONInResponse(v1, urlloc, repologger, resp)

jsonlds, err := FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp)
// there was an issue with sitemaps... but now this code
//if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") {
//
// b, err := io.ReadAll(resp.Body)
// // b, err := ioutil.ReadAll(resp.Body) Go.1.15 and earlier
// if err != nil {
// log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
// repoStats.Inc(common.Issues)
// lwg.Done() // tell the wait group that we be done
// <-semaphoreChan
// return
// }
// jsonlds = []string{string(b)}
//} else {
// var err error
// jsonlds, err = FindJSONInResponse(v1, urlloc, jsonProfile, repologger, resp)
// if err != nil {
// log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
// repoStats.Inc(common.Issues)
// lwg.Done() // tell the wait group that we be done
// <-semaphoreChan
// return
// }
//}
if err != nil {
log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
repoStats.Inc(common.Issues)
Expand Down Expand Up @@ -194,7 +220,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
common.RunRepoStatsOutput(repoStats, sourceName)
}

func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger, response *http.Response) ([]string, error) {
func FindJSONInResponse(v1 *viper.Viper, urlloc string, jsonProfile string, repologger *log.Logger, response *http.Response) ([]string, error) {
doc, err := goquery.NewDocumentFromResponse(response)
if err != nil {
return nil, err
Expand All @@ -206,19 +232,23 @@ func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger,
// if the URL is sending back JSON-LD correctly as application/ld+json
// this should not be here IMHO, but need to support people not setting proper header value
// The URL is sending back JSON-LD but incorrectly sending as application/json
// would like to add contains(contentTypeHeader, jsonProfile) to this check,
// but an empty profile string would match every content type
if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) {
logFields := log.Fields{"url": urlloc, "contentType": "json or ld_json"}
repologger.WithFields(logFields).Debug()
log.WithFields(logFields).Debug(urlloc, " as ", contentTypeHeader)

jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text())
resp_text := doc.Text()
jsonlds, err = addToJsonListIfValid(v1, jsonlds, resp_text)
if err != nil {
log.WithFields(logFields).Error("Error processing json response from ", urlloc, err)
repologger.WithFields(logFields).Error(err)
}
// look in the HTML response for <script type=application/ld+json>
// look in the HTML response for <script type=application/ld+json> ^
} else {
doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
//doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
//Please note that Cascadia's selectors do not necessarily match all supported selectors of jQuery (Sizzle). https://github.com/andybalholm/cascadia
doc.Find("script[type^='application/ld+json']").Each(func(i int, s *goquery.Selection) {
jsonlds, err = addToJsonListIfValid(v1, jsonlds, s.Text())
logFields := log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}
repologger.WithFields(logFields).Info()
Expand Down
22 changes: 11 additions & 11 deletions internal/summoner/acquire/acquire_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ func TestGetConfig(t *testing.T) {
}

viper := ConfigSetupHelper(conf)
bucketName, tc, delay, err := getConfig(viper, "testSource")
bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource")
assert.Equal(t, "test", bucketName)
assert.Equal(t, 5, tc)
assert.Equal(t, int64(0), delay)
Expand All @@ -43,7 +43,7 @@ func TestGetConfig(t *testing.T) {
}

viper := ConfigSetupHelper(conf)
bucketName, tc, delay, err := getConfig(viper, "testSource")
bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource")
assert.Equal(t, "test", bucketName)
assert.Equal(t, 1, tc)
assert.Equal(t, int64(1000), delay)
Expand All @@ -58,7 +58,7 @@ func TestGetConfig(t *testing.T) {
}

viper := ConfigSetupHelper(conf)
bucketName, tc, delay, err := getConfig(viper, "testSource")
bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource")
assert.Equal(t, "test", bucketName)
assert.Equal(t, 5, tc)
assert.Equal(t, int64(0), delay)
Expand All @@ -73,7 +73,7 @@ func TestGetConfig(t *testing.T) {
}

viper := ConfigSetupHelper(conf)
bucketName, tc, delay, err := getConfig(viper, "testSource")
bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource")
assert.Equal(t, "test", bucketName)
assert.Equal(t, 1, tc)
assert.Equal(t, int64(100), delay)
Expand All @@ -88,7 +88,7 @@ func TestGetConfig(t *testing.T) {
}

viper := ConfigSetupHelper(conf)
bucketName, tc, delay, err := getConfig(viper, "testSource")
bucketName, tc, delay, _, _, _, err := getConfig(viper, "testSource")
assert.Equal(t, "test", bucketName)
assert.Equal(t, 1, tc)
assert.Equal(t, int64(50), delay)
Expand All @@ -102,7 +102,7 @@ func TestFindJSONInResponse(t *testing.T) {
}
viper := ConfigSetupHelper(conf)
logger := log.New()

const JSONContentType = "application/ld+json"
testJson := `{
"@graph":[
{
Expand All @@ -128,7 +128,7 @@ func TestFindJSONInResponse(t *testing.T) {
}

t.Run("It returns an error if the response document cannot be parsed", func(t *testing.T) {
result, err := FindJSONInResponse(viper, urlloc, logger, nil)
result, err := FindJSONInResponse(viper, urlloc, JSONContentType, logger, nil)
assert.Nil(t, result)
assert.Equal(t, errors.New("Response is nil"), err)
})
Expand All @@ -140,7 +140,7 @@ func TestFindJSONInResponse(t *testing.T) {
response.ContentLength = int64(len(html))
var expected []string

result, err := FindJSONInResponse(viper, urlloc, logger, response)
result, err := FindJSONInResponse(viper, urlloc, JSONContentType, logger, response)
assert.Nil(t, err)
assert.Equal(t, result, append(expected, testJson))
})
Expand All @@ -150,7 +150,7 @@ func TestFindJSONInResponse(t *testing.T) {
response.ContentLength = int64(len(testJson))
var expected []string

result, err := FindJSONInResponse(viper, "test.json", logger, response)
result, err := FindJSONInResponse(viper, "test.json", JSONContentType, logger, response)
assert.Nil(t, err)
assert.Equal(t, result, append(expected, testJson))
})
Expand All @@ -161,7 +161,7 @@ func TestFindJSONInResponse(t *testing.T) {
response.Header.Add("Content-Type", JSONContentType)
var expected []string

result, err := FindJSONInResponse(viper, urlloc, logger, response)
result, err := FindJSONInResponse(viper, urlloc, JSONContentType, logger, response)
assert.Nil(t, err)
assert.Equal(t, result, append(expected, testJson))
})
Expand All @@ -172,7 +172,7 @@ func TestFindJSONInResponse(t *testing.T) {
response.Header.Add("Content-Type", "application/json; charset=utf-8")
var expected []string

result, err := FindJSONInResponse(viper, urlloc, logger, response)
result, err := FindJSONInResponse(viper, urlloc, JSONContentType, logger, response)
assert.Nil(t, err)
assert.Equal(t, result, append(expected, testJson))
})
Expand Down
8 changes: 4 additions & 4 deletions internal/summoner/acquire/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ func RetrieveAPIData(apiSources []configTypes.Sources, mc *minio.Client, runStat

func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources, wg *sync.WaitGroup, repologger *log.Logger, repoStats *common.RepoStats) {

bucketName, tc, delay, _, err := getConfig(v1, source.Name) // _ is headless wait
bucketName, tc, delay, _, acceptContent, jsonProfile, err := getConfig(v1, source.Name) // _ is headless wait
if err != nil {
// trying to read a source, so let's not kill everything with a panic/fatal
log.Error("Error reading config file ", err)
Expand Down Expand Up @@ -92,8 +92,8 @@ func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources,
log.Error(i, err, urlloc)
}
req.Header.Set("User-Agent", EarthCubeAgent)
req.Header.Set("Accept", "application/ld+json, text/html")

//req.Header.Set("Accept", "application/ld+json, text/html")
req.Header.Set("Accept", acceptContent)
response, err := client.Do(req)

if err != nil {
Expand All @@ -116,7 +116,7 @@ func getAPISource(v1 *viper.Viper, mc *minio.Client, source configTypes.Sources,
log.Trace("Response status ", response.StatusCode, " from ", urlloc)
responseStatusChan <- response.StatusCode

jsonlds, err := FindJSONInResponse(v1, urlloc, repologger, response)
jsonlds, err := FindJSONInResponse(v1, urlloc, jsonProfile, repologger, response)

if err != nil {
log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
Expand Down
2 changes: 1 addition & 1 deletion internal/summoner/acquire/headlessNG.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologge
expressionTmpl := `
function getMetadata() {
return new Promise((resolve, reject) => {
const elements = document.querySelectorAll('script[type="application/ld+json"]');
const elements = document.querySelectorAll('script[type^="application/ld+json"]');
let metadata = [];
elements.forEach(function(element) {
if(element && element.innerText) {
Expand Down
42 changes: 38 additions & 4 deletions internal/summoner/acquire/jsonutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,59 @@ import (
// addToJsonListIfValid keeps the running list of JSON-LD documents found in
// or on a page.
//
// new_json is validated with isValid. If it validates, it is appended to
// jsonlds. If validation returns an error, new_json is retried as a top-level
// JSON array via isGraphArray, and every element that validates is appended
// individually.
//
// The returned slice always contains the documents that were already in
// jsonlds, including when an error is returned, so one bad document on a
// page does not discard the ones collected before it.
func addToJsonListIfValid(v1 *viper.Viper, jsonlds []string, new_json string) ([]string, error) {
	valid, err := isValid(v1, new_json)
	if err != nil {
		// Bind the array results to fresh names: reusing jsonlds with := would
		// shadow the parameter and drop everything accumulated so far.
		// The per-element error from isGraphArray is intentionally ignored;
		// the original validation error is the one reported on failure.
		isArray, elements, _ := isGraphArray(v1, new_json)
		if isArray {
			return append(jsonlds, elements...), nil
		}
		return jsonlds, fmt.Errorf("error checking for valid json: %w", err)
	}
	if !valid {
		return jsonlds, fmt.Errorf("invalid json; continuing")
	}
	return append(jsonlds, new_json), nil
}

// isGraphArray reports whether jsonld is a top-level JSON array that holds at
// least one object accepted by isValid.
//
// Each array element is re-marshaled and validated on its own. The returned
// slice holds the re-marshaled text of every element that validated; it is
// empty (never nil) when none did. The returned error is whatever isValid
// reported for the last element that did not validate, or nil if every
// element validated. Input that does not decode as an array of objects
// yields (false, empty slice, nil).
func isGraphArray(v1 *viper.Viper, jsonld string) (bool, []string, error) {
	jsonlds := []string{}

	// Decoding straight into a slice of objects already rejects both
	// non-array input and arrays of scalars, so a separate []interface{}
	// probe beforehand adds nothing.
	var docs []map[string]interface{}
	if err := json.Unmarshal([]byte(jsonld), &docs); err != nil {
		return false, jsonlds, nil
	}

	var errs error
	for _, doc := range docs {
		// The Marshal error is ignored: doc was just produced by Unmarshal.
		encoded, _ := json.Marshal(doc)
		valid, err := isValid(v1, string(encoded))
		if valid && err == nil {
			jsonlds = append(jsonlds, string(encoded))
		} else {
			errs = err
		}
	}
	return len(jsonlds) > 0, jsonlds, errs
}

// / Validate JSON-LD that we get
func isValid(v1 *viper.Viper, jsonld string) (bool, error) {
proc, options := common.JLDProc(v1)

var myInterface map[string]interface{}

err := json.Unmarshal([]byte(jsonld), &myInterface)
if err != nil {
return false, fmt.Errorf("Error in unmarshaling json: %s", err)
if err != nil {
return false, fmt.Errorf("Error in unmarshaling json: %s", err)
}
}

_, err = proc.ToRDF(myInterface, options) // returns triples but toss them, just validating
Expand Down Expand Up @@ -158,7 +192,7 @@ func fixId(jsonld string) (string, error) {
var formatter func(index int) string
if topLevelType == "Dataset" {
selector = "@id"
formatter = func(index int) string { return "@id"}
formatter = func(index int) string { return "@id" }
} else if topLevelType == "ItemList" {
selector = "itemListElement.#.item.@id"
formatter = func(index int) string { return fmt.Sprintf("itemListElement.%v.item.@id", index) }
Expand All @@ -173,7 +207,7 @@ func fixId(jsonld string) (string, error) {
idUrl, idErr := url.Parse(jsonIdentifier)
if idUrl.Scheme == "" { // we have a relative url and no base in the context
log.Trace("Transforming id: ", jsonIdentifier, " to file:// url because it is relative")
jsonld, idErr = sjson.Set(jsonld, formatter(index), "file://" + jsonIdentifier)
jsonld, idErr = sjson.Set(jsonld, formatter(index), "file://"+jsonIdentifier)
} else {
log.Trace("JSON-LD context base or IRI id found: ", originalBase, "ID: ", idUrl)
}
Expand Down
Loading

0 comments on commit 256a703

Please sign in to comment.