Merge pull request #48 from gleanerio/dv_master-fix_47

#47. run one source, fix init
gleanerio · Nov 8, 2021 · aefb014 · aefb014
2 parents 6ec4bcc + 1185b10
commit aefb014
Show file tree

Hide file tree

Showing 6 changed files with 259 additions and 14 deletions.
diff --git a/configs/template/GleanerConfig.md b/configs/template/GleanerConfig.md
@@ -0,0 +1,195 @@
+# Gleaner Configuration file
+
+This assumes that you have a container stack running
+
+```
+s3 store
+triple store
+headless
+```
+## Gleaner Configuration generation
+The suggested method of creating a configuration file is to use the glcon command, which can initialize a configuration directory and generate
+configuration files for gleaner and nabu. Download a glcon release from GitHub.
+The pattern is to initialize a configuration directory, edit the files, and generate new configurations.
+### initialize a configuration directory
+```
+glcon config init -cfgName test
+```
+initializes a configuration in configs with name of 'test'
+Inside you will find
+```
+test % ls
+gleaner_base.yaml	readme.txt		sources.csv
+nabu_base.yaml		servers.yaml
+```
+
+### Edit the files
+Usually, you will only need to edit the servers.yaml and sources.csv
+The servers.yaml
+
+#### Servers.yaml
+```yaml
+---
+minio:
+  address: 0.0.0.0 # can be overridden with MINIO_ADDRESS
+  port: 9000 # can be overridden with MINIO_PORT
+  accessKey: worldsbestaccesskey # can be overridden with MINIO_ACCESS_KEY
+  secretKey: worldsbestsecretkey # can be overridden with MINIO_SECRET_KEY
+  ssl: false # can be overridden with MINIO_SSL
+  bucket: gleaner # can be overridden with MINIO_BUCKET
+sparql:
+  endpoint: http://localhost/blazegraph/namespace/earthcube/sparql
+s3:
+  bucket: gleaner # sync with above... can be overridden with MINIO_BUCKET... get's zapped if it's not here.
+  domain: us-east-1
+
+#headless field in gleaner.summoner
+headless: http://127.0.0.1:9222
+```
+First, in the "minio:" section make sure the accessKey and secretKey here match the access keys for your minio.
+These can be overridden with the environment variables:
+* "MINIO_ACCESS_KEY"
+* "MINIO_SECRET_KEY"
+
+#### sources.csv
+This is designed to be edited in a spreadsheet, or dumped as csv from a google spreadsheet
+
+```csv
+hack,SourceType,Active,Name,ProperName,URL,Headless,Domain,PID,Logo
+1,sitegraph,FALSE,aquadocs,AquaDocs,https://oih.aquadocs.org/aquadocs.json ,FALSE,https://aquadocs.org,http://hdl.handle.net/1834/41372,
+3,sitemap,TRUE,opentopography,OpenTopography,https://opentopography.org/sitemap.xml,FALSE,http://www.opentopography.org/,https://www.re3data.org/repository/r3d100010655,https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
+,sitemap,TRUE,iris,IRIS,http://ds.iris.edu/files/sitemap.xml,FALSE,http://iris.edu,https://www.re3data.org/repository/r3d100010268,http://ds.iris.edu/static/img/layout/logos/iris_logo_shadow.png
+```
+
+Fields: 
+1. hack: a hack to make sure the fields are read properly.
+2. SourceType : [sitemap, sitegraph] type of source
+3. Active: [TRUE,FALSE] is source active. 
+4. Name: short name of source. It should be one word (no space) and be lower case.
+5. ProperName: Long name of source that will be added to organization record for provenance
+6. URL: URL of sitemap or sitegraph.
+7. Headless: [FALSE,TRUE] should be set to false unless you know this site uses JavaScript to place the JSON-LD into the page.  This is true of some sites and it is supported but not currently auto-detected.  So you will need to know this and set it.  For most places, this will be false.
+   If the JSON-LD is generated dynamically in the page, then use TRUE.
+8. Domain: 
+9. PID: a unique identifier for the source. Preferably this is a research ID.
+10. Logo: while no longer used, logo of the source
+
+### generate the configuration files
+```
+glcon generate -cfgName test
+```
+This will generate the files 'gleaner' and 'nabu' and make copies of the existing configuration files.
+
+The full details are discussed below
+
+## Gleaner Configuration
+
+So now we are ready to review the Gleaner configuration file named gleaner.  There is actually quite a bit in this file, but for this starting demo there are only a few things we need to worry about.  The default file will look like:
+
+```yaml
+---
+minio:
+  address: 0.0.0.0
+  port: 9000
+  accessKey: worldsbestaccesskey
+  secretKey: worldsbestsecretkey
+  ssl: false
+  bucket: gleaner
+gleaner:
+  runid: runX # this will be the bucket the output is placed in...
+  summon: true # do we want to visit the web sites and pull down the files
+  mill: true
+context:
+  cache: true
+contextmaps:
+  - prefix: "https://schema.org/"
+    file: "./configs/schemaorg-current-https.jsonld"
+  - prefix: "http://schema.org/"
+    file: "./configs/schemaorg-current-https.jsonld"
+summoner:
+  after: ""      # "21 May 20 10:00 UTC"   
+  mode: full  # full || diff:  If diff compare what we have currently in gleaner to sitemap, get only new, delete missing
+  threads: 5
+  delay:  # milliseconds (1000 = 1 second) to delay between calls (will FORCE threads to 1) 
+  headless: http://127.0.0.1:9222  # URL for headless see docs/headless
+millers:
+  graph: true
+# will be built from sources.csv
+sources:
+  - sourcetype: sitegraph
+    name: aquadocs
+    logo: ""
+    url: https://oih.aquadocs.org/aquadocs.json
+    headless: false
+    pid: http://hdl.handle.net/1834/41372
+    propername: AquaDocs
+    domain: https://aquadocs.org
+    active: false
+  - sourcetype: sitemap
+    name: opentopography
+    logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
+    url: https://opentopography.org/sitemap.xml
+    headless: false
+    pid: https://www.re3data.org/repository/r3d100010655
+    propername: OpenTopography
+    domain: http://www.opentopography.org/
+    active: false
+```
+
+A few things we need to look at.
+
+First, in the "minio:" section make sure the accessKey and secretKey here match the ones you have and set via your demo.env file.
+
+Next, let's look at the "gleaner:" section.  We can set the runid to something.  This is the ID for a run and it allows you to later make different runs and keep the resulting graphs organized.  It can be set to any lower case string with no spaces.
+
+The mill and summon settings are true and we will leave them that way.  It means we want Gleaner to both fetch the resources and process (mill) them.
+
+Now look at the "millers:" section, which lets us pick what milling to do.   Currently it is set with only graph set to true.  Let's leave it that way for now.  This means Gleaner will only attempt to make a graph and not also run validation or generate prov reports for the process.
+
+The final section we need to look at is the "sources:" section.   
+Here is where the fun is.  While there are two types, sitegraph and sitemap, we will normally use the sitemap type.
+
+A standard sitemap is below:
+```yaml
+sources:
+  - sourcetype: sitemap
+    name: opentopography
+    logo: https://opentopography.org/sites/opentopography.org/files/ot_transp_logo_2.png
+    url: https://opentopography.org/sitemap.xml
+    headless: false
+    pid: https://www.re3data.org/repository/r3d100010655
+    propername: OpenTopography
+    domain: http://www.opentopography.org/
+    active: true
+```
+
+A sitegraph 
+```yaml
+sources:
+  - sourcetype: sitegraph
+    name: aquadocs
+    logo: ""
+    url: https://oih.aquadocs.org/aquadocs.json
+    headless: false
+    pid: http://hdl.handle.net/1834/41372
+    propername: AquaDocs
+    domain: https://aquadocs.org
+    active: false
+```
+These are the sources we wish to pull and process. 
+Each source has a type, and 8 entries though at this time we no longer use the "logo" value. 
+It was used in the past to provide a page showing all the sources and 
+a logo for them.  However, that's really just out of scope for what we want to do. 
+You can leave it blank or set it to any value, it won't make a difference.
+
+The name is what you want to call this source.  It should be one word (no space) and be lower case. 
+
+The url value needs to point to the URL for the site map XML file.  This will be created and served by the data provider. 
+
+The headless value should be set to false unless you know this site uses JavaScript to place the JSON-LD into the page.  This is true of some sites and it is supported but not currently auto-detected.  So you will need to know this and set it.  For most places, this will be false.
+
+You can have as many sources as you wish.  For an example, look at the configuration file for the CDF Semantic Network at: https://github.com/gleanerio/CDFSemanticNetwork/blob/master/configs/cdf.yaml
+
+
+
+
diff --git a/internal/config/sources.go b/internal/config/sources.go
@@ -1,6 +1,7 @@
 package config
 
 import (
+	"errors"
 	"fmt"
 	"github.com/gocarina/gocsv"
 	"github.com/spf13/viper"
@@ -147,7 +148,6 @@ func GetActiveSourceByType(sources []Sources, key string) []Sources {
 	return sourcesSlice
 }
 
-
 func SourceToNabuPrefix(sources []Sources, includeProv bool) []string {
 
 	var prefixes []string
@@ -171,3 +171,36 @@ func SourceToNabuPrefix(sources []Sources, includeProv bool) []string {
 	}
 	return prefixes
 }
+
+// ErrSourceNotFound is wrapped by the error PruneSources returns when none of
+// the requested source names matches a configured source. Test for it with
+// errors.Is.
+var ErrSourceNotFound = errors.New("cannot find a source with the name")
+
+// PruneSources restricts the "sources" entry of v1 to the sources whose Name
+// appears in useSources. Every retained source is forced to Active = true so
+// that an explicitly requested source runs even when it is disabled in the
+// configuration.
+//
+// v1 is modified in place and is also returned. On failure v1 is returned
+// unmodified together with a non-nil error: either the wrapped error from
+// GetSources, or an error wrapping ErrSourceNotFound that lists the requested
+// names when no source matched.
+func PruneSources(v1 *viper.Viper, useSources []string) (*viper.Viper, error) {
+	allSources, err := GetSources(v1)
+	if err != nil {
+		// Report the failure to the caller instead of terminating the process here.
+		return v1, fmt.Errorf("error retrieving sources: %w", err)
+	}
+
+	var finalSources []Sources
+	for _, s := range allSources {
+		if contains(useSources, s.Name) {
+			s.Active = true // we assume you want to run this, even if disabled, normally
+			finalSources = append(finalSources, s)
+		}
+	}
+	if len(finalSources) == 0 {
+		return v1, fmt.Errorf("%w %q", ErrSourceNotFound, useSources)
+	}
+
+	v1.Set("sources", finalSources)
+	return v1, nil
+}
+
+// contains reports whether str is an element of s, comparing with == (exact, case-sensitive match); it returns false when s is nil or empty.
+func contains(s []string, str string) bool {
+	for _, v := range s {
+		if v == str {
+			return true
+		}
+	}
+
+	return false
+}
diff --git a/pkg/cli/batch.go b/pkg/cli/batch.go
@@ -21,32 +21,42 @@ import (
 	configTypes "github.com/gleanerio/gleaner/internal/config"
 	"github.com/gleanerio/gleaner/pkg"
 	bolt "go.etcd.io/bbolt"
+	"os"
 
 	"log"
 	"path"
 
 	"github.com/spf13/cobra"
 )
 
+var sourceVal string
+
 // batchCmd represents the batch command
 var batchCmd = &cobra.Command{
-	Use:   "batch",
-	Short: "Execute gleaner process",
+	Use:              "batch",
+	TraverseChildren: true,
+	Short:            "Execute gleaner process",
 	Long: `run gleaner process to extract JSON-LD from pages using sitemaps, conver to triples
 and store to a S3 server:
 --cfgName
 --mode`,
+
 	Run: func(cmd *cobra.Command, args []string) {
 		fmt.Println("batch called")
-		Batch(glrVal, cfgPath, cfgName, modeVal)
+		var runSources []string
+		if sourceVal != "" {
+			runSources = append(runSources, sourceVal)
+		}
+		Batch(glrVal, cfgPath, cfgName, modeVal, runSources)
 	},
 }
 
 func init() {
 	gleanerCmd.AddCommand(batchCmd)
 
 	// Here you will define your flags and configuration settings.
-
+	batchCmd.Flags().StringVar(&sourceVal, "source", "", "Override config file source(s) to specify an index target")
+	batchCmd.Flags().StringVar(&modeVal, "mode", "mode", "Set the mode")
 	// Cobra supports Persistent Flags which will work for this command
 	// and all subcommands, e.g.:
 	// batchCmd.PersistentFlags().String("foo", "", "A help for foo")
@@ -56,7 +66,7 @@ func init() {
 	// batchCmd.Flags().BoolP("toggle", "t", false, "Help message for toggle")
 }
 
-func Batch(filename string, cfgPath string, cfgName string, mode string) {
+func Batch(filename string, cfgPath string, cfgName string, mode string, runSources []string) {
 
 	v1, err := configTypes.ReadGleanerConfig(filename, path.Join(cfgPath, cfgName))
 	if err != nil {
@@ -70,5 +80,13 @@ func Batch(filename string, cfgPath string, cfgName string, mode string) {
 	}
 	defer db.Close()
 
+	if len(runSources) > 0 {
+
+		v1, err = configTypes.PruneSources(v1, runSources)
+		if err != nil {
+			log.Fatal(err)
+			os.Exit(1)
+		}
+	}
 	pkg.Cli(mc, v1, db)
 }
diff --git a/pkg/cli/config.go b/pkg/cli/config.go
@@ -24,7 +24,7 @@ nabu uploads and manages data processed by gleaner to a sparql triplestore
 var glrVal, nabuVal, sourcesVal, templateGleaner, templateNabu string
 
 var configBaseFiles = map[string]string{"gleaner": "gleaner_base.yaml", "sources": "sources.csv", "sources_min": "sources_min.csv",
-	"nabu": "nabu_base.yaml", "servers": "servers.yaml", "readme": "readme.txt"}
+	"nabu": "nabu_base.yaml", "servers": "servers.yaml", "readme": "readme.txt", "configdoc": "GleanerConfig.md"}
 
 var gleanerFileNameBase = "gleaner"
 var nabuFilenameBase = "nabu"

diff --git a/pkg/cli/gleaner.go b/pkg/cli/gleaner.go
@@ -7,8 +7,9 @@ import (
 
 // gleanerCmd represents the run command
 var gleanerCmd = &cobra.Command{
-	Use:   "gleaner",
-	Short: "command to execute gleaner processes",
+	Use:              "gleaner",
+	TraverseChildren: true,
+	Short:            "command to execute gleaner processes",
 	Long: `run gleaner process to extract JSON-LD from pages using sitemaps, conver to triples
 and store to a S3 server:
 --cfgName
@@ -18,13 +19,11 @@ and store to a S3 server:
 		fmt.Println("gleaner called")
 	},
 }
-var sourceVal, modeVal string
+var modeVal string
 
 func init() {
 	rootCmd.AddCommand(gleanerCmd)
-
 	// Here you will define your flags and configuration settings.
-	gleanerCmd.Flags().StringVar(&modeVal, "mode", "mode", "Set the mode")
 
 	// Cobra supports Persistent Flags which will work for this command
 	// and all subcommands, e.g.:

diff --git a/pkg/cli/init.go b/pkg/cli/init.go
@@ -59,12 +59,12 @@ func initCfg(cfgpath string, cfgName string, configBaseFiles map[string]string)
 	// do not overwrite the source.csv or servers.yaml
 	_, err := os.Stat(path.Join(cfgpath, cfgName, configBaseFiles["sources"]))
 	if err == nil {
-		copy(path.Join(cfgpath, cfgName, configBaseFiles["sources"]), path.Join(cfgpath, cfgName, configBaseFiles["sources"]+"_latest"))
+		copy(path.Join(cfgpath, "template", configBaseFiles["sources"]), path.Join(cfgpath, cfgName, configBaseFiles["sources"]+"_latest"))
 		delete(configBaseFiles, "sources")
 	}
 	_, err = os.Stat(path.Join(cfgpath, cfgName, configBaseFiles["servers"]))
 	if err == nil {
-		copy(path.Join(cfgpath, cfgName, configBaseFiles["servers"]), path.Join(cfgpath, cfgName, configBaseFiles["servers"]+"_latest"))
+		copy(path.Join(cfgpath, "template", configBaseFiles["servers"]), path.Join(cfgpath, cfgName, configBaseFiles["servers"]+"_latest"))
 		delete(configBaseFiles, "servers")
 	}
 	// copy files listed in config.go: configBaseFiles