Skip to content

Commit

Permalink
Merge branch 'dev_source_loading' of https://github.com/gleanerio/gleaner into dev_ec
Browse files Browse the repository at this point in the history
  • Loading branch information
valentinedwv committed Jun 22, 2023
2 parents 5c08532 + 4443007 commit e01be26
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 42 deletions.
14 changes: 11 additions & 3 deletions configs/template/localConfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ minio:
address: 0.0.0.0
# AWS needs the region included in the bucket address, e.g. s3.us-west-2.amazonaws.com
port: 9000
accessKey: worldsbestaccesskey
secretKey: worldsbestsecretkey
ssl: false
bucket: gleaner
region:
accessKey: worldsbestaccesskey
secretKey: worldsbestsecretkey

# can be overridden with MINIO_BUCKET
sparql:
endpoint: http://localhost/blazegraph/namespace/earthcube/sparql
Expand All @@ -25,7 +27,13 @@ sourcesSource:
location: sources.csv
# this can be a remote csv
# type: csv
# location: https://docs.google.com/spreadsheets/d/{key}/gviz/tq?tqx=out:csv&sheet={sheet_name}
# location: https://docs.google.com/spreadsheets/d/e/2PACX-1vTt_45dYd5LMFK9Qm_lCg6P7YxG-ae0GZEtrHMZmNbI-y5tVDd8ZLqnEeIAa-SVTSztejfZeN6xmRZF/pub?gid=1277688039&single=true&output=csv
#### GOOGLE SHEETS ====
# Google Sheets URLs have changed; the sheet now needs to be shared:
# Share > Publish to web >
# select the sheet and the CSV format,
# then paste the resulting URL in location
####
# TBD -- Just use the sources in the gleaner file.
# type: yaml
# location: gleaner.yaml
6 changes: 3 additions & 3 deletions internal/config/gleanerConfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ var gleanerTemplate = map[string]interface{}{
"minio": map[string]string{
"address": "localhost",
"port": "9000",
"region": "",
"accesskey": "",
"secretkey": "",
// "region": "us-east-1",
},
"gleaner": map[string]string{},
"context": map[string]string{},
Expand Down Expand Up @@ -50,8 +50,8 @@ func ReadGleanerConfig(filename string, cfgDir string) (*viper.Viper, error) {
v.AutomaticEnv()
err := v.ReadInConfig()
if err != nil {
fmt.Println("cannot find config file. Did you 'glcon generate --cfgName XXX' ")
log.Fatal("cannot find config file. Did you 'glcon generate --cfgName XXX' ")
fmt.Printf("cannot find config file. '%v' If glcon Did you 'glcon generate --cfgName XXX' \n", filename)
log.Fatalf("cannot find config file. '%v' Did you 'glcon generate --cfgName XXX' ", filename)
//panic(err)
}
return v, err
Expand Down
6 changes: 3 additions & 3 deletions internal/config/localConfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,18 @@ var serversTemplate = map[string]interface{}{
"minio": map[string]string{
"address": "localhost",
"port": "9000",
"bucket": "",
"region": "",
"accesskey": "",
"secretkey": "",
"bucket": "",
// "region": "us-east-1",
},
"sparql": map[string]string{
"endpoint": "localhost",
},
"headless": "",
"s3": map[string]string{
"bucket": "gleaner",
"domain": "us-east-1",
"domain": "",
},
"identifiertype": JsonSha, // const from config.Sources jsonsha,identifiersha, normalizedjsonsha, identifierstring
}
Expand Down
15 changes: 8 additions & 7 deletions internal/config/minio.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,23 @@ type Minio struct {
Address string // `mapstructure:"MINIO_ADDRESS"`
Port int //`mapstructure:"MINIO_PORT"`
Ssl bool //`mapstructure:"MINIO_USE_SSL"`
Accesskey string //`mapstructure:"MINIO_ACCESS_KEY"`
Secretkey string // `mapstructure:"MINIO_SECRET_KEY"`
Bucket string
Region string
Accesskey string //`mapstructure:"MINIO_ACCESS_KEY"`
Secretkey string // `mapstructure:"MINIO_SECRET_KEY"`

}

// MinioTemplate holds the template entries for the "minio" configuration
// section. Every value is a string, including "port" and "ssl".
// auth fails if a region is set in minioclient...
// NOTE(review): "accesskey" and "secretkey" are listed twice in this view,
// presumably the removed and added lines of the diff rendered together. Go
// rejects duplicate constant keys in a map literal, so only one pair can remain.
var MinioTemplate = map[string]interface{}{
"minio": map[string]string{
"address": "localhost",
"port": "9000",
"accesskey": "",
"secretkey": "",
"bucket": "",
"ssl": "false",
"region": "",
"accesskey": "",
"secretkey": "",
},
}

Expand All @@ -39,11 +40,11 @@ func ReadMinioConfig(minioSubtress *viper.Viper) (Minio, error) {
minioSubtress.BindEnv("address", "MINIO_ADDRESS")
minioSubtress.BindEnv("port", "MINIO_PORT")
minioSubtress.BindEnv("ssl", "MINIO_USE_SSL")
minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")
minioSubtress.BindEnv("bucket", "MINIO_BUCKET")
minioSubtress.BindEnv("region", "MINIO_REGION")
minioSubtress.BindEnv("accesskey", "MINIO_ACCESS_KEY")
minioSubtress.BindEnv("secretkey", "MINIO_SECRET_KEY")

minioSubtress.AutomaticEnv()
// config has already been read; a subtree is passed in
err := minioSubtress.Unmarshal(&minioCfg)
Expand Down
77 changes: 51 additions & 26 deletions internal/config/sources.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ const (
StandardizedHttps
StandardizedHttp
)
// AccceptContentType is the accept content type string
// "application/ld+json, text/html", the same value used as the default for
// the AcceptContentType field of a source.
// NOTE(review): the identifier is spelled with three c's ("Acccept"). Renaming
// it would change the exported name, so confirm callers before correcting it.
const AccceptContentType string = "application/ld+json, text/html"

func (s ContextOption) String() string {
switch s {
Expand Down Expand Up @@ -70,12 +71,14 @@ type Sources struct {
// SitemapFormat string
// Active bool

HeadlessWait int // if loading is slow, wait
Delay int64 // A domain-specific crawl delay value
IdentifierPath string // JSON Path to the identifier
ApiPageLimit int
IdentifierType string
FixContextOption ContextOption
HeadlessWait int // if loading is slow, wait
Delay int64 // A domain-specific crawl delay value
IdentifierPath string // JSON Path to the identifier
ApiPageLimit int
IdentifierType string
FixContextOption ContextOption
AcceptContentType string `default:"application/ld+json, text/html"` // accept content type string for http request
JsonProfile string // jsonprofile
}

// add needed for file
Expand All @@ -89,36 +92,46 @@ type SourcesConfig struct {
Domain string
// SitemapFormat string
// Active bool
HeadlessWait int // if loading is slow, wait
Delay int64 // A domain-specific crawl delay value
IdentifierPath string // JSON Path to the identifier
IdentifierType string
FixContextOption ContextOption
HeadlessWait int // if loading is slow, wait
Delay int64 // A domain-specific crawl delay value
IdentifierPath string // JSON Path to the identifier
IdentifierType string
FixContextOption ContextOption
AcceptContentType string `default:"application/ld+json, text/html"` // accept content type string for http request
JsonProfile string // jsonprofile
}

// SourcesTemplate holds the template entries for a "sources" configuration
// record. Every value is a string; "identifiertype" takes the JsonSha constant
// and "acceptcontenttype" is "application/ld+json, text/html".
// NOTE(review): every key from "sourcetype" through "fixcontextoption" is
// listed twice in this view, presumably the removed and added lines of the
// diff rendered together. Go rejects duplicate constant keys in a map literal,
// so only one copy of each can remain.
var SourcesTemplate = map[string]interface{}{
"sources": map[string]string{
"sourcetype": "sitemap",
"name": "",
"url": "",
"logo": "",
"headless": "",
"pid": "",
"propername": "",
"domain": "",
"credentialsfile": "",
"headlesswait": "0",
"delay": "0",
"identifierpath": "",
"identifiertype": JsonSha,
"fixcontextoption": "https",
"sourcetype": "sitemap",
"name": "",
"url": "",
"logo": "",
"headless": "",
"pid": "",
"propername": "",
"domain": "",
"credentialsfile": "",
"headlesswait": "0",
"delay": "0",
"identifierpath": "",
"identifiertype": JsonSha,
"fixcontextoption": "https",
"acceptcontenttype": "application/ld+json, text/html",
"jsonprofile": "",
},
}

func populateDefaults(s Sources) Sources {
if s.SourceType == "" {
s.SourceType = "sitemap"
}
if s.AcceptContentType == "" {
s.AcceptContentType = "application/ld+json, text/html"
}
if s.JsonProfile == "" {
s.JsonProfile = "application/ld+json"
}
// also fix input issues: stray whitespace from the CSV was causing URL errors
s.URL = strings.TrimSpace(s.URL)
return s
Expand Down Expand Up @@ -151,10 +164,22 @@ func ReadSourcesCSV(filename string, cfgPath string) ([]Sources, error) {
return gocsv.LazyCSVReader(in) // Allows use of quotes in CSV
})

if err := gocsv.Unmarshal(f, &sources); err != nil {
err = gocsv.Unmarshal(f, &sources)
if err != nil {
fmt.Println("error:", err)

}
if len(sources) < 1 {
if strings.HasPrefix(filename, "https://") || strings.HasPrefix(filename, "http://") {

msg := fmt.Sprintf("no sources try downloading csv '%v', and using a local file. %v"+
" if google share, publish to web single page csv", filename, err)
log.Fatal(msg)
} else {
log.Fatalf("no sources in '%v', error parsing csv used for sources %v", filename, err)
}

}
for i, u := range sources {
sources[i] = populateDefaults(u)
fmt.Printf("%+v\n", u)
Expand Down

0 comments on commit e01be26

Please sign in to comment.