Skip to content

Commit ecbab66

Browse files
authored
Merge pull request #133 from gleanerio/mm-dev--api
Consume JSON-LD metadata from a paged API
2 parents 14434be + b443475 commit ecbab66

17 files changed

+466
-119
lines changed

cmd/gleaner/gleaner.db

-8 MB
Binary file not shown.

configs/template/README_Configure_Template.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ hack,SourceType,Active,Name,ProperName,URL,Headless,Domain,PID,Logo
7575

7676
Fields:
7777
1. hack: a hack to make sure the fields are properly read.
78-
2. SourceType : [sitemap, sitegraph, googledrive] type of source
78+
2. SourceType : [sitemap, sitegraph, googledrive, api] type of source
7979
3. Active: [TRUE,FALSE] is source active.
8080
4. Name: short name of source. It should be one word (no space) and be lower case.
8181
5. ProperName: Long name of source that will be added to organization record for provenance

docs/GleanerConfig.md

+14-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ The miller and summon sections are true and we will leave them that way. It mea
7676
Now look at the "miller:" section, where we get to pick what milling to do. Currently it is set with only graph set to true. Let's leave it that way for now. This means Gleaner will only attempt to make a graph, and not also run validation or generate prov reports for the process.
7777
7878
The final section we need to look at is the "sources:" section.
79-
Here is where the fun is. While there are two types, sitegraph and sitemaps we will normally use sitemap type.
79+
Here is where the fun is. While there are multiple types — sitegraph, sitemap, googledrive and api — we will normally use the sitemap type.
8080
8181
8282
A standard [sitemap](./SourceSitemap.md) is below:
@@ -122,6 +122,19 @@ sources:
122122
credentialsfile: configs/credentials/gleaner-331805-030e15e1d9c4.json
123123
other: {}
124124
```
125+
126+
An [API endpoint](./SourceAPI.md)
127+
```yaml
128+
sources:
129+
- sourcetype: api
130+
name: example
131+
url: http://test-metadata-api.com?query=something&page=%d
132+
properName: Example JSON-LD API Source
133+
domain: http://test-metadata-api.com
134+
active: true
135+
apipagelimit: 200
136+
```
137+
125138
These are the sources we wish to pull and process.
126139
Each source has a type and 8 entries, though at this time we no longer use the "logo" value.
127140
It was used in the past to provide a page showing all the sources and

docs/SourceAPI.md

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
## Using a paged API endpoint as a Gleaner source
2+
3+
Sometimes, instead of crawling webpages using a list in a sitemap, we have the opportunity to query an API that will let us directly ingest JSON-LD. To do so, we can specify a `sourcetype: api` in our Gleaner config yaml, and Gleaner will iterate through a paged API, using the given `url` as a template. For example, let's say that you want to use the API endpoint at `http://test-api.com`, and that you can page through it by using a url like `http://test-api.com/page/4`. You would put this in your config:
4+
5+
```yaml
6+
url: http://test-api.com/page/%d
7+
```
8+
9+
Notice the `%d` where the page number goes. Gleaner will then increment that number (starting from 0) until it gets an error back from the API.
10+
11+
Optionally, you can set a limit on the number of pages to iterate through, using `apipagelimit`. This means that Gleaner will page through the API until it gets an error back *or* until it reaches the limit you set. That looks like the example below:
12+
13+
```yaml
14+
url: http://test-api.com/page/%d
15+
apipagelimit: 200
16+
```

internal/config/minio.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func ReadMinioConfig(minioSubtress *viper.Viper) (Minio, error) {
4343
// config already read. substree passed
4444
err := minioSubtress.Unmarshal(&minioCfg)
4545
if err != nil {
46-
log.Fatal("error when parsing minio config: %v", err)
46+
log.Fatal("error when parsing minio config: ", err)
4747
}
4848
return minioCfg, err
4949
}

internal/config/sources.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515

1616
// as read from csv
1717
type Sources struct {
18-
// Valid values for SourceType: sitemap, sitegraph, csv, googledrive, and robots
18+
// Valid values for SourceType: sitemap, sitegraph, csv, googledrive, api, and robots
1919
SourceType string `default:"sitemap"`
2020
Name string
2121
Logo string
@@ -31,6 +31,7 @@ type Sources struct {
3131
// Active bool
3232
HeadlessWait int // if loading is slow, wait
3333
Delay int64 // A domain-specific crawl delay value
34+
ApiPageLimit int
3435
}
3536

3637
// add needed for file
@@ -124,7 +125,7 @@ func GetSources(g1 *viper.Viper) ([]Sources, error) {
124125
// config already read. substree passed
125126
err := g1.UnmarshalKey(subtreeKey, &cfg)
126127
if err != nil {
127-
log.Fatal("error when parsing %v config: %v", subtreeKey, err)
128+
log.Fatal("error when parsing ", subtreeKey, " config: ", err)
128129
//No sources, so nothing to run
129130
}
130131
for i, s := range cfg {

internal/objects/sourcesAndGraphs.go

+11-11
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,17 @@ import (
77
"github.com/spf13/viper"
88
)
99

10-
//type Sources struct {
11-
// Name string
12-
// Logo string
13-
// URL string
14-
// Headless bool
15-
// PID string
16-
// ProperName string
17-
// Domain string
18-
// // SitemapFormat string
19-
// // Active bool
20-
//}
10+
// type Sources struct {
11+
// Name string
12+
// Logo string
13+
// URL string
14+
// Headless bool
15+
// PID string
16+
// ProperName string
17+
// Domain string
18+
// // SitemapFormat string
19+
// // Active bool
20+
// }
2121
type Sources = configTypes.Sources
2222

2323
// Return all sources and sitegraph domains

internal/summoner/acquire/acquire.go

+86-66
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
)
2222

2323
const EarthCubeAgent = "EarthCube_DataBot/1.0"
24+
const JSONContentType = "application/ld+json"
2425

2526
// ResRetrieve is a function to pull down the data graphs at resources
2627
func ResRetrieve(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bolt.DB, runStats *common.RunStats) {
@@ -157,7 +158,8 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
157158
}
158159
defer resp.Body.Close()
159160

160-
doc, err := goquery.NewDocumentFromResponse(resp)
161+
jsonlds, err := FindJSONInResponse(v1, urlloc, repologger, resp)
162+
161163
if err != nil {
162164
log.Error("#", i, " error on ", urlloc, err) // print an message containing the index (won't keep order)
163165
repoStats.Inc(common.Issues)
@@ -166,34 +168,6 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
166168
return
167169
}
168170

169-
var jsonlds []string
170-
var contentTypeHeader = resp.Header["Content-Type"]
171-
172-
// if
173-
// The URL is sending back JSON-LD correctly as application/ld+json
174-
// this should not be here IMHO, but need to support people not setting proper header value
175-
// The URL is sending back JSON-LD but incorrectly sending as application/json
176-
if contains(contentTypeHeader, "application/ld+json") || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) {
177-
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Debug()
178-
log.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Debug(urlloc, " as ", contentTypeHeader)
179-
180-
jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text())
181-
if err != nil {
182-
log.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Error("Error processing json response from ", urlloc, err)
183-
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "json or ld_json"}).Error(err)
184-
}
185-
// look in the HTML page for <script type=application/ld+json>
186-
} else {
187-
doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
188-
jsonlds, err = addToJsonListIfValid(v1, jsonlds, s.Text())
189-
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Info()
190-
if err != nil {
191-
log.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Error("Error processing script tag in ", urlloc, err)
192-
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}).Error(err)
193-
}
194-
})
195-
}
196-
197171
// For incremental indexing I want to know every URL I visit regardless
198172
// if there is a valid JSON-LD document or not. For "full" indexing we
199173
// visit ALL URLs. However, many will not have JSON-LD, so let's also record
@@ -225,43 +199,7 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
225199
repoStats.Inc(common.Summoned)
226200
}
227201

228-
for i, jsonld := range jsonlds {
229-
if jsonld != "" { // traps out the root domain... should do this different
230-
log.WithFields(log.Fields{"url": urlloc, "issue": "Uploading"}).Trace("#", i, "Uploading ")
231-
repologger.WithFields(log.Fields{"url": urlloc, "issue": "Uploading"}).Trace()
232-
sha, err := Upload(v1, mc, bucketName, sourceName, urlloc, jsonld)
233-
if err != nil {
234-
log.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store: ", urlloc, err)
235-
repologger.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
236-
repoStats.Inc(common.StoreError)
237-
} else {
238-
repologger.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}).Trace(err)
239-
log.WithFields(log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}).Info("Successfully put ", sha, " in summoned bucket for ", urlloc)
240-
repoStats.Inc(common.Stored)
241-
}
242-
// TODO Is here where to add an entry to the KV store
243-
db.Update(func(tx *bolt.Tx) error {
244-
b := tx.Bucket([]byte(sourceName))
245-
err := b.Put([]byte(urlloc), []byte(sha))
246-
if err != nil {
247-
log.Error("Error writing to bolt ", err)
248-
}
249-
return nil
250-
})
251-
} else {
252-
log.WithFields(log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}).Info("Empty JSON-LD document found. Continuing.")
253-
repologger.WithFields(log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}).Error(err)
254-
// TODO Is here where to add an entry to the KV store
255-
db.Update(func(tx *bolt.Tx) error {
256-
b := tx.Bucket([]byte(sourceName))
257-
err := b.Put([]byte(urlloc), []byte(fmt.Sprintf("NULL: %s", urlloc))) // no JOSN-LD found at this URL
258-
if err != nil {
259-
log.Error("Error writing to bolt ", err)
260-
}
261-
return nil
262-
})
263-
}
264-
}
202+
UploadWrapper(v1, mc, bucketName, sourceName, urlloc, db, repologger, repoStats, jsonlds)
265203

266204
bar.Add(1) // bar.Incr()
267205
log.Trace("#", i, "thread for", urlloc) // print an message containing the index (won't keep order)
@@ -275,6 +213,88 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
275213
}
276214
}
277215

216+
func FindJSONInResponse(v1 *viper.Viper, urlloc string, repologger *log.Logger, response *http.Response) ([]string, error) {
217+
doc, err := goquery.NewDocumentFromResponse(response)
218+
if err != nil {
219+
return nil, err
220+
}
221+
222+
contentTypeHeader := response.Header["Content-Type"]
223+
var jsonlds []string
224+
225+
// if the URL is sending back JSON-LD correctly as application/ld+json
226+
// this should not be here IMHO, but need to support people not setting proper header value
227+
// The URL is sending back JSON-LD but incorrectly sending as application/json
228+
if contains(contentTypeHeader, JSONContentType) || contains(contentTypeHeader, "application/json") || fileExtensionIsJson(urlloc) {
229+
logFields := log.Fields{"url": urlloc, "contentType": "json or ld_json"}
230+
repologger.WithFields(logFields).Debug()
231+
log.WithFields(logFields).Debug(urlloc, " as ", contentTypeHeader)
232+
233+
jsonlds, err = addToJsonListIfValid(v1, jsonlds, doc.Text())
234+
if err != nil {
235+
log.WithFields(logFields).Error("Error processing json response from ", urlloc, err)
236+
repologger.WithFields(logFields).Error(err)
237+
}
238+
// look in the HTML response for <script type=application/ld+json>
239+
} else {
240+
doc.Find("script[type='application/ld+json']").Each(func(i int, s *goquery.Selection) {
241+
jsonlds, err = addToJsonListIfValid(v1, jsonlds, s.Text())
242+
logFields := log.Fields{"url": urlloc, "contentType": "script[type='application/ld+json']"}
243+
repologger.WithFields(logFields).Info()
244+
if err != nil {
245+
log.WithFields(logFields).Error("Error processing script tag in ", urlloc, err)
246+
repologger.WithFields(logFields).Error(err)
247+
}
248+
})
249+
}
250+
251+
return jsonlds, nil
252+
}
253+
254+
func UploadWrapper(v1 *viper.Viper, mc *minio.Client, bucketName string, sourceName string, urlloc string, db *bolt.DB, repologger *log.Logger, repoStats *common.RepoStats, jsonlds []string) {
255+
for i, jsonld := range jsonlds {
256+
if jsonld != "" { // traps out the root domain... should do this different
257+
logFields := log.Fields{"url": urlloc, "issue": "Uploading"}
258+
log.WithFields(logFields).Trace("#", i, "Uploading ")
259+
repologger.WithFields(logFields).Trace()
260+
sha, err := Upload(v1, mc, bucketName, sourceName, urlloc, jsonld)
261+
if err != nil {
262+
logFields = log.Fields{"url": urlloc, "sha": sha, "issue": "Error uploading jsonld to object store"}
263+
log.WithFields(logFields).Error("Error uploading jsonld to object store: ", urlloc, err)
264+
repologger.WithFields(logFields).Error(err)
265+
repoStats.Inc(common.StoreError)
266+
} else {
267+
logFields = log.Fields{"url": urlloc, "sha": sha, "issue": "Uploaded to object store"}
268+
repologger.WithFields(logFields).Trace(err)
269+
log.WithFields(logFields).Info("Successfully put ", sha, " in summoned bucket for ", urlloc)
270+
repoStats.Inc(common.Stored)
271+
}
272+
// TODO Is here where to add an entry to the KV store
273+
db.Update(func(tx *bolt.Tx) error {
274+
b := tx.Bucket([]byte(sourceName))
275+
err := b.Put([]byte(urlloc), []byte(sha))
276+
if err != nil {
277+
log.Error("Error writing to bolt ", err)
278+
}
279+
return nil
280+
})
281+
} else {
282+
logFields := log.Fields{"url": urlloc, "issue": "Empty JSON-LD document found "}
283+
log.WithFields(logFields).Info("Empty JSON-LD document found. Continuing.")
284+
repologger.WithFields(logFields).Error("Empty JSON-LD document found. Continuing.")
285+
// TODO Is here where to add an entry to the KV store
286+
db.Update(func(tx *bolt.Tx) error {
287+
b := tx.Bucket([]byte(sourceName))
288+
err := b.Put([]byte(urlloc), []byte(fmt.Sprintf("NULL: %s", urlloc))) // no JSON-LD found at this URL
289+
if err != nil {
290+
log.Error("Error writing to bolt ", err)
291+
}
292+
return nil
293+
})
294+
}
295+
}
296+
}
297+
278298
func contains(arr []string, str string) bool {
279299
for _, a := range arr {
280300

0 commit comments

Comments
 (0)