Skip to content

Commit 14434be

Browse files
authored
Merge pull request #153 from gleanerio/dv_headlessTesting
Headless rework to implement headlessWait in javascript and make headless testable
2 parents a7f773a + a6f8b00 commit 14434be

File tree

6 files changed

+202
-57
lines changed

6 files changed

+202
-57
lines changed

cmd/husker/main.go

+6-7
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
package main
22

33
import (
4-
"bytes"
54
"flag"
65
"fmt"
76
"log"
@@ -55,10 +54,10 @@ func main() {
5554
// url := "http://dx.doi.org/10.7288/V4/MAGIC/15032" // magic
5655
url := "https://dev.rvdata.us/search/fileset/100142" // rvdata
5756
k := "demo"
58-
var (
59-
buf bytes.Buffer
60-
logger = log.New(&buf, "logger: ", log.Lshortfile)
61-
)
57+
//var (
58+
// buf bytes.Buffer
59+
// logger = log.New(&buf, "logger: ", log.Lshortfile)
60+
//)
6261
// setup the KV store to hold a record of indexed resources
6362
db, err := bolt.Open("gleaner.db", 0600, nil)
6463
if err != nil {
@@ -68,8 +67,8 @@ func main() {
6867
rlogginer, _ := common.LogIssues(v1, k)
6968

7069
runStats := common.NewRunStats()
71-
72-
err = acquire.PageRender(v1, mc, 45*time.Second, url, k, db, rlogginer, runStats)
70+
repostats := runStats.Add(k)
71+
err = acquire.PageRenderAndUpload(v1, mc, 45*time.Second, url, k, db, rlogginer, repostats)
7372
if err != nil {
7473
panic(fmt.Errorf("error when reading config: %v", err))
7574
}

go.mod

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ require (
2828
golang.org/x/crypto v0.0.0-20220525230936-793ad666bf5e
2929
)
3030

31+
require github.com/valyala/fasttemplate v1.2.2
32+
3133
require (
3234
cloud.google.com/go/compute/metadata v0.2.3 // indirect
3335
github.com/googleapis/enterprise-certificate-proxy v0.2.1 // indirect

go.sum

+2
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,8 @@ github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6Kllzaw
596596
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
597597
github.com/valyala/fasthttp v1.37.1-0.20220607072126-8a320890c08d h1:xS9QTPgKl9ewGsAOPc+xW7DeStJDqYPfisDmeSCcbco=
598598
github.com/valyala/fasthttp v1.37.1-0.20220607072126-8a320890c08d/go.mod h1:t/G+3rLek+CyY9bnIE+YlMRddxVAAGjhxndDB4i4C0I=
599+
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
600+
github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
599601
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
600602
github.com/willf/bitset v1.1.10/go.mod h1:RjeCKbqT1RxIR/KWY6phxZiaY1IyutSBfGjNPySAYV4=
601603
github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2/go.mod h1:UETIi67q53MR2AWcXfiuqkDkRtnGDLqkBTpCHuJHxtU=

internal/summoner/acquire/acquire.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -204,10 +204,10 @@ func getDomain(v1 *viper.Viper, mc *minio.Client, urls []string, sourceName stri
204204
// TODO is here where I then try headless, and scope the following for into an else?
205205
log.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Info("Direct access failed, trying headless for ", urlloc)
206206
repologger.WithFields(log.Fields{"url": urlloc, "contentType": "Direct access failed, trying headless']"}).Error() // this needs to go into the issues file
207-
err := PageRender(v1, mc, 60*time.Second, urlloc, sourceName, db, repologger, repoStats) // TODO make delay configurable
207+
err := PageRenderAndUpload(v1, mc, 60*time.Second, urlloc, sourceName, db, repologger, repoStats) // TODO make delay configurable
208208

209209
if err != nil {
210-
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRender ", urlloc, "::", err)
210+
log.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error("PageRenderAndUpload ", urlloc, "::", err)
211211
repologger.WithFields(log.Fields{"url": urlloc, "issue": "converting json ld"}).Error(err)
212212
}
213213
db.Update(func(tx *bolt.Tx) error {

internal/summoner/acquire/headlessNG.go

+128-48
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ import (
66
"fmt"
77
"github.com/gleanerio/gleaner/internal/common"
88
log "github.com/sirupsen/logrus"
9-
109
"time"
1110

1211
configTypes "github.com/gleanerio/gleaner/internal/config"
@@ -18,6 +17,7 @@ import (
1817
"github.com/mafredri/cdp/rpcc"
1918
minio "github.com/minio/minio-go/v7"
2019
"github.com/spf13/viper"
20+
"github.com/valyala/fasttemplate"
2121
bolt "go.etcd.io/bbolt"
2222
)
2323

@@ -55,7 +55,7 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
5555

5656
for i := range m[k] {
5757

58-
err := PageRender(v1, mc, 60*time.Second, m[k][i], k, db, repologger, r) // TODO make delay configurable
58+
err := PageRenderAndUpload(v1, mc, 60*time.Second, m[k][i], k, db, repologger, r) // TODO make delay configurable
5959
if err != nil {
6060
log.Error(m[k][i], "::", err)
6161
}
@@ -129,7 +129,7 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
129129
// //thread management
130130
// semaphoreChan <- struct{}{}
131131
//
132-
// err := PageRender(v1, mc, 60*time.Second, m[k][i], k, db) // TODO make delay configurable
132+
// err := PageRenderAndUpload(v1, mc, 60*time.Second, m[k][i], k, db) // TODO make delay configurable
133133
// if err != nil {
134134
// log.Error(m[k][i], "::", err)
135135
// }
@@ -148,30 +148,78 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
148148
//
149149
//}
150150

151-
func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k string, db *bolt.DB, repologger *log.Logger, repoStats *common.RepoStats) error {
152-
repologger.WithFields(log.Fields{"url": url}).Trace("PageRender")
153-
ctx, cancel := context.WithTimeout(context.Background(), timeout)
154-
defer cancel()
151+
func PageRenderAndUpload(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k string, db *bolt.DB, repologger *log.Logger, repoStats *common.RepoStats) error {
152+
repologger.WithFields(log.Fields{"url": url}).Trace("PageRenderAndUpload")
153+
// page render handles this
154+
//ctx, cancel := context.WithTimeout(context.Background(), timeout)
155+
//defer cancel()
155156

156157
// read config file
157158
//miniocfg := v1.GetStringMapString("minio")
158159
//bucketName := miniocfg["bucket"] // get the top level bucket for all of gleaner operations from config file
159160
bucketName, err := configTypes.GetBucketName(v1)
160161

161162
//mcfg := v1.GetStringMapString("summoner")
163+
//mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner"))
164+
165+
jsonlds, err := PageRender(v1, timeout, url, k, repologger, repoStats)
166+
167+
if err != nil { // from page render
168+
if len(jsonlds) > 1 {
169+
repologger.WithFields(log.Fields{"url": url, "issue": "Multiple JSON"}).Debug(err)
170+
}
171+
for _, jsonld := range jsonlds {
172+
sha, err := Upload(v1, mc, bucketName, k, url, jsonld)
173+
if err != nil {
174+
log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err, sha)
175+
repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
176+
repoStats.Inc(common.StoreError)
177+
} else {
178+
repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Debug()
179+
repoStats.Inc(common.Stored)
180+
}
181+
// TODO Is here where to add an entry to the KV store
182+
err = db.Update(func(tx *bolt.Tx) error {
183+
b := tx.Bucket([]byte(k))
184+
err := b.Put([]byte(url), []byte(sha))
185+
if err != nil {
186+
log.Error("Error writing to bolt", err)
187+
}
188+
return nil
189+
})
190+
}
191+
}
192+
return err
193+
}
194+
195+
func PageRender(v1 *viper.Viper, timeout time.Duration, url, k string, repologger *log.Logger, repoStats *common.RepoStats) ([]string, error) {
196+
repologger.WithFields(log.Fields{"url": url}).Trace("PageRender")
197+
retries := 3
198+
sources, err := configTypes.GetSources(v1)
199+
source, err := configTypes.GetSourceByName(sources, k)
200+
headlessWait := source.HeadlessWait
201+
if timeout*time.Duration(retries) < time.Duration(headlessWait)*time.Second {
202+
timeout = time.Duration(headlessWait) * time.Second
203+
}
204+
205+
ctx, cancel := context.WithTimeout(context.Background(), timeout*time.Duration(retries))
206+
defer cancel()
207+
response := []string{}
208+
// read config file
162209
mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner"))
163210

164211
// Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers).
165212
//devt := devtool.New(mcfg["headless"])
166213
devt := devtool.New(mcfg.Headless)
214+
167215
pt, err := devt.Get(ctx, devtool.Page)
168216
if err != nil {
169217
pt, err = devt.Create(ctx)
170218
if err != nil {
171219
log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
172220
repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?")
173221
repoStats.Inc(common.HeadlessError)
174-
return err
222+
return response, err
175223
}
176224
}
177225

@@ -181,7 +229,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
181229
log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
182230
repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?")
183231
repoStats.Inc(common.HeadlessError)
184-
return err
232+
return response, err
185233
}
186234
defer conn.Close() // Leaving connections open will leak memory.
187235

@@ -194,7 +242,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
194242
log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
195243
repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?")
196244
repoStats.Inc(common.HeadlessError)
197-
return err
245+
return response, err
198246
}
199247

200248
// Open a DOMContentEventFired client to buffer this event.
@@ -203,26 +251,42 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
203251
log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
204252
repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?")
205253
repoStats.Inc(common.HeadlessError)
206-
return err
254+
return response, err
207255
}
208256
defer domContent.Close()
209257

258+
// Open a LoadEventFired client to buffer this event.
259+
loadEventFired, err := c.Page.LoadEventFired(ctx)
260+
if err != nil {
261+
log.WithFields(log.Fields{"url": url, "issue": "Not REPO FAULT. Devtools... Is Headless Container running?"}).Error(err)
262+
repologger.WithFields(log.Fields{"url": url}).Error("Not REPO FAULT. Devtools... Is Headless Container running?")
263+
repoStats.Inc(common.HeadlessError)
264+
return response, err
265+
}
266+
defer loadEventFired.Close()
267+
210268
// Create the Navigate arguments with the optional Referrer field set.
211269
navArgs := page.NewNavigateArgs(url)
212270
nav, err := c.Page.Navigate(ctx, navArgs)
213271
if err != nil {
214272
log.WithFields(log.Fields{"url": url, "issue": "Navigate To Headless"}).Error(err)
215273
repologger.WithFields(log.Fields{"url": url, "issue": "Navigate To Headless"}).Error(err)
216274
repoStats.Inc(common.HeadlessError)
217-
return err
275+
return response, err
218276
}
219277

278+
_, err = loadEventFired.Recv()
279+
if err != nil {
280+
return nil, err
281+
}
282+
loadEventFired.Close()
283+
220284
// Wait until we have a DOMContentEventFired event.
221285
if _, err = domContent.Recv(); err != nil {
222286
log.WithFields(log.Fields{"url": url, "issue": "Dom Error"}).Error(err)
223287
repologger.WithFields(log.Fields{"url": url, "issue": "Dom Error"}).Error(err)
224288
repoStats.Inc(common.HeadlessError)
225-
return err
289+
return response, err
226290
}
227291

228292
log.WithFields(log.Fields{"url": url, "issue": "Navigate Complete"}).Debug(nav.FrameID, "for", url)
@@ -235,7 +299,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
235299
* I cannot figure out how to get the cdp Runtime to distinguish between a resolved and a rejected
236300
* promise - so in this case, we simply do not index a document, and fail silently.
237301
**/
238-
expression := `
302+
expressionTmpl := `
239303
function getMetadata() {
240304
return new Promise((resolve, reject) => {
241305
const elements = document.querySelectorAll('script[type="application/ld+json"]');
@@ -249,12 +313,12 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
249313
resolve(metadata);
250314
}
251315
else {
252-
reject("No JSON-LD present after 1 second.");
316+
reject("No JSON-LD present after {{timeout}} second.");
253317
}
254318
});
255319
}
256320
257-
function retry(fn, retriesLeft = 3, interval = 1000) {
321+
function retry(fn, retriesLeft = {{retries}}, interval = {{timeout}}) {
258322
return new Promise((resolve, reject) => {
259323
fn()
260324
.then(resolve)
@@ -270,25 +334,35 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
270334
});
271335
});
272336
}
337+
function sleep(ms) {
338+
return new Promise(resolve => setTimeout(resolve, ms));
339+
}
273340
274-
retry(getMetadata);
341+
sleep( {{headlesswait}} ).then( () => { return retry(getMetadata) } );
342+
275343
`
276-
344+
tmpl := fasttemplate.New(expressionTmpl, "{{", "}}")
345+
expression := tmpl.ExecuteString(map[string]interface{}{
346+
"timeout": fmt.Sprintf("%d", timeout.Milliseconds()),
347+
"headlesswait": fmt.Sprintf("%d", headlessWait*1000),
348+
"retries": "3",
349+
})
350+
log.Trace(expression)
277351
evalArgs := runtime.NewEvaluateArgs(expression).SetAwaitPromise(true).SetReturnByValue(true)
278352
eval, err := c.Runtime.Evaluate(ctx, evalArgs)
279353
if err != nil {
280354
log.WithFields(log.Fields{"url": url, "issue": "Headless Evaluate"}).Error(err)
281355
repologger.WithFields(log.Fields{"url": url, "issue": "Headless Evaluate"}).Error(err)
282356
repoStats.Inc(common.Issues)
283-
return (err)
357+
return response, err
284358
}
285359

286360
// Rejecting that promise just sends null as its value,
287361
// so we need to stop if we got that.
288362
if eval.Result.Value == nil {
289363
repologger.WithFields(log.Fields{"url": url, "issue": "Headless Nil Result"}).Trace()
290364
repoStats.Inc(common.EmptyDoc)
291-
return nil
365+
return response, nil
292366
}
293367

294368
// todo: what are the data types that will always be in this json? we
@@ -298,7 +372,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
298372
log.WithFields(log.Fields{"url": url, "issue": "Json Unmarshal"}).Error(err)
299373
repologger.WithFields(log.Fields{"url": url, "issue": "Json Unmarshal"}).Error(err)
300374
repoStats.Inc(common.Issues)
301-
return (err)
375+
return response, err
302376
}
303377

304378
if len(jsonlds) > 1 {
@@ -307,43 +381,49 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
307381
for _, jsonld := range jsonlds {
308382
valid, err := isValid(v1, jsonld)
309383
if err != nil {
384+
// there could be one bad jsonld, and one good. We want to process the jsonld
385+
// so, do not set an err
310386
log.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error("error checking for valid json :", err)
311387
repologger.WithFields(log.Fields{"url": url, "issue": "invalid JSON"}).Error(err)
312388
repoStats.Inc(common.Issues)
313389
} else if valid && jsonld != "" { // traps out the root domain... should do this different
314-
sha, err := Upload(v1, mc, bucketName, k, url, jsonld)
315-
if err != nil {
316-
log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err, sha)
317-
repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
318-
repoStats.Inc(common.StoreError)
319-
} else {
320-
repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Debug()
321-
repoStats.Inc(common.Stored)
322-
}
323-
// TODO Is here where to add an entry to the KV store
324-
err = db.Update(func(tx *bolt.Tx) error {
325-
b := tx.Bucket([]byte(k))
326-
err := b.Put([]byte(url), []byte(sha))
327-
if err != nil {
328-
log.Error("Error writing to bolt", err)
329-
}
330-
return nil
331-
})
390+
response = append(response, jsonld)
391+
err = nil
392+
//sha, err := Upload(v1, mc, bucketName, k, url, jsonld)
393+
//if err != nil {
394+
// log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err, sha)
395+
// repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
396+
// repoStats.Inc(common.StoreError)
397+
//} else {
398+
// repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Debug()
399+
// repoStats.Inc(common.Stored)
400+
//}
401+
//// TODO Is here where to add an entry to the KV store
402+
//err = db.Update(func(tx *bolt.Tx) error {
403+
// b := tx.Bucket([]byte(k))
404+
// err := b.Put([]byte(url), []byte(sha))
405+
// if err != nil {
406+
// log.Error("Error writing to bolt", err)
407+
// }
408+
// return nil
409+
//})
332410
} else {
411+
// there could be one bad jsonld, and one good. We want to process the jsonld
412+
// so, do not set an err
333413
log.Info("Empty JSON-LD document found. Continuing.", url)
334414
repologger.WithFields(log.Fields{"url": url, "issue": "Empty JSON-LD document found"}).Debug()
335415
repoStats.Inc(common.EmptyDoc)
336416
// TODO Is here where to add an entry to the KV store
337-
err = db.Update(func(tx *bolt.Tx) error {
338-
b := tx.Bucket([]byte(k))
339-
err := b.Put([]byte(url), []byte("NULL")) // no JOSN-LD found at this URL
340-
if err != nil {
341-
log.Error("Error writing to bolt", err)
342-
}
343-
return nil
344-
})
417+
//err = db.Update(func(tx *bolt.Tx) error {
418+
// b := tx.Bucket([]byte(k))
419+
// err := b.Put([]byte(url), []byte("NULL")) // no JSON-LD found at this URL
420+
// if err != nil {
421+
// log.Error("Error writing to bolt", err)
422+
// }
423+
// return nil
424+
//})
345425
}
346426
}
347427

348-
return err
428+
return response, err
349429
}

0 commit comments

Comments
 (0)