6
6
"fmt"
7
7
"github.com/gleanerio/gleaner/internal/common"
8
8
log "github.com/sirupsen/logrus"
9
-
10
9
"time"
11
10
12
11
configTypes "github.com/gleanerio/gleaner/internal/config"
@@ -18,6 +17,7 @@ import (
18
17
"github.com/mafredri/cdp/rpcc"
19
18
minio "github.com/minio/minio-go/v7"
20
19
"github.com/spf13/viper"
20
+ "github.com/valyala/fasttemplate"
21
21
bolt "go.etcd.io/bbolt"
22
22
)
23
23
@@ -55,7 +55,7 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
55
55
56
56
for i := range m [k ] {
57
57
58
- err := PageRender (v1 , mc , 60 * time .Second , m [k ][i ], k , db , repologger , r ) // TODO make delay configurable
58
+ err := PageRenderAndUpload (v1 , mc , 60 * time .Second , m [k ][i ], k , db , repologger , r ) // TODO make delay configurable
59
59
if err != nil {
60
60
log .Error (m [k ][i ], "::" , err )
61
61
}
@@ -129,7 +129,7 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
129
129
// //thread management
130
130
// semaphoreChan <- struct{}{}
131
131
//
132
- // err := PageRender (v1, mc, 60*time.Second, m[k][i], k, db) // TODO make delay configurable
132
+ // err := PageRenderAndUpload (v1, mc, 60*time.Second, m[k][i], k, db) // TODO make delay configurable
133
133
// if err != nil {
134
134
// log.Error(m[k][i], "::", err)
135
135
// }
@@ -148,30 +148,78 @@ func HeadlessNG(v1 *viper.Viper, mc *minio.Client, m map[string][]string, db *bo
148
148
//
149
149
//}
150
150
151
- func PageRender (v1 * viper.Viper , mc * minio.Client , timeout time.Duration , url , k string , db * bolt.DB , repologger * log.Logger , repoStats * common.RepoStats ) error {
152
- repologger .WithFields (log.Fields {"url" : url }).Trace ("PageRender" )
153
- ctx , cancel := context .WithTimeout (context .Background (), timeout )
154
- defer cancel ()
151
+ func PageRenderAndUpload (v1 * viper.Viper , mc * minio.Client , timeout time.Duration , url , k string , db * bolt.DB , repologger * log.Logger , repoStats * common.RepoStats ) error {
152
+ repologger .WithFields (log.Fields {"url" : url }).Trace ("PageRenderAndUpload" )
153
+ // page render handles this
154
+ //ctx, cancel := context.WithTimeout(context.Background(), timeout)
155
+ //defer cancel()
155
156
156
157
// read config file
157
158
//miniocfg := v1.GetStringMapString("minio")
158
159
//bucketName := miniocfg["bucket"] // get the top level bucket for all of gleaner operations from config file
159
160
bucketName , err := configTypes .GetBucketName (v1 )
160
161
161
162
//mcfg := v1.GetStringMapString("summoner")
163
+ //mcfg, err := configTypes.ReadSummmonerConfig(v1.Sub("summoner"))
164
+
165
+ jsonlds , err := PageRender (v1 , timeout , url , k , repologger , repoStats )
166
+
167
+ if err != nil { // from page render
168
+ if len (jsonlds ) > 1 {
169
+ repologger .WithFields (log.Fields {"url" : url , "issue" : "Multiple JSON" }).Debug (err )
170
+ }
171
+ for _ , jsonld := range jsonlds {
172
+ sha , err := Upload (v1 , mc , bucketName , k , url , jsonld )
173
+ if err != nil {
174
+ log .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Error uploading jsonld to object store" }).Error ("Error uploading jsonld to object store:" , url , err , sha )
175
+ repologger .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Error uploading jsonld to object store" }).Error (err )
176
+ repoStats .Inc (common .StoreError )
177
+ } else {
178
+ repologger .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Uploaded JSONLD to object store" }).Debug ()
179
+ repoStats .Inc (common .Stored )
180
+ }
181
+ // TODO Is here where to add an entry to the KV store
182
+ err = db .Update (func (tx * bolt.Tx ) error {
183
+ b := tx .Bucket ([]byte (k ))
184
+ err := b .Put ([]byte (url ), []byte (sha ))
185
+ if err != nil {
186
+ log .Error ("Error writing to bolt" , err )
187
+ }
188
+ return nil
189
+ })
190
+ }
191
+ }
192
+ return err
193
+ }
194
+
195
+ func PageRender (v1 * viper.Viper , timeout time.Duration , url , k string , repologger * log.Logger , repoStats * common.RepoStats ) ([]string , error ) {
196
+ repologger .WithFields (log.Fields {"url" : url }).Trace ("PageRender" )
197
+ retries := 3
198
+ sources , err := configTypes .GetSources (v1 )
199
+ source , err := configTypes .GetSourceByName (sources , k )
200
+ headlessWait := source .HeadlessWait
201
+ if timeout * time .Duration (retries ) < time .Duration (headlessWait )* time .Second {
202
+ timeout = time .Duration (headlessWait ) * time .Second
203
+ }
204
+
205
+ ctx , cancel := context .WithTimeout (context .Background (), timeout * time .Duration (retries ))
206
+ defer cancel ()
207
+ response := []string {}
208
+ // read config file
162
209
mcfg , err := configTypes .ReadSummmonerConfig (v1 .Sub ("summoner" ))
163
210
164
211
// Use the DevTools HTTP/JSON API to manage targets (e.g. pages, webworkers).
165
212
//devt := devtool.New(mcfg["headless"])
166
213
devt := devtool .New (mcfg .Headless )
214
+
167
215
pt , err := devt .Get (ctx , devtool .Page )
168
216
if err != nil {
169
217
pt , err = devt .Create (ctx )
170
218
if err != nil {
171
219
log .WithFields (log.Fields {"url" : url , "issue" : "Not REPO FAULT. Devtools... Is Headless Container running?" }).Error (err )
172
220
repologger .WithFields (log.Fields {"url" : url }).Error ("Not REPO FAULT. Devtools... Is Headless Container running?" )
173
221
repoStats .Inc (common .HeadlessError )
174
- return err
222
+ return response , err
175
223
}
176
224
}
177
225
@@ -181,7 +229,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
181
229
log .WithFields (log.Fields {"url" : url , "issue" : "Not REPO FAULT. Devtools... Is Headless Container running?" }).Error (err )
182
230
repologger .WithFields (log.Fields {"url" : url }).Error ("Not REPO FAULT. Devtools... Is Headless Container running?" )
183
231
repoStats .Inc (common .HeadlessError )
184
- return err
232
+ return response , err
185
233
}
186
234
defer conn .Close () // Leaving connections open will leak memory.
187
235
@@ -194,7 +242,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
194
242
log .WithFields (log.Fields {"url" : url , "issue" : "Not REPO FAULT. Devtools... Is Headless Container running?" }).Error (err )
195
243
repologger .WithFields (log.Fields {"url" : url }).Error ("Not REPO FAULT. Devtools... Is Headless Container running?" )
196
244
repoStats .Inc (common .HeadlessError )
197
- return err
245
+ return response , err
198
246
}
199
247
200
248
// Open a DOMContentEventFired client to buffer this event.
@@ -203,26 +251,42 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
203
251
log .WithFields (log.Fields {"url" : url , "issue" : "Not REPO FAULT. Devtools... Is Headless Container running?" }).Error (err )
204
252
repologger .WithFields (log.Fields {"url" : url }).Error ("Not REPO FAULT. Devtools... Is Headless Container running?" )
205
253
repoStats .Inc (common .HeadlessError )
206
- return err
254
+ return response , err
207
255
}
208
256
defer domContent .Close ()
209
257
258
+ // Open a LoadEventFired client to buffer this event.
259
+ loadEventFired , err := c .Page .LoadEventFired (ctx )
260
+ if err != nil {
261
+ log .WithFields (log.Fields {"url" : url , "issue" : "Not REPO FAULT. Devtools... Is Headless Container running?" }).Error (err )
262
+ repologger .WithFields (log.Fields {"url" : url }).Error ("Not REPO FAULT. Devtools... Is Headless Container running?" )
263
+ repoStats .Inc (common .HeadlessError )
264
+ return response , err
265
+ }
266
+ defer loadEventFired .Close ()
267
+
210
268
// Create the Navigate arguments with the optional Referrer field set.
211
269
navArgs := page .NewNavigateArgs (url )
212
270
nav , err := c .Page .Navigate (ctx , navArgs )
213
271
if err != nil {
214
272
log .WithFields (log.Fields {"url" : url , "issue" : "Navigate To Headless" }).Error (err )
215
273
repologger .WithFields (log.Fields {"url" : url , "issue" : "Navigate To Headless" }).Error (err )
216
274
repoStats .Inc (common .HeadlessError )
217
- return err
275
+ return response , err
218
276
}
219
277
278
+ _ , err = loadEventFired .Recv ()
279
+ if err != nil {
280
+ return nil , err
281
+ }
282
+ loadEventFired .Close ()
283
+
220
284
// Wait until we have a DOMContentEventFired event.
221
285
if _ , err = domContent .Recv (); err != nil {
222
286
log .WithFields (log.Fields {"url" : url , "issue" : "Dom Error" }).Error (err )
223
287
repologger .WithFields (log.Fields {"url" : url , "issue" : "Dom Error" }).Error (err )
224
288
repoStats .Inc (common .HeadlessError )
225
- return err
289
+ return response , err
226
290
}
227
291
228
292
log .WithFields (log.Fields {"url" : url , "issue" : "Navigate Complete" }).Debug (nav .FrameID , "for" , url )
@@ -235,7 +299,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
235
299
* I cannot figure out how to get the cdp Runtime to distinguish between a resolved and a rejected
236
300
* promise - so in this case, we simply do not index a document, and fail silently.
237
301
**/
238
- expression := `
302
+ expressionTmpl := `
239
303
function getMetadata() {
240
304
return new Promise((resolve, reject) => {
241
305
const elements = document.querySelectorAll('script[type="application/ld+json"]');
@@ -249,12 +313,12 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
249
313
resolve(metadata);
250
314
}
251
315
else {
252
- reject("No JSON-LD present after 1 second.");
316
+ reject("No JSON-LD present after {{timeout}} second.");
253
317
}
254
318
});
255
319
}
256
320
257
- function retry(fn, retriesLeft = 3 , interval = 1000 ) {
321
+ function retry(fn, retriesLeft = {{retries}} , interval = {{timeout}} ) {
258
322
return new Promise((resolve, reject) => {
259
323
fn()
260
324
.then(resolve)
@@ -270,25 +334,35 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
270
334
});
271
335
});
272
336
}
337
+ function sleep(ms) {
338
+ return new Promise(resolve => setTimeout(resolve, ms));
339
+ }
273
340
274
- retry(getMetadata);
341
+ sleep( {{headlesswait}} ).then( () => { return retry(getMetadata) } );
342
+
275
343
`
276
-
344
+ tmpl := fasttemplate .New (expressionTmpl , "{{" , "}}" )
345
+ expression := tmpl .ExecuteString (map [string ]interface {}{
346
+ "timeout" : fmt .Sprintf ("%d" , timeout .Milliseconds ()),
347
+ "headlesswait" : fmt .Sprintf ("%d" , headlessWait * 1000 ),
348
+ "retries" : "3" ,
349
+ })
350
+ log .Trace (expression )
277
351
evalArgs := runtime .NewEvaluateArgs (expression ).SetAwaitPromise (true ).SetReturnByValue (true )
278
352
eval , err := c .Runtime .Evaluate (ctx , evalArgs )
279
353
if err != nil {
280
354
log .WithFields (log.Fields {"url" : url , "issue" : "Headless Evaluate" }).Error (err )
281
355
repologger .WithFields (log.Fields {"url" : url , "issue" : "Headless Evaluate" }).Error (err )
282
356
repoStats .Inc (common .Issues )
283
- return ( err )
357
+ return response , err
284
358
}
285
359
286
360
// Rejecting that promise just sends null as its value,
287
361
// so we need to stop if we got that.
288
362
if eval .Result .Value == nil {
289
363
repologger .WithFields (log.Fields {"url" : url , "issue" : "Headless Nil Result" }).Trace ()
290
364
repoStats .Inc (common .EmptyDoc )
291
- return nil
365
+ return response , nil
292
366
}
293
367
294
368
// todo: what are the data types that will always be in this json? we
@@ -298,7 +372,7 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
298
372
log .WithFields (log.Fields {"url" : url , "issue" : "Json Unmarshal" }).Error (err )
299
373
repologger .WithFields (log.Fields {"url" : url , "issue" : "Json Unmarshal" }).Error (err )
300
374
repoStats .Inc (common .Issues )
301
- return ( err )
375
+ return response , err
302
376
}
303
377
304
378
if len (jsonlds ) > 1 {
@@ -307,43 +381,49 @@ func PageRender(v1 *viper.Viper, mc *minio.Client, timeout time.Duration, url, k
307
381
for _ , jsonld := range jsonlds {
308
382
valid , err := isValid (v1 , jsonld )
309
383
if err != nil {
384
+ // there could be one bad jsonld, and one good. We want to process the jsonld
385
+ // so, do not set an err
310
386
log .WithFields (log.Fields {"url" : url , "issue" : "invalid JSON" }).Error ("error checking for valid json :" , err )
311
387
repologger .WithFields (log.Fields {"url" : url , "issue" : "invalid JSON" }).Error (err )
312
388
repoStats .Inc (common .Issues )
313
389
} else if valid && jsonld != "" { // traps out the root domain... should do this different
314
- sha , err := Upload (v1 , mc , bucketName , k , url , jsonld )
315
- if err != nil {
316
- log .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Error uploading jsonld to object store" }).Error ("Error uploading jsonld to object store:" , url , err , sha )
317
- repologger .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Error uploading jsonld to object store" }).Error (err )
318
- repoStats .Inc (common .StoreError )
319
- } else {
320
- repologger .WithFields (log.Fields {"url" : url , "sha" : sha , "issue" : "Uploaded JSONLD to object store" }).Debug ()
321
- repoStats .Inc (common .Stored )
322
- }
323
- // TODO Is here where to add an entry to the KV store
324
- err = db .Update (func (tx * bolt.Tx ) error {
325
- b := tx .Bucket ([]byte (k ))
326
- err := b .Put ([]byte (url ), []byte (sha ))
327
- if err != nil {
328
- log .Error ("Error writing to bolt" , err )
329
- }
330
- return nil
331
- })
390
+ response = append (response , jsonld )
391
+ err = nil
392
+ //sha, err := Upload(v1, mc, bucketName, k, url, jsonld)
393
+ //if err != nil {
394
+ // log.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error("Error uploading jsonld to object store:", url, err, sha)
395
+ // repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Error uploading jsonld to object store"}).Error(err)
396
+ // repoStats.Inc(common.StoreError)
397
+ //} else {
398
+ // repologger.WithFields(log.Fields{"url": url, "sha": sha, "issue": "Uploaded JSONLD to object store"}).Debug()
399
+ // repoStats.Inc(common.Stored)
400
+ //}
401
+ //// TODO Is here where to add an entry to the KV store
402
+ //err = db.Update(func(tx *bolt.Tx) error {
403
+ // b := tx.Bucket([]byte(k))
404
+ // err := b.Put([]byte(url), []byte(sha))
405
+ // if err != nil {
406
+ // log.Error("Error writing to bolt", err)
407
+ // }
408
+ // return nil
409
+ //})
332
410
} else {
411
+ // there could be one bad jsonld, and one good. We want to process the jsonld
412
+ // so, do not set an err
333
413
log .Info ("Empty JSON-LD document found. Continuing." , url )
334
414
repologger .WithFields (log.Fields {"url" : url , "issue" : "Empty JSON-LD document found" }).Debug ()
335
415
repoStats .Inc (common .EmptyDoc )
336
416
// TODO Is here where to add an entry to the KV store
337
- err = db .Update (func (tx * bolt.Tx ) error {
338
- b := tx .Bucket ([]byte (k ))
339
- err := b .Put ([]byte (url ), []byte ("NULL" )) // no JOSN-LD found at this URL
340
- if err != nil {
341
- log .Error ("Error writing to bolt" , err )
342
- }
343
- return nil
344
- })
417
+ // err = db.Update(func(tx *bolt.Tx) error {
418
+ // b := tx.Bucket([]byte(k))
419
+ // err := b.Put([]byte(url), []byte("NULL")) // no JOSN-LD found at this URL
420
+ // if err != nil {
421
+ // log.Error("Error writing to bolt", err)
422
+ // }
423
+ // return nil
424
+ // })
345
425
}
346
426
}
347
427
348
- return err
428
+ return response , err
349
429
}
0 commit comments