Skip to content

Commit 603be50

Browse files
Store Technology meta data in BQ and icons in GCS (#73)
* dependency fix and update * uploads * roll back babel-eslint * upload icons * lint * full schema * fix name * service account auth * upload workflow * Merge main into bq-upload * icons upload tested * lint * typo
1 parent dd1b4e9 commit 603be50

14 files changed

+1178
-258
lines changed

.eslintrc.js

-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ module.exports = {
1010
extends: [
1111
'@nuxtjs',
1212
'prettier',
13-
'prettier/vue',
1413
'plugin:prettier/recommended',
1514
'plugin:nuxt/recommended',
1615
'plugin:json/recommended',

.github/workflows/test.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
WPT_SERVER: "webpagetest.httparchive.org"
3535
WPT_API_KEY: ${{ secrets.HA_API_KEY }}
3636
PR_NUMBER: ${{ github.event.pull_request.number }}
37-
run: yarn test
37+
run: yarn run test
3838

3939
- name: Run WebPageTest for more websites
4040
id: wpt-test

.github/workflows/upload.yml

+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Validates and tests the technology definitions, then uploads them to GCP.
# Runs on pushes to main that touch the definition files, or on manual dispatch.
name: Tests

on:
  push:
    branches:
      - main
    paths:
      - "src/technologies/*.json"
      - "src/categories.json"
      - "src/groups.json"
  workflow_dispatch:

jobs:
  test:
    name: Test and upload to GCP
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          # NOTE(review): this workflow has no pull_request trigger, so this
          # expression presumably evaluates to an empty string and the default
          # ref for the event is checked out - confirm.
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 0

      - name: Install dependencies
        run: yarn install

      - name: Validate
        run: yarn run validate

      - name: Run WebPageTest with unit tests
        id: unit-test
        env:
          WPT_SERVER: "webpagetest.httparchive.org"
          WPT_API_KEY: ${{ secrets.HA_API_KEY }}
          # NOTE(review): presumably empty for push / workflow_dispatch runs - confirm.
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: yarn run test

      - name: Upload to GCP
        id: upload
        env:
          GCP_PROJECT_ID: ${{ secrets.GCP_PROJECT_ID }}
          GCP_SA_KEY: ${{ secrets.GCP_SA_KEY }}
        run: |
          # Quote the secret so the shell writes the JSON key verbatim instead
          # of word-splitting and glob-expanding its contents.
          printf '%s' "$GCP_SA_KEY" > /tmp/gcp_key.json
          yarn run upload

package.json

+8-5
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
"convert-svg-to-png": "^0.5.0"
66
},
77
"devDependencies": {
8+
"@google-cloud/bigquery": "^7.7.0",
9+
"@google-cloud/storage": "^7.11.0",
810
"@nuxtjs/eslint-config": "^3.1.0",
911
"@nuxtjs/eslint-module": "^2.0.0",
1012
"babel-eslint": "^10.1.0",
@@ -19,12 +21,13 @@
1921
"webpagetest": "github:HTTPArchive/WebPageTest.api-nodejs"
2022
},
2123
"scripts": {
22-
"lint": "eslint src/**/*.{js,json} tests/**/*.js bin/**/*.js && jsonlint -jsV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -js --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
23-
"lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js bin/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
24-
"validate": "yarn run lint && node ./bin/validate.js",
24+
"lint": "eslint src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -jsV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -js --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
25+
"lint:fix": "eslint --fix src/**/*.{js,json} tests/**/*.js scripts/**/*.js && jsonlint -isV ./schema.json --trim-trailing-commas --enforce-double-quotes ./src/technologies/ && jsonlint -is --trim-trailing-commas --enforce-double-quotes ./src/categories.json",
26+
"validate": "yarn run lint && node ./scripts/validate.js",
2527
"test": "jest",
26-
"convert": "node --no-warnings ./bin/convert.js",
27-
"build": "yarn run validate && yarn run convert && node ./bin/build.js"
28+
"upload": "node ./scripts/upload_technology.js",
29+
"convert": "node --no-warnings ./scripts/convert.js",
30+
"build": "yarn run validate && yarn run convert && node ./scripts/build.js"
2831
},
2932
"jest": {
3033
"reporters": [

bin/build.js scripts/build.js

File renamed without changes.

bin/convert.js scripts/convert.js

File renamed without changes.

scripts/upload_icons.js

+58
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/* eslint-disable no-console */
2+
const fs = require('fs')
3+
const path = require('path')
4+
const { Storage } = require('@google-cloud/storage')
5+
6+
// Configuration
// Name of the GCS bucket that syncIcons uploads into
const BUCKET_NAME = 'technology_detections'
// Local directory where your PNG icons are stored
const ICONS_DIR = path.resolve(__dirname, '../src/images/icons/converted')

// Cloud Storage client configured with the key file at this fixed path
const storage = new Storage({
  keyFilename: '/tmp/gcp_key.json',
})
13+
14+
async function syncIcons() {
15+
const bucket = storage.bucket(BUCKET_NAME)
16+
17+
// Get list of files in the bucket
18+
const [filesInBucket] = await bucket.getFiles()
19+
const bucketFilesMap = new Map(
20+
filesInBucket.map((file) => [
21+
file.name,
22+
new Date(file.metadata.updated).getTime(),
23+
])
24+
)
25+
26+
// Read all files from the local icons directory
27+
const localFiles = fs
28+
.readdirSync(ICONS_DIR)
29+
.filter((file) => file.endsWith('.png'))
30+
31+
for (const file of localFiles) {
32+
const filePath = path.join(ICONS_DIR, file)
33+
const fileMetadata = fs.statSync(filePath)
34+
const fileInBucketUpdatedTime = bucketFilesMap.get(file)
35+
36+
// Upload file if it's new or has been updated
37+
if (
38+
!fileInBucketUpdatedTime ||
39+
fileMetadata.mtime.getTime() > fileInBucketUpdatedTime
40+
) {
41+
try {
42+
await bucket.upload(filePath, {
43+
destination: 'icons/' + file,
44+
metadata: {
45+
contentType: 'image/png',
46+
},
47+
})
48+
console.log(`Uploaded: ${file}`)
49+
} catch (err) {
50+
console.error(`Error uploading file ${file}:`, err)
51+
}
52+
} else {
53+
console.log(`File already exists and is up to date: ${file}`)
54+
}
55+
}
56+
}
57+
58+
// Run the sync; on failure log the error and exit non-zero so that callers
// (for example a CI step) can tell the sync did not complete.
syncIcons().catch((err) => {
  console.error(err)
  process.exitCode = 1
})

scripts/upload_technology.js

+220
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
/* eslint-disable no-console */
2+
// A script to upload technologies and their categories to BigQuery.
3+
4+
const fs = require('fs')
5+
const path = require('path')
6+
const { BigQuery } = require('@google-cloud/bigquery')
7+
8+
/**
 * Read every `.json` file in a directory and merge their top-level keys into
 * one object.
 *
 * Directory entries without a `.json` extension are skipped, so a stray
 * non-JSON file does not abort the run with a parse error. When a key appears
 * in more than one file, the file read later wins.
 *
 * @param {string} directory - Path of the directory to read.
 * @returns {Object} Merged contents of all JSON files (`{}` when there are none).
 * @throws {SyntaxError} When a `.json` file does not contain valid JSON.
 */
const readJsonFiles = (directory) =>
  fs
    .readdirSync(directory)
    .filter((file) => path.extname(file).toLowerCase() === '.json')
    .reduce((mergedData, file) => {
      const data = fs.readFileSync(path.join(directory, file), 'utf8')
      return { ...mergedData, ...JSON.parse(data) }
    }, {})
16+
17+
/**
 * Normalise a field that may be a single string, an array, or absent.
 *
 * @param {*} value - Raw field value.
 * @returns {Array} `[value]` for a string, `value` itself for an array,
 *   otherwise an empty array.
 */
const getArray = (value) => {
  if (typeof value === 'string') {
    return [value]
  }
  return Array.isArray(value) ? value : []
}
19+
20+
/**
 * Convert a rule field into a list of `{ name, value }` records.
 *
 * - string: one record named after the string, with a `null` value
 * - array: one record per element, each with a `null` value
 * - object: one record per key; object values are JSON-encoded, every other
 *   value is converted to a string
 * - anything else (including `null` and `undefined`): an empty list
 *
 * @param {*} value - Raw rule field.
 * @returns {Array<{name: string, value: (string|null)}>} Rule records.
 */
const getRuleObject = (value) => {
  if (typeof value === 'string') {
    return [{ name: value, value: null }]
  }
  if (Array.isArray(value)) {
    return value.map((key) => ({ name: key, value: null }))
  }
  // `typeof null` is 'object', so exclude it explicitly before reading keys
  if (value !== null && typeof value === 'object') {
    return Object.entries(value).map(([name, raw]) => ({
      name,
      value: typeof raw === 'object' ? JSON.stringify(raw) : String(raw),
    }))
  }
  return []
}
38+
39+
/**
 * Load a data source into a BigQuery table using the fixed technology schema.
 *
 * @param {*} data - Source handed to `table.load` (`main` passes the path of a
 *   newline-delimited JSON file).
 * @param {string} [tableName='apps'] - Destination table.
 * @param {string} [datasetName='wappalyzer'] - Destination dataset.
 * @param {string} [writeDisposition='WRITE_TRUNCATE'] - Load job write disposition.
 * @param {string} [sourceFormat='NEWLINE_DELIMITED_JSON'] - Load job source format.
 * @returns {Promise<void>}
 * @throws {Error} When `data` is falsy, or when the load job reports errors.
 */
const loadToBigQuery = async (
  data,
  tableName = 'apps',
  datasetName = 'wappalyzer',
  writeDisposition = 'WRITE_TRUNCATE',
  sourceFormat = 'NEWLINE_DELIMITED_JSON'
) => {
  if (!data) {
    throw new Error(`No data to load to \`${datasetName}.${tableName}\`.`)
  }

  const bigquery = new BigQuery({
    keyFilename: '/tmp/gcp_key.json',
  })

  // Schema building blocks; every rule-style field shares one RECORD shape
  const scalar = (name, type = 'STRING') => ({ name, type })
  const repeatedString = (name) => ({
    name,
    type: 'STRING',
    mode: 'REPEATED',
  })
  const ruleRecord = (name) => ({
    name,
    type: 'RECORD',
    mode: 'REPEATED',
    fields: [scalar('name'), scalar('value')],
  })

  const schema = {
    fields: [
      scalar('name'),
      repeatedString('categories'),
      scalar('website'),
      scalar('description'),
      scalar('icon'),
      scalar('cpe'),
      scalar('saas', 'BOOLEAN'),
      scalar('oss', 'BOOLEAN'),
      repeatedString('pricing'),
      repeatedString('implies'),
      repeatedString('requires'),
      repeatedString('requiresCategory'),
      repeatedString('excludes'),
      ruleRecord('cookies'),
      ruleRecord('dom'),
      ruleRecord('dns'),
      ruleRecord('js'),
      ruleRecord('headers'),
      repeatedString('text'),
      repeatedString('css'),
      ruleRecord('probe'),
      repeatedString('robots'),
      repeatedString('url'),
      repeatedString('xhr'),
      ruleRecord('meta'),
      repeatedString('scriptSrc'),
      repeatedString('script'),
      repeatedString('html'),
    ],
  }

  const options = { schema, sourceFormat, writeDisposition }
  const [job] = await bigquery
    .dataset(datasetName)
    .table(tableName)
    .load(data, options)

  const errors = (job.status && job.status.errors) || []
  if (errors.length > 0) {
    console.error('Errors encountered:', errors)
    throw new Error('Error loading data into BigQuery')
  }

  // NOTE(review): the BigQuery job resource reports loaded rows under
  // `statistics.load.outputRows`; `numRowsLoaded` is kept as a fallback.
  // Confirm against the client version in use.
  const loadStatistics = (job.statistics && job.statistics.load) || {}
  const rowsLoaded =
    loadStatistics.outputRows !== undefined
      ? loadStatistics.outputRows
      : job.numRowsLoaded

  console.log(`Loaded ${rowsLoaded} rows into ${datasetName}.${tableName}...`)
}
157+
158+
/**
 * Build one row per technology from the JSON definitions, write the rows to a
 * temporary newline-delimited JSON file, and load that file into the `apps`
 * table.
 *
 * Paths are relative to the current working directory. The temporary file is
 * removed whether or not the load succeeds.
 *
 * @returns {Promise<void>}
 * @throws {Error} When a technology references a category id that is not in
 *   `categories.json`, or when the load fails.
 */
const main = async () => {
  const technologies = readJsonFiles('./src/technologies')
  const categories = JSON.parse(
    fs.readFileSync('./src/categories.json', 'utf8')
  )

  // Fields normalised to an array of strings
  const arrayFields = [
    'implies',
    'requires',
    'requiresCategory',
    'excludes',
    'text',
    'css',
    'robots',
    'url',
    'xhr',
    'scriptSrc',
    'script',
    'html',
  ]
  // Fields normalised to a list of { name, value } records
  const ruleFields = ['cookies', 'dom', 'dns', 'js', 'headers', 'probe', 'meta']
  // Fields copied through unchanged
  const plainFields = [
    'website',
    'description',
    'icon',
    'cpe',
    'saas',
    'oss',
    'pricing',
  ]

  const transformedTechnologies = Object.entries(technologies).map(
    ([name, technology]) => {
      const app = {
        name,
        categories: technology.cats.map((category) => {
          const details = categories[category]
          if (!details) {
            // Name the offending technology instead of failing on `.name`
            throw new Error(
              `Unknown category \`${category}\` in technology \`${name}\`.`
            )
          }
          return details.name
        }),
      }

      for (const field of arrayFields) {
        app[field] = getArray(technology[field])
      }
      for (const field of ruleFields) {
        app[field] = getRuleObject(technology[field])
      }
      for (const field of plainFields) {
        app[field] = technology[field]
      }

      return app
    }
  )

  const transformedTechnologiesJsonL = transformedTechnologies
    .map((line) => JSON.stringify(line))
    .join('\n')
  const filePath = './transformedTechnologies.jsonl'
  fs.writeFileSync(filePath, transformedTechnologiesJsonL)

  try {
    await loadToBigQuery(filePath, 'apps')
  } finally {
    // cleanup file, even when the load failed
    fs.unlinkSync(filePath)
  }
}
219+
220+
// Run the upload; on failure log the error and exit non-zero so that the
// calling process (for example the CI upload step) is marked as failed.
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
File renamed without changes.

src/technologies/r.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -632,10 +632,10 @@
632632
"description": "React Router provides declarative routing for React.",
633633
"icon": "React Router.svg",
634634
"implies": "React",
635-
"oss": true,
636635
"js": {
637636
"__reactRouterVersion": "([\\d\\.]+)\\;version:\\1"
638637
},
638+
"oss": true,
639639
"website": "https://reactrouter.com"
640640
},
641641
"Reactive": {

src/technologies/s.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -4132,8 +4132,8 @@
41324132
"description": "Slider Revolution is a flexible and highly customisable slider.",
41334133
"icon": "Slider Revolution.svg",
41344134
"js": {
4135-
"SR7.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
41364135
"RS_MODULES.main.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
4136+
"SR7.version": "^Slider Revolution\\s([\\d\\.]+)$\\;version:\\1",
41374137
"revapi1": "",
41384138
"revapi2": "",
41394139
"revapi3": "",

0 commit comments

Comments
 (0)