forked from LuisCorales/webcrawler-js
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: app.js
110 lines (91 loc) · 4.15 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
const puppeteer = require('puppeteer');
const fs = require('fs');
// Page that gets scraped when the script runs (passed to scrapeData at the bottom of the file)
const url = 'https://news.ycombinator.com/';
/**
 * Scrapes the list of news shown at the specified URL. The CSS selectors used are
 * written for the Hacker News website, so it only works for that site.
 * NOTE(review): the selectors depend on the page markup; confirm they still match the live site.
 * @param {string} url The website url
 * @returns {Promise<Array<{order: number, title: string, comments: number, points: number}>>}
 *          One object per ranked news item, in page order
 */
const scrapeData = (async (url) =>
{
    const browser = await puppeteer.launch({headless: true});

    // try/finally guarantees the browser is closed even when navigation or scraping throws
    try {
        const page = await browser.newPage();
        await page.goto(url);

        // The callback is executed inside the page, so it can only rely on what it declares itself
        return await page.evaluate(() =>
        {
            const orderRows = [];
            const titleRows = [];
            const commentRows = [];
            const pointsRows = [];

            // Rank and title of each news item
            document.querySelectorAll('.athing .title .rank').forEach((element) => orderRows.push(parseInt(element.innerText, 10)));
            document.querySelectorAll('.athing .title a.titleLink').forEach((element) => titleRows.push(element.innerText));

            // Each '.subtext' row holds the points and the comments of one news item
            document.querySelectorAll('.subtext').forEach((element) => {
                // Only digits followed by the word "comment" are taken as a comment count, so a
                // trailing element with other digits (for example an age) is not mistaken for one
                const lastChild = element.lastElementChild;
                const commentMatch = lastChild ? lastChild.innerText.match(/(\d+)\s*comment/) : null;
                commentRows.push(commentMatch ? parseInt(commentMatch[1], 10) : 0);

                // Look the score up inside its own row: a row without a score gets 0 points and
                // the rows stay aligned regardless of how many news items the page lists
                const scoreElement = element.querySelector('.score');
                pointsRows.push(scoreElement ? parseInt(scoreElement.innerText, 10) : 0);
            });

            // Create an object of each news item, matching the collected rows by position
            return orderRows.map((order, index) => ({
                order,
                title: titleRows[index],
                comments: commentRows[index],
                points: pointsRows[index],
            }));
        });
    } finally {
        await browser.close();
    }
});
/**
 * Keeps the news whose title has 5 words or fewer and sorts them by points, highest first.
 * Words are counted as whitespace-separated tokens, so repeated, leading or trailing
 * whitespace does not inflate the count. The array passed in is left untouched.
 * @param {Array<{title: string, points: number}>} objectsArray The array of scraped news objects
 * @returns {Array<Object>} A new array with the short-titled news sorted by descending points.
 */
const filterDataByPoints = (objectsArray => {
    const countWords = (title) => {
        // A missing title counts as zero words instead of throwing
        if (typeof title !== 'string') return 0;
        return title.split(/\s+/).filter(Boolean).length;
    };
    // Non-numeric points are ranked as 0 so the comparator never returns NaN
    const toNumber = (value) => (Number.isFinite(value) ? value : 0);

    // filter() returns a fresh array, so the in-place sort does not affect the caller's array
    return objectsArray
        .filter((obj) => countWords(obj.title) <= 5)
        .sort((a, b) => toNumber(b.points) - toNumber(a.points));
});
/**
 * Keeps the news whose title has more than 5 words and sorts them by number of comments,
 * highest first. Words are counted as whitespace-separated tokens, so repeated, leading or
 * trailing whitespace does not inflate the count. The array passed in is left untouched.
 * @param {Array<{title: string, comments: number}>} objectsArray The array of scraped news objects.
 * @returns {Array<Object>} A new array with the long-titled news sorted by descending comments.
 */
const filterDataByComments = (objectsArray => {
    const countWords = (title) => {
        // A missing title counts as zero words instead of throwing
        if (typeof title !== 'string') return 0;
        return title.split(/\s+/).filter(Boolean).length;
    };
    // Non-numeric comment counts are ranked as 0 so the comparator never returns NaN
    const toNumber = (value) => (Number.isFinite(value) ? value : 0);

    // filter() returns a fresh array, so the in-place sort does not affect the caller's array
    return objectsArray
        .filter((obj) => countWords(obj.title) > 5)
        .sort((a, b) => toNumber(b.comments) - toNumber(a.comments));
});
// Entry point: scrapes the news once and writes the full list plus both filtered views as JSON files
(async () => {
    try {
        const allNews = await scrapeData(url);

        // writeFileSync does not create missing directories, so make sure the output folder exists
        fs.mkdirSync("./data", {recursive: true});

        // Create a json file for all the news
        fs.writeFileSync("./data/news.json", JSON.stringify(allNews));

        // Create a json file for the news filtered by comments
        fs.writeFileSync("./data/newsByComments.json", JSON.stringify(filterDataByComments(allNews)));

        // Create a json file for the news filtered by points
        fs.writeFileSync("./data/newsByPoints.json", JSON.stringify(filterDataByPoints(allNews)));
    } catch (error) {
        // Report the failure and exit with a non-zero code instead of leaving an unhandled rejection
        console.error("Failed to scrape or save the news:", error);
        process.exitCode = 1;
    }
})();