Skip to content

Commit 7dc2cc7

Browse files
committed
fixing scrape failures
1 parent 86915d6 commit 7dc2cc7

File tree

3 files changed

+25
-78
lines changed

3 files changed

+25
-78
lines changed

notice-of-property-value-pdf/scrape.js

+2-38
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ var textract = require('textract');
44
var _ = require('underscore');
55
var walk = require('walk');
66
var path = require('path');
7-
//var fs = require('fs');
87
var incomeTest1 =
98
/(\s*)Gross Income:(\s*)We estimated gross income at \$(.*)\./;
109
var incomeTest2 = /Estimated Gross Income:(\s*)\$(.*)/;
@@ -62,19 +61,11 @@ walkOptions = {
6261
});
6362
},
6463
directories: function (root, dirStatsArray, next) {
65-
// dirStatsArray is an array of `stat` objects with the additional
66-
// attributes
67-
// * type
68-
// * error
69-
// * name
70-
7164
next();
7265
},
7366
file: function (root, fileStats, next) {
74-
//fs.readFile(fileStats.name, function () {
75-
//});
7667
if (fileStats.type === 'file' &&
77-
fileStats.name.match(/Notice of Property Value\.pdf/)) {
68+
fileStats.name.match(/Notice of Property Value\.pdf/i)) {
7869
var fullPath = path.join(root, fileStats.name);
7970
textract('application/pdf', fullPath, {
8071
preserveLineBreaks:true
@@ -93,37 +84,10 @@ walkOptions = {
9384
}
9485
},
9586
errors: function (root, nodeStatsArray, next) {
87+
console.error('walk error');
9688
next();
9789
}
9890
}
9991
};
10092

101-
//_.each(process.argv, function (path, i) {
102-
// if (i < 2) {
103-
// return;
104-
// }
105-
// var absPath = __dirname + "/" + path;
106-
//
107-
// var synchronize = setInterval(function () {
108-
// if (lock === true) {
109-
// return;
110-
// } else {
111-
// lock = true;
112-
// }
113-
// textract('application/pdf', absPath, {
114-
// preserveLineBreaks:true
115-
// }, function(error, text) {
116-
// if (error) {
117-
// console.error("Could not read '" + absPath + "': " + error);
118-
// } else {
119-
// console.log(
120-
// _.values(_.pick(parse(path, text.split('\n')), headers)).join(','));
121-
// }
122-
// lock = false;
123-
// });
124-
// clearInterval(synchronize);
125-
// }, 10);
126-
//});
127-
12893
walk.walk(process.argv[2], walkOptions);
129-
//walk.walkSync(process.argv[2], walkOptions);

quarterly-property-tax-bills-pdf/run.js

+21-38
Original file line numberDiff line numberDiff line change
@@ -22,51 +22,34 @@ var headers = [
2222

2323
function makeCallback(next) {
2424
return function(taxDoc) {
25-
if (!taxDoc) {
26-
next();
27-
return;
28-
}
29-
taxDoc.activityThrough = new Date(
30-
taxDoc.activityThrough).toISOString().split('T')[0];
31-
taxDoc.annualPropertyTax = taxDoc.annualPropertyTax ? Number(
32-
taxDoc.annualPropertyTax.replace(/[$,]/g, '')) : taxDoc.annualPropertyTax;
33-
taxDoc.billableAssessedValue = taxDoc.billableAssessedValue ? Number(
34-
taxDoc.billableAssessedValue.replace(/[$,]/g, '')) :
35-
taxDoc.billableAssessedValue;
36-
taxDoc.taxRate = taxDoc.taxRate ?
37-
Number(taxDoc.taxRate.replace(/[%]/g, '')) :
38-
taxDoc.taxRate;
39-
_.each(taxDoc, function (v, k) {
40-
if (typeof v === 'string') {
41-
if (v.search(',') !== -1) {
42-
v = v.replace(/'"'/g, '\\"');
43-
taxDoc[k] = '"' + v + '"';
25+
if (taxDoc) {
26+
taxDoc.activityThrough = new Date(
27+
taxDoc.activityThrough).toISOString().split('T')[0];
28+
taxDoc.annualPropertyTax = taxDoc.annualPropertyTax ? Number(
29+
taxDoc.annualPropertyTax.replace(/[$,]/g, '')) :
30+
taxDoc.annualPropertyTax;
31+
taxDoc.billableAssessedValue = taxDoc.billableAssessedValue ? Number(
32+
taxDoc.billableAssessedValue.replace(/[$,]/g, '')) :
33+
taxDoc.billableAssessedValue;
34+
taxDoc.taxRate = taxDoc.taxRate ?
35+
Number(taxDoc.taxRate.replace(/[%]/g, '')) :
36+
taxDoc.taxRate;
37+
_.each(taxDoc, function (v, k) {
38+
if (typeof v === 'string') {
39+
if (v.search(',') !== -1) {
40+
v = v.replace(/'"'/g, '\\"');
41+
taxDoc[k] = '"' + v + '"';
42+
}
4443
}
45-
}
46-
});
47-
console.log(_.values(_.pick(taxDoc, headers)).join(','));
44+
});
45+
console.log(_.values(_.pick(taxDoc, headers)).join(','));
46+
}
4847
next();
4948
};
5049
}
5150

5251
console.log(headers.join(','));
5352

54-
//_.each(process.argv, function (path, i) {
55-
// if (i < 2) {
56-
// return;
57-
// }
58-
// var synchronize = setInterval(function () {
59-
// if (lock === true) {
60-
// return;
61-
// } else {
62-
// lock = true;
63-
// }
64-
//
65-
// scraper(path, callback);
66-
// clearInterval(synchronize);
67-
// }, 10);
68-
//});
69-
7053
walkOptions = {
7154
listeners: {
7255
names: function (root, nodeNamesArray) {

quarterly-property-tax-bills-pdf/scrape.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,17 @@ var scrape = function (filepath, callback) {
99
if (err) {
1010
console.error(err);
1111
console.error('error with: ' + filepath);
12+
callback();
1213
} else {
1314
try {
1415
var taxDoc = parse_pdf(text.split(" "));
1516
callback(taxDoc);
16-
return;
1717
} catch (err) {
1818
console.error(err);
1919
console.error('error with: ' + filepath);
20+
callback();
2021
}
2122
}
22-
callback();
2323
})
2424
}
2525

0 commit comments

Comments
 (0)