-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtmlExtractor.js
143 lines (132 loc) · 6.25 KB
/
htmlExtractor.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
var fs = require('fs');
var json2csv = require('json2csv');
var mkdirp = require('mkdirp');
var csv = require('csv-parser');
var empty = require('is-empty');
// Module-level working state used by the exported converter and by writeOutputCsv.

// Sequence number embedded in the file name of each extracted HTML file.
var counter = 1; // For html extractions
// Number of 'data' events (records) received from the CSV stream.
var rowCount = 0; // For records
// Converted rows; writeOutputCsv passes this array to json2csv as `data`.
var outlines = [];
// One single-key object per mapped column: { <input header>: <output header> }.
var columnMap = [];
// Output column names; writeOutputCsv passes this array to json2csv as `fields`.
var fields = [];
// Input headers whose cell content is written to a separate HTML file.
var extractHtmlFromFields = [];
// Millisecond timestamp used as the directory name under output/.
var timeStamp = Date.now();
//{"check bij":"check bij","cluster":"cluster","vraag":"vraag","status":"status","prio":"prio",
// "antwoord":"antwoord","e_antwoord":"on","nieuwe tekst":"nieuwe tekst","e_nieuwe tekst":"on","zoekmachine termen":"TERMS",
// "zoekmachine beschrijving":"zoekmachine beschrijving","update gevraagd":"update gevraagd","\r\n":"",
// "createimportprops":"","
// importprops":"CSVEncoding=UTF8\r\nRTAEncoding=UTF8\r\nCSVSeparator=,\r\n#DateFormat=yyyy-MM-dd\r\n"}
module.exports = function(inputFile, req, callback) {
var mappingdata = req.body;
console.log('IN HTML EXTRACTOR '+JSON.stringify(mappingdata));
var options = req.session.csvopts;
/*var options = {
raw: false, // do decode to utf-8 strings
separator: csvParameters.sep, // specify optional cell separator
quote: csvParameters.qte, // specify optional quote character
escape: csvParameters.esc, // specify optional escape character (defaults to quote value)
newline: csvParameters.nl // specify a newline character
};*/
var stream = csv(options);
mkdirp('output/' + timeStamp + '/html', function (err) {
//
// Import.props?
if(mappingdata.createimportprops==="on") {
fs.writeFile('output/' + timeStamp+'/import.properties', mappingdata.importprops, function(err) {
console.log('Import props written');
});
}
fs.createReadStream(inputFile)
.pipe(stream)
.on('headers', function (hdrs) {
//console.log('IN HEADERS');
//
// Set the appropriate headers for the appropriate actions
for(var hd in hdrs) {
//
// For output
// Collect headers and extraction columns
if(!empty(mappingdata[hdrs[hd]])) {
fields.push(mappingdata[hdrs[hd]]);
var headerMap = {};
headerMap[hdrs[hd]] = mappingdata[hdrs[hd]];
columnMap.push(headerMap);
if(mappingdata["e_"+hdrs[hd]] === "on") {
//
// For extraction
extractHtmlFromFields.push(hdrs[hd]);
}
}
}
})
.on('data', function (dta) {
//console.log('IN DATA '+Date.now());
rowCount++;
if(dta) {
stream.pause();
//console.log('### dta: '+JSON.stringify(dta));
var row = {};
//
// Iterate the input headers and put data
// in object with output headers as properties
for(var colNr in columnMap) {
for(var prop in columnMap[colNr]) {
//
// Uncomment for semantics ;)
//console.log('### Input header: ' + JSON.stringify(prop));
//console.log('### Input data: ' + JSON.stringify(dta[prop]));
//console.log('### Output header: ' + JSON.stringify(columnMap[colNr][prop]));
if (!empty(prop) && !empty(dta[prop]) && !empty(columnMap[colNr][prop])) {
if (extractHtmlFromFields.includes(prop)) {
//
// Extract html to separate file
var fileLocation = 'output/' + timeStamp + '/';
var propName = columnMap[colNr][prop].replace(/\s/,'_');
var fileName = 'html/html_extraction_'+propName+'_'+ counter + '.html';
counter++;
fs.writeFile(fileLocation+fileName, dta[prop], function (err) {
//
// Replace csv contents with file location
row[columnMap[colNr][prop]] = fileName;
//
console.log('### ROW '+JSON.stringify(row));
stream.resume();
});
} else {
row[columnMap[colNr][prop]] = dta[prop];
stream.resume();
}
}
}
}
outlines.push(row);
}
})
.on('end', function () {
// We are done, write the final csv
console.log('File END with ' + rowCount + ' rows');
var timeOut = setTimeout(function() {
console.log('Setting timeout');
writeOutputCsv('output/' + timeStamp, callback);
}, 1000);
// clearTimeout(timeOut);
//stream.close();
//console.log('outlines '+JSON.stringify(outlines));
//writeOutputCsv('output/' + timeStamp, callback);
})
.on('finish', function () {
// We are done, write the final csv
console.log('File FINISH with ' + rowCount + ' rows');
//console.log('outlines '+JSON.stringify(outlines));
//writeOutputCsv('output/' + timeStamp, callback);
});
});
};
/**
 * Serialises the collected rows to CSV and saves them as result.csv.
 *
 * Reads the module-level `outlines` (rows) and `fields` (column names).
 *
 * @param {string} outputDir - Directory that receives result.csv.
 * @param {function(string)} callback - Called with outputDir after a successful save.
 * @throws {Error} (asynchronously) the write error, if saving fails.
 */
function writeOutputCsv(outputDir, callback) {
    console.log('IN OUTPUT');
    var serialized = json2csv({ data: outlines, fields: fields });
    var targetPath = outputDir + '/result.csv';

    function onSaved(writeError) {
        if (writeError) {
            throw writeError;
        }
        console.log('Saved CSV');
        callback(outputDir);
    }

    fs.writeFile(targetPath, serialized, onSaved);
}