Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- "use strict";
- const fs = require('fs').promises;
- const html = require('node-html-parser');
- const request = require('request');
- // - Read HTML document INPUT_FILE from file system
- // - Extract all <a> tags
- //
- // The <a href> links are supposed to point to wikipedia articles of locations
- // Therefore, following the wikipedia convention, there should be
- // elements of class name "longitude" and "latitude",
- // specifying the geographical coordinates of the location
- //
- // - Read the HTML content of the specified links from the internet
- // - Extract coordinates from selectors ".longitude" and ".latitude"
- // - Output results in csv format:
- // nr,name,url,lon,lat
- // Here, nr is the running index of the array of <a> elements of INPUT_FILE
// Parameters, by default or from the command line.
// The command-line values are spread into getParameters() as positional
// arguments. (Function.prototype.apply takes the `this` value first, so
// getParameters.apply(argv) would call the function with no arguments at all
// and the command line would be silently ignored.)
const [
  INPUT_FILE, // Name of the HTML input file containing links
  rawChunkSize,
  rawDelay,
] = getParameters(...process.argv.slice(2));
// Command-line arguments arrive as strings; chunking and delaying need numbers.
const CHUNK_SIZE = Number(rawChunkSize); // Chunk size for parallel HTTP(S) request execution
const DELAY = Number(rawDelay); // Delay in ms between chunk execution
run();
/**
 * Entry point: reads INPUT_FILE, resolves the coordinates of every linked
 * location and prints the result as CSV.
 *
 * Any error is written to stderr and the process exit code is set to 1, so
 * that shell scripts and pipelines can detect the failure.
 *
 * @returns {Promise<void>} Settles once the run has finished; never rejects.
 */
async function run() {
  try {
    const fileContent = await fs.readFile(INPUT_FILE, 'utf8');
    const allGeoPos = await resolveAllGeoPosFromLinks(fileContent);
    generateCSV(allGeoPos);
  } catch (err) {
    console.error(err);
    // Signal failure without cutting off pending output, as process.exit() would.
    process.exitCode = 1;
  }
}
/**
 * Extract the coordinates of all locations linked in the input file.
 *
 * The links are processed in chunks of CHUNK_SIZE: all requests of a chunk run
 * in parallel, the chunks themselves run one after another, with a pause of
 * DELAY ms between two consecutive chunks (no pause after the last one).
 *
 * @param {string} inputFileHTML - HTML source containing the <a> links.
 * @returns {Promise<Array<object>>} The position records, in the order of the
 *   links in the input. Rejects as soon as one lookup fails.
 */
async function resolveAllGeoPosFromLinks(inputFileHTML) {
  const links = extractLinks(inputFileHTML);
  const chunks = chunk(links, CHUNK_SIZE);
  const allGeoPos = [];
  let nr = 0; // running 1-based index of the link within the input file

  for (const [chunkIndex, linkChunk] of chunks.entries()) {
    if (chunkIndex > 0 && DELAY > 0) {
      await delay(DELAY);
    }
    const pending = linkChunk.map((a) => {
      nr += 1;
      return getGeoPos(a, nr);
    });
    // Promise.all keeps the input order, so no shared array is mutated from
    // concurrently completing requests.
    const positions = await Promise.all(pending);
    allGeoPos.push(...positions);
  }
  return allGeoPos;
}
/**
 * Fetch the page a link points to and extract the coordinates from it.
 *
 * Being an async function, every failure (invalid URL, failed request, error
 * while extracting) is reported as a rejection of the returned promise instead
 * of leaving it pending forever.
 *
 * @param {{name: string, href: string}} a - Link as produced by extractLinks().
 * @param {number} i - Running index of the link, stored as `nr` in the result.
 * @returns {Promise<object>} The record built by extractPos().
 */
async function getGeoPos(a, i) {
  const url = encodeURI(a.href);
  const htmlContent = await fetchURL(url);
  return extractPos(htmlContent, a, i);
}
/**
 * Build the position record of a location from the HTML of its page.
 *
 * The coordinates are taken from the text of the elements selected by
 * ".longitude" and ".latitude"; a missing element or an empty text yields "".
 *
 * @param {string} htmlContent - HTML source of the location's page.
 * @param {{name: string, href: string}} a - Link the page was loaded from.
 * @param {number} i - Running index of the link, stored as `nr`.
 * @returns {{nr: number, name: string, href: string, lon: string, lat: string}}
 */
function extractPos(htmlContent, a, i) {
  const page = html.parse(htmlContent);

  const textOf = (selector) => {
    const node = page.querySelector(selector);
    if (!node) {
      return "";
    }
    return node.text || "";
  };

  const lon = textOf(".longitude");
  const lat = textOf(".latitude");
  return { nr: i, name: a.name, href: a.href, lon, lat };
}
/**
 * Print the position records as CSV (columns: nr,name,url,lon,lat) to
 * standard output, ordered by their index of occurrence in the input file.
 *
 * Fields containing a comma, a double quote or a line break are enclosed in
 * double quotes, with embedded quotes doubled, so that such values cannot
 * shift the columns. The passed array is left untouched.
 *
 * @param {Array<{nr: number, name: string, href: string, lon: string, lat: string}>} results
 * @returns {void}
 */
function generateCSV(results) {
  // Sort a copy by index of occurrence in the input file
  const ordered = [...results].sort((a, b) => Number(a.nr) - Number(b.nr));
  // Prepare data rows in .csv format
  const csv = ordered.map(generateCSVRow).join('\n');
  // Put result to standard output
  console.log(csv);

  function generateCSVRow(v) {
    return [v.nr, v.name, v.href, v.lon, v.lat].map(escapeCSVField).join(',');
  }

  function escapeCSVField(value) {
    const text = String(value);
    if (!/[",\r\n]/.test(text)) {
      return text;
    }
    return `"${text.replace(/"/g, '""')}"`;
  }
}
/**
 * Parameters, by default or read from the command line.
 *
 * Command-line values are strings, so the numeric parameters are converted
 * to numbers and validated here.
 *
 * @param {string} [file_name='orte-w.html'] - HTML input file containing the links.
 * @param {number|string} [chunk_size=5] - Number of requests executed in parallel.
 * @param {number|string} [delay=200] - Pause in ms between two chunks.
 * @returns {[string, number, number]} [file_name, chunk_size, delay]
 * @throws {RangeError} If chunk_size is not a positive integer or delay is
 *   not a non-negative finite number.
 */
function getParameters(
  file_name = 'orte-w.html',
  chunk_size = 5,
  delay = 200
) {
  const chunkSize = Number(chunk_size);
  if (!Number.isInteger(chunkSize) || chunkSize < 1) {
    throw new RangeError(`chunk size must be a positive integer, got "${chunk_size}"`);
  }
  const delayMs = Number(delay);
  if (!Number.isFinite(delayMs) || delayMs < 0) {
    throw new RangeError(`delay must be a non-negative number of ms, got "${delay}"`);
  }
  return [file_name, chunkSize, delayMs];
}
/**
 * General: parse HTML source code and collect all of its <a> hyperlinks.
 *
 * @param {string} htmlSourceCode - HTML source to scan.
 * @returns {Array<{name: string, href: (string|undefined)}>} One entry per
 *   <a> element: its text and the value of its href attribute.
 */
function extractLinks(htmlSourceCode) {
  const anchors = html.parse(htmlSourceCode).querySelectorAll("a");
  const links = [];
  for (const anchor of anchors) {
    links.push({
      name: anchor.text,
      href: anchor.getAttribute("href"),
    });
  }
  return links;
}
/**
 * General: setTimeout(), but as a promise.
 *
 * @param {number} timeToDelay - Time to wait in ms.
 * @returns {Promise<undefined>} Resolves once the timer has fired.
 */
function delay(timeToDelay) {
  return new Promise((done) => {
    setTimeout(done, timeToDelay);
  });
}
/**
 * General: Promise to perform a delayed HTTP(S) request.
 * Delay = 0 just defers the request to a later turn of the event loop.
 *
 * The timer is set up directly with setTimeout() because the `delay`
 * parameter shadows the delay() helper inside this function.
 *
 * @param {string} url - URL to request.
 * @param {number} [delay=0] - Time in ms to wait before the request is started.
 * @returns {Promise<string>} Settles like fetchURL(url): resolves with the
 *   response body, rejects if the request fails.
 */
function fetchURLDelayed(url, delay = 0) {
  return new Promise((resolve) => {
    setTimeout(resolve, delay);
  }).then(() => fetchURL(url));
}
/**
 * General: Promise to perform an HTTP(S) request.
 *
 * @param {string} url - URL to request.
 * @returns {Promise<string>} Resolves with the response body if the status
 *   code is 200. Rejects with the error reported by `request`, or with an
 *   Error naming the status code and the URL for any other status.
 */
function fetchURL(url) {
  return new Promise((resolve, reject) => {
    request(url, (error, response, body) => {
      if (error) {
        reject(error);
        return;
      }
      if (response.statusCode !== 200) {
        // Reject with a real Error (not a string) so the failure carries a stack trace.
        reject(new Error(`Invalid status code <${response.statusCode}> for url ${url}`));
        return;
      }
      resolve(body);
    });
  });
}
/**
 * General: Split an array into chunks of size "chunkSize".
 *
 * @param {Array} a - Array to split; it is not modified.
 * @param {number|string} chunkSize - Maximum length of a chunk. Numeric
 *   strings (as they arrive from the command line) are accepted.
 * @returns {Array<Array>} The consecutive slices of `a`; only the last one
 *   may be shorter than chunkSize. An empty array yields no chunks.
 * @throws {RangeError} If chunkSize is not a positive integer (a size of 0
 *   would never advance through the array).
 */
function chunk(a, chunkSize) {
  const size = Number(chunkSize);
  if (!Number.isInteger(size) || size < 1) {
    throw new RangeError(`chunk size must be a positive integer, got "${String(chunkSize)}"`);
  }
  const chunks = [];
  for (let start = 0; start < a.length; start += size) {
    chunks.push(a.slice(start, start + size));
  }
  return chunks;
}
Add Comment
Please, Sign In to add comment