Advertisement
rplantiko

Extract HTML data from successive HTTP requests

Jan 6th, 2021 (edited)
665
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. "use strict";
  2.  
  3. // Node script to sequentially perform HTTP requests on a site
  4. // - with a varying query parameter,  
  5. // - extracting data from the HTML response
  6. // - collecting results in an array
  7. // - finally writing them to a file
  8. //
  9. // The HTTP requests are performed one by one: each request is awaited before the next one is sent,
  10. // in order to avoid server overload
  11. // Starting the requests asynchronously instead could result in a server side HTTP error 502
  12. //
  13. // In this example, we fetch data from our local version of https://divinumofficium.com/
  14. // incrementing the query parameter 'date' one by one
  15. //
  16. // For very large result sets, one might use a buffered write stream instead
  17. // In this case, in Node, one may use a stream with fs.createWriteStream(),
  18. // or the low level API fs.openSync(), generating a file descriptor like in C
  19. //
  20. // We are using the simple library node-html-parser for extracting the data from the HTML response.
  21. // jsdom would be an alternative, but would be too heavyweight for a simple selector extraction,
  22. // the purpose of jsdom seems to be more to provide a kind of browser emulation
  23.  
  24. const fs      = require('fs');
  25. const html    = require('node-html-parser');
  26. const request = require('request');
  27.  
  28. const RESULT_FILE = 'missa.json';
  29. const URL         = 'http://localhost/missa/cgi-bin/missa/missa.pl';
  30.  
  31. // startDate (yyyy-mm-dd) and number of iterations can be passed as command line parameters
  32. const startDate = process.argv.length > 2 ? process.argv[2] : '2021-01-01';
  33. const numDays   = process.argv.length > 3 ? process.argv[3] : 10;
  34.  
  35. run(startDate,numDays);
  36.  
  37. // Main loop: call getMissa() for successive dates, collecting results
  38. async function run(startDate,numDays) {
  39.  
  40.   let date = new Date(startDate),
  41.       results = [];
  42.  
  43.   for (let i=0;i<numDays;i++) {  
  44.     try {
  45.       results.push( await getMissa(date) );
  46.     } catch(e) {
  47.       console.error( e );
  48.     }
  49.     date.setDate(date.getDate()+1)
  50.   }
  51.  
  52.   writeResults(results);
  53.  
  54. }
  55.  
  56. // Perform the HTTP request and return extracted data as a promise
  57. function getMissa(date) {
  58.  
  59.   const dateFormatted = format(date);
  60.  
  61.   return fetchURL(`${URL}?date=${dateFormatted}`,extractResult);
  62.  
  63.   function extractResult(response,body) {
  64.     let title = extract(body);
  65.     return {date:dateFormatted, title:title};
  66.   }
  67.  
  68. }
  69.  
  70. // Parse HTML document and extract some data
  71. function extract( htmlSourceCode ) {
  72.   const doc = html.parse( htmlSourceCode );
  73.   return( doc.querySelector("font").text); // Extract content of first <font>-Tag in HTML doc.
  74. }
  75.  
  76. // Performing an HTTP request as promise
  77. function fetchURL(url,onResponse) {
  78.   return new Promise((resolve,reject)=>{
  79.     request(url, (error, response, body) => {
  80.         if (error) return reject(error);
  81.         if (response.statusCode != 200) {
  82.           return reject('Invalid status code <' + response.statusCode + '> for url '+url);
  83.         }
  84.         let result = onResponse(response,body);
  85.         resolve(result);
  86.       });
  87.   })
  88. }
  89.  
  90. // Date format mm-dd-yyyy
  91. function format(date) {
  92.   const day = date.getUTCDate();
  93.   const month = date.getUTCMonth()+1;
  94.   const year = date.getUTCFullYear();
  95.   return `${month}-${day}-${year}`;
  96. }
  97.  
  98. // Write results as JSON array to file
  99. function writeResults(results) {
  100.   console.log(`${results.length} entries generated`);
  101.   fs.writeFileSync(
  102.     RESULT_FILE,
  103.     JSON.stringify(results,null,2)
  104.   );
  105. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement