Advertisement
Wyvern67

cambridge-scrapper.js

Mar 18th, 2020
1,243
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  /**
   * cambridge-scrapper.js - browser-console snippet.
   *
   * Pairs every `.title > h5 > .part-link` anchor on the current page with the
   * `.details > .author` element at the same index and prints one `curl`
   * command per pair. Each command requests
   * `https://www.cambridge.org/core/services/online-view/get/<id>` and
   * redirects the response into `<filename>.html`.
   *
   * Everything is declared with function declarations (no top-level
   * `let`/`const`) so the snippet can be pasted into a console repeatedly
   * without redeclaration errors and without leaking implicit globals.
   */

  /**
   * Extracts the first 32-character uppercase-hex identifier that directly
   * follows a "/" in a URL.
   *
   * @param {string} href - URL to inspect.
   * @returns {string} The 32-character identifier.
   * @throws {Error} If the URL contains no such identifier.
   */
  function extractContentId(href) {
    const match = href.match(/\/([0-9A-F]{32})/);
    if (match === null) {
      throw new Error(`no 32-character content id found in "${href}"`);
    }
    return match[1];
  }

  /**
   * Extracts the path segment that immediately precedes the 32-character
   * identifier in a URL.
   *
   * @param {string} href - URL to inspect.
   * @returns {string} The path segment before the identifier.
   * @throws {Error} If the URL does not have the `/<segment>/<id>` shape.
   */
  function extractSlug(href) {
    const match = href.match(/\/([^/]*)\/[0-9A-F]{32}/);
    if (match === null) {
      throw new Error(`no path segment before a content id found in "${href}"`);
    }
    return match[1];
  }

  /**
   * Builds the output file name (without extension) for one entry.
   *
   * The slug ends up inside a double-quoted shell argument, so every character
   * outside `[A-Za-z0-9._-]` is replaced with "-" to keep the generated command
   * inert (e.g. no `$(...)` expansion). Ordinary hyphenated slugs are unchanged.
   *
   * @param {string} slug - Path segment taken from the entry link.
   * @param {string} authorText - Visible author text of the entry.
   * @param {?string} bookId - Identifier of the linked source, or null/undefined
   *   when the entry has no source link.
   * @returns {string} `<slug>_<author>` or `<slug>_<author>_(book-<bookId>)`.
   */
  function buildFilename(slug, authorText, bookId) {
    const safeSlug = slug.replace(/[^A-Za-z0-9._-]/g, "-");
    const author = authorText.replace(/[^a-z0-9]/gi, "-").toLowerCase();
    const base = `${safeSlug}_${author}`;
    return bookId == null ? base : `${base}_(book-${bookId})`;
  }

  /**
   * Builds the `curl` command line for one entry.
   *
   * @param {string} id - 32-character content identifier.
   * @param {string} filename - Output file name without the `.html` extension.
   * @param {string} [refererBase] - URL the identifier is appended to for the
   *   Referer header. Defaults to the path the original script hard-coded.
   * @returns {string} The complete command, without a trailing newline.
   */
  function buildCurlCommand(
    id,
    filename,
    refererBase = "https://www.cambridge.org/core/books/mathematics-for-machine-learning/when-models-meet-data"
  ) {
    return `curl "https://www.cambridge.org/core/services/online-view/get/${id}"  -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0" -H "Accept: */*" -H "Accept-Language: fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3" --compressed -H "X-Requested-With: XMLHttpRequest" -H "Connection: keep-alive" -H "Referer: ${refererBase}/${id}" -H "DNT: 1" -H "Pragma: no-cache" -H "Cache-Control: no-cache" -H "TE: Trailers" > "${filename}.html"`;
  }

  /**
   * Reads the entry links and author elements from `root` and returns the
   * newline-terminated `curl` commands for all of them.
   *
   * @param {{querySelectorAll: function(string): ArrayLike<object>}} root -
   *   Document (or any object offering `querySelectorAll`) to read from.
   * @param {string} [refererBase] - Forwarded to `buildCurlCommand`.
   * @returns {string} One command per entry, each followed by "\n"; the empty
   *   string when there are no entries.
   * @throws {Error} If the number of links and authors differ, or if a link
   *   does not contain a content identifier.
   */
  function buildCurlDump(root, refererBase) {
    const links = root.querySelectorAll(".title > h5 > .part-link");
    const authors = root.querySelectorAll(".details > .author");

    // Entries are paired purely by index, so differing counts mean the
    // pairing cannot be trusted.
    if (links.length !== authors.length) {
      throw new Error("list author mismatch");
    }

    const commands = Array.from(links, (link, index) => {
      const authorElement = authors[index];
      const sourceLink = authorElement.parentElement.querySelector(".source > a.url");
      const bookId = sourceLink ? extractContentId(sourceLink.href) : null;
      const filename = buildFilename(extractSlug(link.href), authorElement.innerText, bookId);
      return buildCurlCommand(extractContentId(link.href), filename, refererBase);
    });

    return commands.map((command) => `${command}\n`).join("");
  }

  // Only run against the live page when a DOM is available; this keeps the
  // helpers loadable outside a browser.
  if (typeof document !== "undefined") {
    console.log(buildCurlDump(document));
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement