Crawl
huutho_96
Nov 6th, 2018
Python
import re
import requests
import urllib3
from urllib.parse import urlparse

urllib3.disable_warnings()

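# Container for HTTP client settings: proxies, headers, cookies, timeout and TLS verification.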
class CrawlConfig:
    def __init__(self, proxy=None, headers=None, cookies=None, timeout=30.0, verify=False):
        # None defaults avoid sharing one mutable dict between instances.
        self.proxy = proxy or {}
        self.headers = headers or {}
        self.cookies = cookies or {}
        self.timeout = timeout
        self.verify = verify


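# Thin wrapper around requests.get that applies a CrawlConfig to every request.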
class Crawl:
    def __init__(self, config: CrawlConfig):
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

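    # Fetch a URL and return the decoded body; any request or decode error yields "".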
    def crawl(self, url):
        try:
            r = requests.get(url, headers=self.headers, cookies=self.cookies,
                             timeout=self.timeout, verify=self.verify, proxies=self.proxy)
            return r.content.decode()
        except Exception:
            # Swallow network and decode errors so one bad URL does not stop the crawl.
            return ""


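# Naive regular expressions for pulling tags, resource URLs and links out of raw HTML.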
class RE:
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"

    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"

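# Extracts outgoing links, external resource hosts and a tag-frequency vector from one HTML page.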
class ParseHTML:
    def __init__(self, base_url: str, html: str):
        self.html = html
        self.base_url = base_url
        if "https" in self.base_url:
            self.protocol = "https"
        else:
            self.protocol = "http"

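    # Normalise an href: drop fragments, skip javascript:/tel:/mailto: links,
    # and resolve protocol-relative and root-relative URLs against the base URL.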
    def __get_link(self, link: str) -> str:
        link = link.split("#")[0]
        if link == "" or link.startswith(("javascript", "tel", "mailto")):
            return self.base_url

        if link.startswith("//"):
            return self.protocol + ":" + link
        if link.startswith("/"):
            return self.base_url + link
        return link

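    # Count how often each tag name appears in the page (a simple tag-frequency vector).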
    def __get_vector(self):
        matches = re.finditer(RE.TAG, self.html, re.MULTILINE)
        vector = {}
        for tag in (m.groups()[0] for m in matches):
            vector[tag] = vector.get(tag, 0) + 1
        return vector

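    # Collect the hosts serving scripts, images and stylesheets, plus all normalised outgoing links.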
    def __get_resource(self):
        return {
            "js": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.JS, self.html)])),
            "img": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.IMG, self.html)])),
            "css": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.CSS, self.html)])),
            "link": list(set([self.__get_link(e.groups()[1]) for e in re.finditer(RE.LINK, self.html)]))
        }

    def parse(self):
        return {
            "resource": self.__get_resource(),
            "vector": self.__get_vector()
        }


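# Breadth-first crawl: fetch pages in parallel starting from a single seed URL,
# stop after 10,000 URLs have been visited, then dump the sorted URL list to urls.json.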
if __name__ == "__main__":
    import json
    from datetime import datetime
    from multiprocessing import Pool

    pool = Pool(processes=8)
    c = Crawl(CrawlConfig())

    checked_urls = set()
    current_urls = ["http://www.bbcamerica.com"]
    print(datetime.now())
    # Stop once 10,000 URLs have been visited or the frontier runs dry.
    while current_urls and len(checked_urls) < 10000:
        if len(current_urls) + len(checked_urls) > 10000:
            current_urls = current_urls[0: 10000 - len(checked_urls)]
        checked_urls.update(current_urls)
        contents = pool.map(c.crawl, current_urls)
        current_urls = []
        for html in contents:
            parse = ParseHTML(base_url="http://www.bbcamerica.com", html=html)
            obj = parse.parse()
            current_urls += obj['resource']['link']

        current_urls = [url for url in current_urls if url not in checked_urls]
        print("Current len: ", len(checked_urls))

    print("Len: ", len(checked_urls))

    arr = sorted(checked_urls)

    with open("urls.json", "w") as f:
        json.dump(arr, f)
    print(datetime.now())