Advertisement
huutho_96

crawl.py

Nov 6th, 2018
228
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.49 KB | None | 0 0
  1. import requests
  2. import urllib3
  3. urllib3.disable_warnings()
  4.  
  5. class CrawlConfig:
  6.     def __init__(self, proxy={}, headers={}, cookies={}, timeout=30.0, verify=False):
  7.         self.proxy = proxy
  8.         self.headers = headers
  9.         self.timeout = timeout
  10.         self.verify = verify
  11.         self.cookies = cookies
  12.        
  13.  
  14. class Crawl:
  15.     def __init__(self, config: CrawlConfig):
  16.         self.set_config(config)
  17.        
  18.     def set_config(self, config: CrawlConfig):
  19.         self.proxy = config.proxy
  20.         self.headers = config.headers
  21.         self.timeout = config.timeout
  22.         self.verify = config.verify
  23.         self.cookies = config.cookies
  24.    
  25.     def crawl(self, url):
  26.         try:
  27.             r = requests.get(url, headers=self.headers, cookies=self.cookies, timeout=self.timeout, verify=self.verify,
  28.                                     proxies=self.proxy)
  29.             return r.content.decode()
  30.         except:
  31.             return ""
  32.  
  33.  
  34. from enum import Enum
  35. from urllib.parse import urlparse
  36. import re
  37.  
class RE:
    """Regex patterns used by ParseHTML to scrape tags and resources.

    NOTE(review): these hand-rolled patterns assume double-quoted attribute
    values and will miss single-quoted or unquoted ones — confirm that is
    acceptable for the target pages.
    """
    # Any opening tag; group 1 captures the (possibly empty) tag name.
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    # <img ... src="...">; group 2 captures the src URL.
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    # <script ... src="...">; group 2 captures the src URL.
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    # <link ... href="...">; group 2 captures the href URL.
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    # <a ... href="...">; group 2 captures the href URL.
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"

    # Loose email matcher (unused in the visible code).
    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"
  46.  
  47. class ParseHTML:
  48.     def __init__(self, base_url: str, html: str):
  49.         self.html = html
  50.         self.base_url = base_url
  51.         if "https" in self.base_url:
  52.             self.protocol = "https"
  53.         else:
  54.             self.protocol = "http"
  55.  
  56.     def __get_link(self, link: str) -> str:
  57.         link = link.split("#")[0]
  58.         if link == "" or link[0: 10] == "javascript" or link[0:3]=="tel" or link[0:6] == "mailto":
  59.             return self.base_url
  60.            
  61.         if link[0:2] == "//":
  62.             return self.protocol + ":" + link
  63.         else:
  64.             if link[0:1] == "/":
  65.                 return self.base_url + link
  66.         return link
  67.  
  68.  
  69.     def __get_vector(self):
  70.         matches = re.finditer(RE.TAG, self.html, re.MULTILINE)
  71.         vector = {}
  72.         for tag in [e.groups()[0] for e in matches]:
  73.             if tag not in vector:
  74.                 vector[tag] = 1
  75.                 continue
  76.             vector[tag] += 1
  77.        
  78.         return vector
  79.    
  80.     def __get_resource(self):
  81.         return {
  82.             "js": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.JS, self.html)])),
  83.             "img": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.IMG, self.html)])),
  84.             "css": list(set([urlparse(e.groups()[1]).netloc for e in re.finditer(RE.CSS, self.html)])),
  85.             "link": list(set([self.__get_link(e.groups()[1]) for e in re.finditer(RE.LINK, self.html)]))
  86.         }
  87.    
  88.     def parse(self):
  89.         return {
  90.             "resource": self.__get_resource(),
  91.             "vector": self.__get_vector()
  92.         }
  93.  
  94.  
  95. def run(urls):
  96.     from multiprocessing import Pool
  97.     from datetime import datetime
  98.     pool = Pool(processes=8)
  99.     c = Crawl(CrawlConfig())
  100.    
  101.     print (len(urls), "From", datetime.now())
  102.     contents = pool.map(c.crawl, urls)
  103.     print ("To", datetime.now())
  104.  
  105.     return True
  106.  
  107. if __name__ == "__main__":
  108.     from datetime import datetime
  109.     print (datetime.now())
  110.     run([
  111.         "http://diemthi.vnexpress.net",
  112.         "http://e.vnexpress.net/privacy_policy.html",
  113.         "http://member.vnexpress.net/lang-que-viet-nam/",
  114.         "http://raovat.vnexpress.net",
  115.         "http://raovat.vnexpress.net/",
  116.         "http://raovat.vnexpress.net/?utm_campaign=VNEXPRESS&utm_source=footer&utm_medium=menu",
  117.         "http://raovat.vnexpress.net/am-thuc-du-lich/am-thuc",
  118.         "http://raovat.vnexpress.net/am-thuc-du-lich/du-lich",
  119.         "http://raovat.vnexpress.net/dich-vu/dau-thau",
  120.         "http://raovat.vnexpress.net/dich-vu/dich-vu-gia-dinh",
  121.         "http://raovat.vnexpress.net/dich-vu/thong-bao-thanh-l-p-cong-ty",
  122.         "http://raovat.vnexpress.net/dich-vu/tim-doi-ta-c",
  123.         "http://raovat.vnexpress.net/dich-vu/tim-nha-phan-phoi-dai-ly",
  124.         "http://raovat.vnexpress.net/dien-thoai-sim/dich-vu",
  125.         "http://raovat.vnexpress.net/dien-thoai-sim/dien-thoai",
  126.         "http://raovat.vnexpress.net/dien-thoai-sim/sim",
  127.         "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/am-thanh-audio",
  128.         "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/dien-lanh-gia-dung"
  129.     ])
  130.     print (datetime.now())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement