import re
import requests
import urllib3
from urllib.parse import urlparse

# TLS verification is disabled by default below (verify=False), so silence
# the InsecureRequestWarning that requests would otherwise print per request.
urllib3.disable_warnings()


class CrawlConfig:
    def __init__(self, proxy=None, headers=None, cookies=None, timeout=30.0, verify=False):
        # Default to None and create fresh dicts here: mutable default
        # arguments ({}) would be shared across every call in Python.
        self.proxy = proxy or {}
        self.headers = headers or {}
        self.timeout = timeout
        self.verify = verify
        self.cookies = cookies or {}
class Crawl:
    def __init__(self, config: CrawlConfig):
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

    def crawl(self, url):
        # Fetch one URL and return the decoded body; any request failure
        # yields "" so a batch run is never aborted by one bad host.
        try:
            r = requests.get(url, headers=self.headers, cookies=self.cookies,
                             timeout=self.timeout, verify=self.verify,
                             proxies=self.proxy)
            return r.content.decode(errors="replace")
        except requests.RequestException:
            return ""
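
# A minimal usage sketch for Crawl on its own; the URL and header value are
# illustrative, not part of the original script:
#
#   config = CrawlConfig(headers={"User-Agent": "Mozilla/5.0"}, timeout=10.0)
#   html = Crawl(config).crawl("http://example.com")
#   print(len(html))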
class RE:
    # Deliberately simple regexes for tag/attribute extraction. Group index 1
    # of IMG/JS/CSS/LINK is always the URL attribute, which the parser below
    # relies on; a real HTML parser would be more robust than this.
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"  # defined but never used below
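
# A quick sketch of what these patterns capture, against a made-up snippet:
#
#   m = re.search(RE.LINK, '<a class="nav" href="/about">About</a>')
#   m.groups()[1]  # -> "/about"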
class ParseHTML:
    def __init__(self, base_url: str, html: str):
        self.html = html
        self.base_url = base_url
        # Read the scheme off the URL itself; a plain substring test for
        # "https" would also match it appearing anywhere later in the URL.
        self.protocol = urlparse(base_url).scheme or "http"

    def __get_link(self, link: str) -> str:
        # Normalise an href: drop the fragment, map non-navigable schemes
        # (javascript:, tel:, mailto:) back to the page itself, and expand
        # protocol-relative ("//...") and root-relative ("/...") links.
        link = link.split("#")[0]
        if link == "" or link.startswith(("javascript", "tel", "mailto")):
            return self.base_url
        if link.startswith("//"):
            return self.protocol + ":" + link
        if link.startswith("/"):
            return self.base_url + link
        return link
    def __get_vector(self):
        # Histogram of tag names: how often each tag occurs on the page.
        matches = re.finditer(RE.TAG, self.html, re.MULTILINE)
        vector = {}
        for tag in (e.groups()[0] for e in matches):
            vector[tag] = vector.get(tag, 0) + 1
        return vector

    def __get_resource(self):
        # Deduplicated hosts that scripts, images and stylesheets load from,
        # plus the normalised outgoing links.
        return {
            "js": list({urlparse(e.groups()[1]).netloc for e in re.finditer(RE.JS, self.html)}),
            "img": list({urlparse(e.groups()[1]).netloc for e in re.finditer(RE.IMG, self.html)}),
            "css": list({urlparse(e.groups()[1]).netloc for e in re.finditer(RE.CSS, self.html)}),
            "link": list({self.__get_link(e.groups()[1]) for e in re.finditer(RE.LINK, self.html)}),
        }

    def parse(self):
        return {
            "resource": self.__get_resource(),
            "vector": self.__get_vector(),
        }
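
# The two classes compose into an end-to-end sketch (the URL and the variable
# names are illustrative):
#
#   crawler = Crawl(CrawlConfig())
#   url = "http://example.com"
#   report = ParseHTML(url, crawler.crawl(url)).parse()
#   report["vector"]              # e.g. {"a": 12, "div": 40, ...}
#   report["resource"]["link"]    # normalised outgoing links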
def run(urls):
    from multiprocessing import Pool
    from datetime import datetime

    # Fan the URL list out over 8 worker processes; crawl() never raises, so
    # one dead host cannot abort the batch. Only timings are printed and the
    # fetched bodies are discarded, i.e. this runs as a throughput benchmark.
    with Pool(processes=8) as pool:
        c = Crawl(CrawlConfig())
        print(len(urls), "From", datetime.now())
        contents = pool.map(c.crawl, urls)
        print("To", datetime.now())
    return True
if __name__ == "__main__":
    from datetime import datetime

    print(datetime.now())
    run([
        "http://diemthi.vnexpress.net",
        "http://e.vnexpress.net/privacy_policy.html",
        "http://member.vnexpress.net/lang-que-viet-nam/",
        "http://raovat.vnexpress.net",
        "http://raovat.vnexpress.net/",
        "http://raovat.vnexpress.net/?utm_campaign=VNEXPRESS&utm_source=footer&utm_medium=menu",
        "http://raovat.vnexpress.net/am-thuc-du-lich/am-thuc",
        "http://raovat.vnexpress.net/am-thuc-du-lich/du-lich",
        "http://raovat.vnexpress.net/dich-vu/dau-thau",
        "http://raovat.vnexpress.net/dich-vu/dich-vu-gia-dinh",
        "http://raovat.vnexpress.net/dich-vu/thong-bao-thanh-l-p-cong-ty",
        "http://raovat.vnexpress.net/dich-vu/tim-doi-ta-c",
        "http://raovat.vnexpress.net/dich-vu/tim-nha-phan-phoi-dai-ly",
        "http://raovat.vnexpress.net/dien-thoai-sim/dich-vu",
        "http://raovat.vnexpress.net/dien-thoai-sim/dien-thoai",
        "http://raovat.vnexpress.net/dien-thoai-sim/sim",
        "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/am-thanh-audio",
        "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/dien-lanh-gia-dung",
    ])
    print(datetime.now())