import requests
import urllib3

# Suppress warnings about unverified HTTPS requests (verify defaults to False).
urllib3.disable_warnings()


class CrawlConfig:
    def __init__(self, proxy=None, headers=None, cookies=None, timeout=30.0, verify=False):
        # None defaults avoid sharing mutable dicts between instances.
        self.proxy = proxy or {}
        self.headers = headers or {}
        self.cookies = cookies or {}
        self.timeout = timeout
        self.verify = verify


class Crawl:
    def __init__(self, config: CrawlConfig):
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

    def crawl(self, url):
        # Fetch a URL and return the decoded body, or "" on any error.
        try:
            r = requests.get(url, headers=self.headers, cookies=self.cookies,
                             timeout=self.timeout, verify=self.verify,
                             proxies=self.proxy)
            return r.content.decode(errors="replace")
        except Exception:
            return ""
import re
from urllib.parse import urlparse


class RE:
    # Regular expressions used to pull tags and resource URLs out of raw HTML.
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"


class ParseHTML:
    def __init__(self, base_url: str, html: str):
        self.html = html
        self.base_url = base_url
        self.protocol = "https" if self.base_url.startswith("https") else "http"

    def __get_link(self, link: str) -> str:
        # Normalize an href: strip fragments, resolve protocol-relative and
        # root-relative links, and map non-HTTP schemes back to the base URL.
        link = link.split("#")[0]
        if link == "" or link.startswith(("javascript", "tel", "mailto")):
            return self.base_url
        if link.startswith("//"):
            return self.protocol + ":" + link
        if link.startswith("/"):
            return self.base_url + link
        return link

    def __get_vector(self):
        # Count how many times each tag name appears in the HTML.
        matches = re.finditer(RE.TAG, self.html, re.MULTILINE)
        vector = {}
        for tag in (e.groups()[0] for e in matches):
            vector[tag] = vector.get(tag, 0) + 1
        return vector

    def __get_resource(self):
        # Distinct hosts of scripts, images and stylesheets, plus normalized anchor links.
        return {
            "js": list(set(urlparse(e.groups()[1]).netloc for e in re.finditer(RE.JS, self.html))),
            "img": list(set(urlparse(e.groups()[1]).netloc for e in re.finditer(RE.IMG, self.html))),
            "css": list(set(urlparse(e.groups()[1]).netloc for e in re.finditer(RE.CSS, self.html))),
            "link": list(set(self.__get_link(e.groups()[1]) for e in re.finditer(RE.LINK, self.html)))
        }

    def parse(self):
        return {
            "resource": self.__get_resource(),
            "vector": self.__get_vector()
        }
if __name__ == "__main__":
    import json
    from datetime import datetime
    from multiprocessing import Pool

    pool = Pool(processes=8)
    c = Crawl(CrawlConfig())
    checked_urls = set()
    current_urls = ["http://www.bbcamerica.com"]

    print(datetime.now())
    # Breadth-first crawl until 10000 URLs have been visited or the frontier empties.
    while len(checked_urls) < 10000 and current_urls:
        # Trim the frontier so no more than 10000 URLs are ever fetched.
        if len(current_urls) + len(checked_urls) > 10000:
            current_urls = current_urls[0: 10000 - len(checked_urls)]
        checked_urls.update(current_urls)

        # Fetch the frontier in parallel, then extract links for the next round.
        contents = pool.map(c.crawl, current_urls)
        current_urls = []
        for html in contents:
            parse = ParseHTML(base_url="http://www.bbcamerica.com", html=html)
            obj = parse.parse()
            current_urls += obj['resource']['link']
        current_urls = [url for url in current_urls if url not in checked_urls]
        print("Current len:", len(checked_urls))

    print("Len:", len(checked_urls))
    with open("urls.json", "w") as f:
        json.dump(sorted(checked_urls), f)
    print(datetime.now())