# crawl.py

import requests
import urllib3
# Silence urllib3 warnings for the whole process at import time.
# NOTE(review): presumably this is here to mute the InsecureRequestWarning
# emitted when requests are sent with verify=False (the CrawlConfig default)
# -- confirm before relying on it.
urllib3.disable_warnings()

class CrawlConfig:
    """Bundle of HTTP request settings consumed by ``Crawl``.

    Args:
        proxy: Proxy mapping stored as ``self.proxy``. ``None`` (the default)
            means a new empty dict.
        headers: Request headers mapping. ``None`` means a new empty dict.
        cookies: Cookies mapping. ``None`` means a new empty dict.
        timeout: Request timeout value; defaults to ``30.0``.
        verify: TLS verification flag. NOTE: defaults to ``False``, i.e.
            certificate verification is off unless the caller enables it.
    """

    def __init__(self, proxy=None, headers=None, cookies=None, timeout=30.0, verify=False):
        # Build a fresh dict per instance. A literal ``{}`` default would be
        # created once and shared, so mutating one config's headers/cookies
        # would silently leak into every other default-constructed config.
        self.proxy = {} if proxy is None else proxy
        self.headers = {} if headers is None else headers
        self.timeout = timeout
        self.verify = verify
        self.cookies = {} if cookies is None else cookies


class Crawl:
    """Fetch pages with HTTP GET using settings copied from a ``CrawlConfig``."""

    def __init__(self, config: CrawlConfig):
        """Create a crawler configured from *config*."""
        self.set_config(config)

    def set_config(self, config: CrawlConfig):
        """Copy the request settings from *config* onto this crawler.

        The values are referenced, not deep-copied, so dicts held by *config*
        are shared with this instance.
        """
        self.proxy = config.proxy
        self.headers = config.headers
        self.timeout = config.timeout
        self.verify = config.verify
        self.cookies = config.cookies

    def crawl(self, url):
        """Download *url* and return the response body as text.

        The body bytes are decoded with ``bytes.decode()`` defaults (strict
        UTF-8). The HTTP status code is not inspected.

        Returns:
            The decoded body, or ``""`` if the request or the decoding fails.
        """
        try:
            r = requests.get(
                url,
                headers=self.headers,
                cookies=self.cookies,
                timeout=self.timeout,
                verify=self.verify,
                proxies=self.proxy,
            )
            return r.content.decode()
        except Exception:
            # Deliberate best-effort: any request/decoding failure yields "".
            # ``Exception`` (rather than a bare ``except``) lets
            # KeyboardInterrupt and SystemExit propagate so the process --
            # or a pool worker running this method -- can still be stopped.
            return ""


from enum import Enum
from urllib.parse import urlparse
import re

class RE:
    """Regular-expression source strings used to scrape HTML.

    The patterns are stored as plain (uncompiled) strings. They are written
    for lowercase tag/attribute names, and the attribute patterns only
    recognise double-quoted values.
    """
    # Opening tag. Group 1 is the lowercase tag name (it may be empty, so
    # "<>" also matches); group 3 is the text after the first space, if any.
    TAG = r"<(|[a-z]+)(| ([^<]+))>"
    # <img ... src="...">: group 2 is the src value.
    IMG = r"<img (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    # <script ... src="...">: group 2 is the src value.
    JS = r"<script (|[^<]+)src=\"([^\"]+)\"(|[^>]+)"
    # <link ... href="...">: group 2 is the href value. The pattern matches
    # any <link> element with an href, not only stylesheets.
    CSS = r"<link (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"
    # <a ... href="...">: group 2 is the href value.
    LINK = r"<a (|[^<]+)href=\"([^\"]+)\"(|[^>]+)"

    # Email-like token: word characters, dots or commas on both sides of "@",
    # ending in ".<word characters>". Not referenced by the code visible here.
    EMAIL = r"([\w\.,]+@[\w\.,]+\.\w+)"

class ParseHTML:
    """Regex-based extractor of resource hosts, links and tag counts from HTML.

    Uses the patterns defined on ``RE``; no real HTML parsing is performed.

    Args:
        base_url: URL the document was fetched from. It is prepended to
            root-relative links and returned for links that do not point to
            a page (empty, ``javascript:``, ``tel:``, ``mailto:``).
        html: The document markup to scan.
    """

    # URI schemes whose links are replaced by ``base_url``.
    _SKIPPED_SCHEMES = ("javascript:", "tel:", "mailto:")

    def __init__(self, base_url: str, html: str):
        self.html = html
        self.base_url = base_url
        # Inspect the scheme only: a plain substring test would report
        # "https" for e.g. "http://host/https-guide".
        if base_url.lower().startswith("https:"):
            self.protocol = "https"
        else:
            self.protocol = "http"

    def __get_link(self, link: str) -> str:
        """Normalise one ``href`` value.

        The fragment (everything from ``#``) is dropped. Empty links and
        ``javascript:``/``tel:``/``mailto:`` links map to ``base_url``;
        protocol-relative links (``//host/...``) get the base protocol;
        root-relative links (``/path``) are appended to ``base_url``;
        anything else is returned unchanged.
        """
        link = link.split("#")[0]
        # Match the scheme including its colon so that relative paths such as
        # "television.html" or "mailto-help.html" are not mistaken for
        # tel:/mailto: links.
        if not link or link.lower().startswith(self._SKIPPED_SCHEMES):
            return self.base_url

        if link.startswith("//"):
            return self.protocol + ":" + link
        if link.startswith("/"):
            # Strip a trailing slash from the base to avoid "host//path".
            return self.base_url.rstrip("/") + link
        return link

    def __get_vector(self):
        """Count how many times each tag name matched by ``RE.TAG`` occurs.

        Returns:
            Dict mapping tag name (group 1 of ``RE.TAG``) to its count.
        """
        vector = {}
        for match in re.finditer(RE.TAG, self.html, re.MULTILINE):
            tag = match.group(1)
            vector[tag] = vector.get(tag, 0) + 1
        return vector

    def __get_resource(self):
        """Collect the distinct resource hosts and links found in the HTML.

        Returns:
            Dict with keys ``"js"``, ``"img"``, ``"css"`` (network locations
            of the matched URLs; ``""`` for URLs without a host) and
            ``"link"`` (normalised ``<a href>`` targets). Each value is a
            de-duplicated list, sorted so the output is deterministic.
        """
        def hosts(pattern):
            # Group 2 of the resource patterns is the quoted URL value.
            return sorted({urlparse(m.group(2)).netloc
                           for m in re.finditer(pattern, self.html)})

        return {
            "js": hosts(RE.JS),
            "img": hosts(RE.IMG),
            "css": hosts(RE.CSS),
            "link": sorted({self.__get_link(m.group(2))
                            for m in re.finditer(RE.LINK, self.html)}),
        }

    def parse(self):
        """Return ``{"resource": ..., "vector": ...}`` for the document."""
        return {
            "resource": self.__get_resource(),
            "vector": self.__get_vector()
        }


def run(urls):
    """Fetch every URL in *urls* using 8 worker processes and print timings.

    Prints the number of URLs with a start timestamp, then an end timestamp
    once all fetches have finished. The downloaded page bodies are discarded.

    Args:
        urls: Sequence of URL strings passed one by one to ``Crawl.crawl``.

    Returns:
        Always ``True``.
    """
    from multiprocessing import Pool
    from datetime import datetime

    # The context manager terminates the worker processes on exit; without
    # it the pool's processes were never shut down.
    with Pool(processes=8) as pool:
        c = Crawl(CrawlConfig())

        print(len(urls), "From", datetime.now())
        pool.map(c.crawl, urls)
        print("To", datetime.now())

    return True

if __name__ == "__main__":
    from datetime import datetime

    # Fixed list of pages fetched when the file is executed as a script.
    sample_urls = [
        "http://diemthi.vnexpress.net",
        "http://e.vnexpress.net/privacy_policy.html",
        "http://member.vnexpress.net/lang-que-viet-nam/",
        "http://raovat.vnexpress.net",
        "http://raovat.vnexpress.net/",
        "http://raovat.vnexpress.net/?utm_campaign=VNEXPRESS&utm_source=footer&utm_medium=menu",
        "http://raovat.vnexpress.net/am-thuc-du-lich/am-thuc",
        "http://raovat.vnexpress.net/am-thuc-du-lich/du-lich",
        "http://raovat.vnexpress.net/dich-vu/dau-thau",
        "http://raovat.vnexpress.net/dich-vu/dich-vu-gia-dinh",
        "http://raovat.vnexpress.net/dich-vu/thong-bao-thanh-l-p-cong-ty",
        "http://raovat.vnexpress.net/dich-vu/tim-doi-ta-c",
        "http://raovat.vnexpress.net/dich-vu/tim-nha-phan-phoi-dai-ly",
        "http://raovat.vnexpress.net/dien-thoai-sim/dich-vu",
        "http://raovat.vnexpress.net/dien-thoai-sim/dien-thoai",
        "http://raovat.vnexpress.net/dien-thoai-sim/sim",
        "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/am-thanh-audio",
        "http://raovat.vnexpress.net/dien-tu-ky-thuat-so/dien-lanh-gia-dung",
    ]

    # Timestamps bracket the whole run, including worker start-up.
    print(datetime.now())
    run(sample_urls)
    print(datetime.now())