import os
import ssl
import urllib.request

import requests
from bs4 import BeautifulSoup as bs

# Disable certificate verification for urllib (the target site has SSL issues).
ssl._create_default_https_context = ssl._create_unverified_context

count = 0

with open("list_of_lict", "r") as names_file:
    for line in names_file:
        # Each entry may be quoted and ends with a newline; strip both.
        line = line.replace('"', "").strip()

        for year in range(2007, 2022 + 1):
            # url: site-specific base URL built from the entry name and the year
            # (defined elsewhere; two variants exist depending on the year).
            # URL_TEMPLATE = f"{url}"  # URL variant used from 2018 onwards
            URL_TEMPLATE = f"{url}"  # URL variant used up to and including 2017

            page = requests.get(URL_TEMPLATE)
            if page.status_code == 200:
                subfolder_name = f"{year}"
                print(page.status_code, year, line)

                # Parse the listing page, letting BeautifulSoup pick up the
                # charset declared in the HTTP response.
                resp = urllib.request.urlopen(URL_TEMPLATE)
                soup = bs(resp, "html.parser",
                          from_encoding=resp.info().get_param("charset"))

                for item in soup.find_all("div", class_="document-regular_name"):
                    count += 1
                    href = item.a["href"]
                    if href.endswith(("rar", "zip", "pdf")):
                        # Create <entry>/<year>/ if it does not exist yet.
                        target_dir = os.path.join(line, subfolder_name)
                        if not os.path.isdir(target_dir):
                            os.makedirs(target_dir)

                        # Build the absolute file URL from the listing URL and
                        # the relative href, then download it with a
                        # filesystem-safe name.
                        file_url = f"{URL_TEMPLATE}{href}"
                        safe_name = href.replace("/", "_")
                        # Alternative download path using requests:
                        # r_ = requests.get(file_url, allow_redirects=True)
                        # open(os.path.join(target_dir, f"{line}{safe_name}"), "wb").write(r_.content)
                        urllib.request.urlretrieve(
                            file_url,
                            os.path.join(target_dir, f"{line}{safe_name}")
                        )
            else:
                print(page.status_code, year)
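For reference, each line of list_of_lict is read as one entry name (optionally wrapped in double quotes) that becomes the top-level download folder. A minimal sketch of what the file might contain, with purely hypothetical names:

"Example Organisation A"
"Example Organisation B"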