Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- from bs4 import BeautifulSoup as bs
- import urllib.request
- import requests
- from selenium.webdriver.common.by import By
- import xml
- import ssl
- ssl._create_default_https_context = ssl._create_unverified_context
def downloadfile(soup, year, line):
    """Download every archive/PDF linked from a parsed report page.

    Scans *soup* for ``<div class="document-regular_name">`` elements and, for
    each one whose first ``<a>`` has an ``href`` ending in ``rar``, ``zip`` or
    ``pdf``, saves ``https://cbr.ru<href>`` to
    ``<line>/<year>/<line><href with "/" replaced by "_">``.

    Args:
        soup: Parsed page; only ``soup.find_all(name, class_=...)`` is used.
        year: Report year; used as the sub-folder name.
        line: Licence number as read from the licence list; used as the
            top-level folder name and as the file-name prefix.

    Returns:
        None. Files are written to disk as a side effect.
    """
    wanted_suffixes = ("rar", "zip", "pdf")
    target_dir = os.path.join(f"{line}", f"{year}")

    for entry in soup.find_all('div', class_="document-regular_name"):
        anchor = entry.a
        # Skip entries without a link instead of crashing on them.
        href = anchor.get('href') if anchor is not None else None
        if not href or not href.endswith(wanted_suffixes):
            continue

        # Create the year folder lazily, only once something is downloadable.
        os.makedirs(target_dir, exist_ok=True)

        url = f"https://cbr.ru{href}"
        # Flatten the URL path into a single file name; the parsed document
        # itself is left untouched.
        flat_name = href.replace("/", "_")
        # One download per file (the earlier extra GET discarded its result).
        urllib.request.urlretrieve(url, os.path.join(target_dir, f"{line}{flat_name}"))
# Driver: for every licence number listed in "list_of_lict", create a folder
# named after it and, for each year 2007..2022, fetch that licence's report
# page on cbr.ru and download the documents it links to.
with open("list_of_lict", "r") as licence_file:
    for raw_line in licence_file:
        # Drop the surrounding quotes and the trailing newline/whitespace.
        regnum = raw_line.replace('"', "").strip()
        if not regnum:
            # A blank line would otherwise become os.mkdir(""), which raises.
            continue

        # exist_ok lets the script be re-run without failing on old folders.
        os.makedirs(regnum, exist_ok=True)

        for year in range(2007, 2022 + 1):
            # NOTE(review): the original author noted that reports up to and
            # including 2017 are served from
            # .../coinfo/AnnaulReport{year}/?regnum=... instead; only the
            # a{year} form is queried here, so earlier years presumably
            # report a non-200 status -- confirm before relying on them.
            page_url = f"https://cbr.ru/banking_sector/credit/coinfo/a{year}/?regnum={regnum}"

            # Fetch the page once and reuse the response for the status
            # check, the log line and the parse.
            response = requests.get(page_url)
            print(response.status_code, year)
            if response.status_code != 200:
                continue

            # Pass an encoding to the parser only when the server declared
            # one; otherwise let BeautifulSoup detect it from the document.
            content_type = response.headers.get("Content-Type", "")
            declared_charset = response.encoding if "charset" in content_type.lower() else None
            soup = bs(response.content, 'html.parser', from_encoding=declared_charset)
            downloadfile(soup, year, regnum)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement