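# Parser: downloads rar/zip/pdf documents linked from cbr.ru credit-organisation pages, per license number and year.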

import os
from bs4 import BeautifulSoup as bs
import urllib.request
import requests
from selenium.webdriver.common.by import By
import xml
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


def downloadfile(soup, year, line):
    """Download every rar/zip/pdf document linked from a parsed page.

    Each ``<div class="document-regular_name">`` found in *soup* is
    inspected; when its first link ends in ``rar``, ``zip`` or ``pdf`` the
    file is fetched from ``https://cbr.ru<href>`` and stored as
    ``<line>/<year>/<line><href with "/" replaced by "_">``.

    Args:
        soup: Parsed page. Only ``find_all`` is called on it (the script
            passes a BeautifulSoup document).
        year: Report year; used as the sub-folder name.
        line: License number; used as the top-level folder name and as the
            file-name prefix.

    Returns:
        None. Files are written to disk as a side effect.
    """
    wanted_suffixes = ('rar', 'zip', 'pdf')
    target_dir = os.path.join(f"{line}", f"{year}")

    for entry in soup.find_all('div', class_="document-regular_name"):
        anchor = entry.a
        # Entries without a link (or without an href) cannot be downloaded.
        href = anchor.get('href') if anchor is not None else None
        if not href or not href.endswith(wanted_suffixes):
            continue

        # Create the year folder lazily, only once there is a file for it.
        os.makedirs(target_dir, exist_ok=True)

        url = f"https://cbr.ru{href}"
        # Flatten the link path into a single file name; the parsed document
        # itself is left unmodified.
        flat_name = href.replace("/", "_")
        # Single download: the file is fetched once, straight to disk.
        urllib.request.urlretrieve(url, os.path.join(target_dir, f"{line}{flat_name}"))

# Walk the license list and, for every license number and year, fetch the
# matching cbr.ru page and download the documents it links to.
with open("list_of_lict", "r") as licenses_file:  # one license number per line
    for line in licenses_file:
        # Drop quotes and surrounding whitespace (including the newline).
        regnum = line.replace('"', "").strip()
        if not regnum:
            continue  # blank line: nothing to look up

        # exist_ok so a re-run does not crash on folders left by a previous run.
        os.makedirs(regnum, exist_ok=True)

        for year in range(2007, 2022 + 1):
            # NOTE(review): the original author's note says this a{year} path
            # serves 2018 onward, and that years up to and including 2017 use
            # .../coinfo/AnnaulReport{year}/?regnum=... instead. The loop still
            # requests the a{year} form for every year - TODO confirm.
            page_url = f"https://cbr.ru/banking_sector/credit/coinfo/a{year}/?regnum={regnum}"

            # Probe the page once and reuse the status for both the log line
            # and the decision below.
            status = requests.get(page_url).status_code
            print(status, year)
            if status != 200:
                continue

            # The soup is built while the response is open; the connection is
            # closed as soon as parsing is done.
            with urllib.request.urlopen(page_url) as resp:
                soup = bs(resp, 'html.parser', from_encoding=resp.info().get_param('charset'))

            downloadfile(soup, year, regnum)