Advertisement
Danila_lipatov

Parser

Oct 24th, 2022 (edited)
202
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.12 KB | None | 0 0
  1. import os
  2. from bs4 import BeautifulSoup as bs
  3. import urllib.request
  4. import requests
  5. from selenium.webdriver.common.by import By
  6. import xml
  7. import ssl
  8. ssl._create_default_https_context = ssl._create_unverified_context
  9.  
  10.  
  11. def downloadfile(soup, year, line): #func of download files
  12. subfolder_name = f"{year}"
  13. for i in soup.find_all('div', class_="document-regular_name"):
  14. if i.a['href'].endswith('rar') or i.a['href'].endswith('zip') or i.a['href'].endswith('pdf'):
  15. if os.path.isdir(
  16. (os.path.join(f"{line}", subfolder_name))) != True: # if there is no folder with year, create it
  17. os.makedirs(os.path.join(f"{line}", subfolder_name))
  18.  
  19. url = f"https://cbr.ru{i.a['href']}" # get url of download file
  20. r_ = requests.get(url, allow_redirects=True)
  21. i.a['href'] = i.a['href'].replace(u"/", u"_") # change link for rename filename
  22. urllib.request.urlretrieve(url, f"{line}/{year}/{line}{i.a['href']}")
  23.  
  24. with open(f"list_of_lict", "r") as out: #read file with licenses
  25. for line in out:
  26. line = line.replace('"', "") #replace " and \n
  27. line = line.replace(u"\n", "")
  28. os.mkdir(f"{line}") #create folder with licenses names
  29. for year in range(2007, 2022 + 1): #iterate over years
  30. URL_TEMPLATE = f"https://cbr.ru/banking_sector/credit/coinfo/a{year}/?regnum={line}" # url for search company from 2018 year
  31. #URL_TEMPLATE = f"https://cbr.ru/banking_sector/credit/coinfo/AnnaulReport{year}/?regnum={line}" #This url to 2017(include)
  32. if requests.get(URL_TEMPLATE).status_code == 200:
  33. #subfolder_name = f"{year}"
  34. print(requests.get(URL_TEMPLATE).status_code, year)
  35. parser = 'html.parser'
  36. resp = urllib.request.urlopen(URL_TEMPLATE)
  37. soup = bs(resp, parser, from_encoding=resp.info().get_param('charset'))
  38.  
  39. downloadfile(soup, year, line)
  40.  
  41. else:
  42. print(requests.get(URL_TEMPLATE).status_code, year)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement