Advertisement
Danila_lipatov

parser_of_zip/pdf

Apr 23rd, 2023
120
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.02 KB | None | 0 0
  1. import os
  2. from bs4 import BeautifulSoup as bs
  3. import urllib.request
  4. import requests
  5. from selenium.webdriver.common.by import By
  6. import xml
  7. import ssl
  8.  
  9. ssl._create_default_https_context = ssl._create_unverified_context
  10.  
  11.  
  12. count = 0
  13. with open(f"list_of_lict", "r") as out:
  14.     for line in out:
  15.         line = line.replace('"', "")
  16.         line = line.replace(u"\n", "")
  17.         #os.mkdir(f"{line}")
  18.         for year in range(2007, 2022 + 1):
  19.             #URL_TEMPLATE = f"{url}"  # url from 2018 year
  20.             URL_TEMPLATE = f"{url}" #This url to 2017(include)
  21.             if requests.get(URL_TEMPLATE).status_code == 200:
  22.                 subfolder_name = f"{year}"
  23.                 print(requests.get(URL_TEMPLATE).status_code, year, line)
  24.                 # count += 1
  25.                 parser = 'html.parser'
  26.                 resp = urllib.request.urlopen(URL_TEMPLATE)
  27.                 soup = bs(resp, parser, from_encoding=resp.info().get_param('charset'))
  28.  
  29.                 for i in soup.find_all('div', class_="document-regular_name"):
  30.                     count += 1
  31.                     if i.a['href'].endswith('rar') or i.a['href'].endswith('zip') or i.a['href'].endswith('pdf'):
  32.                         if os.path.isdir(os.path.join(f"{line}", subfolder_name)) != True:
  33.                             os.makedirs(os.path.join(f"{line}", subfolder_name))
  34.                         # print(f"{url}{i.a['href']}")
  35.                         url = f"{url}{i.a['href']}"
  36.                         r_ = requests.get(url, allow_redirects=True)
  37.                         i.a['href'] = i.a['href'].replace(u"/", u"_")
  38.                         # i.a['href'] = i.a['href'].replace(u"\\\\", u"\\")
  39.  
  40.                         # open(f"{line}_{i.a['href']}", 'wb').write(r_.content)
  41.                         urllib.request.urlretrieve(url, f"{line}/{year}/{line}{i.a['href']}")
  42.                     # print(i.a['href'].endswith('fun'), count)
  43.                 # count += 1
  44.             else:
  45.                 print(requests.get(URL_TEMPLATE).status_code, year)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement