import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime as dt  # only used in the archived experiment at the bottom
from io import StringIO
from unicodedata import normalize

# lxml backs the BeautifulSoup/read_html parsing and openpyxl is the engine
# pandas uses for .to_excel(); importing them here fails fast if they are missing.
import lxml
import openpyxl
revoked_links = []   # hrefs of bank pages whose licence was revoked ('отозв.')
reasons = []         # the 'причина' (reason) cell of every table row, in page order
details = {}         # full bank-page URL -> list of detail strings from that page
df = pd.DataFrame()  # accumulated licence table across all paginated pages
# TODO: derive the total page count from the site itself instead of
# hard-coding range(1, 21) below. An earlier probe walked the
# "layout-wrapper padding-top-default bg-white position-relative" div on
# https://www.banki.ru/banks/memory/ and printed its children, but never
# found a reliable counter.
k = 0  # global index into reasons, kept in step with the <strong> tags below
for g in range(1, 21):  # 20 result pages (see TODO above)
    url = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
    r = requests.get(url,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    soup = bs(r.text, "html.parser")
    # Let pandas parse the licence table, reusing the response body instead
    # of fetching the same page a second time.
    for table in pd.read_html(StringIO(r.text)):
        df = pd.concat([df, table], axis=0)
    licence_table = soup.find(
        'table',
        class_="standard-table standard-table--row-highlight margin-bottom-default")
    # The 'причина' (reason) column is the 4th of 6 cells in every row,
    # so grab cells 3, 9, 15, ... of the flat <td> list.
    count = 3
    for i, td in enumerate(licence_table.find_all('td')):
        if i == count:
            reasons.append(td.text)
            count += 6
    # Each bank name sits in <strong><a href=...></strong>; keep the link
    # only when the row's reason is 'отозв.' (licence revoked).
    for strong in licence_table.find_all('strong'):
        if reasons[k] == 'отозв.':
            revoked_links.append(strong.a['href'])
        k += 1
df = df.reset_index(drop=True)
# Drop liquidated banks ('ликв.') and de-duplicate by licence number
# ('номер лицензии').
df = df[df['причина'] != 'ликв.']
df = df.drop_duplicates(subset=['номер лицензии'])
df = df.reset_index(drop=True)
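# A more defensive sketch of the reason/link pairing above: walk each <tr>
# and read its own cells, so a stray <strong> or a short row cannot
# desynchronise the two flat lists. The cell positions (name link in cell 0,
# reason in cell 3) are assumptions carried over from the stride-6 logic.
def revoked_links_from(table):
    links = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) < 4:
            continue  # header or malformed row
        a = cells[0].find('a')
        if a is not None and cells[3].get_text(strip=True) == 'отозв.':
            links.append(a['href'])
    return links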
for link in revoked_links:
    url_banki = f"https://www.banki.ru{link}"
    if url_banki in details:
        continue  # each bank page is scraped once
    r_ = requests.get(url_banki, timeout=15)
    soup_ = bs(r_.text, "lxml")
    # The bank card lists its facts in <dd class="margin-bottom-zero"> tags;
    # collect them all, letting NFKC normalisation fold \xa0 non-breaking
    # spaces into plain spaces.
    details[url_banki] = [
        normalize('NFKC', dd.get_text(" ", strip=True))
        for dd in soup_.find_all('dd', class_='margin-bottom-zero')
    ]
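# The loops above fire 20 + len(revoked_links) sequential requests. A small
# politeness sketch, assuming banki.ru tolerates slow polling: reuse one
# connection via a Session and pause between hits. The 0.5 s delay is an
# arbitrary choice, not a documented rate limit.
import time

session = requests.Session()
session.headers['Accept'] = 'text/html,application/xhtml+xml'

def polite_get(url, pause=0.5):
    resp = session.get(url, timeout=15)
    time.sleep(pause)
    return resp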
# Turn the {url: [detail, ...]} mapping into a two-column frame (the column
# names are added here for readability) and glue it onto the licence table.
# Note: the axis=1 concat assumes df_why's rows line up 1:1 with the
# de-duplicated, revoked-only rows of df.
data = list(details.items())
df_why = pd.DataFrame(np.array(data, dtype=object), columns=['ссылка', 'детали'])
df = pd.concat([df, df_why], axis=1)
df.to_excel("somth_12.xlsx", sheet_name='Sheet1', index=False, header=True)
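# Quick sanity check of the saved workbook: read it back and confirm the
# shape and column names survived the round trip.
check = pd.read_excel("somth_12.xlsx")
print(check.shape)
print(check.columns.tolist())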
- """
- URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
- r = requests.get(URL_TEMPLATE)
- soup = bs(r.text, "html.parser")
- vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
- #vacancies_info = soup.find('th', class_="th-sortable", title="дата отзыва")
- #print(vacancies_names.text)
- g = 1
- for i in vacancies_names.find_all('td'):
- #title = i.text
- #print(title)
- #print(i.text)
- headers.append(i.text)
- stop = 0
- g += 1
- #headers.append(title)
- dates_count = 4
- count = 0
- for i in range(len(headers)):
- dates_count += 6
- if dt.strptime(headers[dates_count], '%d.%m.%Y') > dt.strptime("01.01.2005", '%d.%m.%Y'):
- print(headers[dates_count])
- count += 1
- print(count)
- stop = 0 """
- """vacancies_info = soup.find_all('tr', class_="standard-table standard-table--row-highlight margin-bottom-default")
- for info in vacancies_info:
- print(info)"""