import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup as bs

# lxml and openpyxl must be installed (pd.read_html and df.to_excel use them),
# but they do not need to be imported explicitly.
def getlink(table, count, k):
    """Collect links to banks whose licence status is 'отозв.' ("revoked").

    Every 6th <td> starting at index `count` holds the status ("reason") cell,
    and the <strong> tags in the same table hold the bank links in row order.
    """
    for_iterate = []  # status values, one per table row
    headers = []      # hrefs of banks with a revoked licence
    for i, td in enumerate(table.find_all('td')):
        if i == count:
            for_iterate.append(td.text)
            count += 6  # the listing has 6 columns, so jump to the next row
    for td in table.find_all('strong'):
        if for_iterate[k] == 'отозв.':  # keep only revoked licences
            headers.append(td.a['href'])
        k += 1
    return (headers, k)
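# A hedged alternative sketch: instead of pairing every 6th <td> with the
# <strong> tags by position, walk the table row by row. The column index of
# the "reason" cell and the 'отозв.' ("revoked") marker are assumptions taken
# from the offsets used in getlink() above; verify them against the live
# banki.ru markup before relying on this version.
def getlink_by_rows(table):
    links = []
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 4:
            continue  # skip header or malformed rows
        reason = cells[3].get_text(strip=True)  # assumed position of the "reason" column
        strong = row.find('strong')
        if reason == 'отозв.' and strong is not None and strong.a is not None:
            links.append(strong.a['href'])
    return links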
headers = []             # links to individual bank pages
df = pd.DataFrame()      # listing of closed banks from banki.ru
df_why = pd.DataFrame()  # per-bank text explaining the revocation
dict_lin = {}            # bank page URL -> collected notice text
k = 0

# TODO: understand how to parse the webpage and get the total count of pages
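# A minimal sketch for the TODO above: read the page count from the pager links
# on the first listing page instead of hard-coding the range. The only firm
# assumption carried over from this script is the PAGEN_1 query parameter; the
# idea that the pager exposes it in <a href> attributes is an assumption about
# the page markup.
def get_page_count():
    import re  # local import keeps this optional sketch self-contained
    first = requests.get("https://www.banki.ru/banks/memory/?PAGEN_1=1", timeout=15)
    pager = bs(first.text, "html.parser")
    numbers = [
        int(m.group(1))
        for a in pager.find_all("a", href=True)
        if (m := re.search(r"PAGEN_1=(\d+)", a["href"]))
    ]
    return max(numbers) if numbers else 1

# Possible usage (not enabled here): for g in range(1, get_page_count() + 1): ...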
for g in range(1, 3):  # pages of the "memory book" listing
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
    r = requests.get(
        URL_TEMPLATE,
        headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
        timeout=15,
    )
    soup = bs(r.text, "html.parser")
    # Parse the already-downloaded HTML instead of fetching the URL a second time.
    df_temp = pd.read_html(r.text)
    for i in df_temp:
        df = pd.concat([df, i], axis=0)
    vacancies_names = soup.find(
        'table',
        class_="standard-table standard-table--row-highlight margin-bottom-default",
    )
    count = 3  # index of the first "reason" cell in the flattened <td> list
    # k stays 0: getlink() rebuilds its status list from scratch for every page.
    headers += getlink(vacancies_names, count, k)[0]
df = df.reset_index(drop=True)
# Keep only banks whose licence was revoked: drop rows where the reason
# ('причина') is voluntary liquidation ('ликв.').
df = df[df['причина'] != 'ликв.']
df = df.drop_duplicates(subset=['номер лицензии'])  # dedupe by licence number
df = df.reset_index(drop=True)
for link in headers:
    # Fetch each bank's page and collect the text of the revocation notice.
    str_temp = ''
    url_banki = f"https://www.banki.ru{link}"
    r_ = requests.get(url_banki, timeout=15)
    soup_ = bs(r_.text, "lxml")
    if url_banki not in dict_lin:
        dict_lin[url_banki] = []
    for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):
        ai = i.text
        ai = ai.replace(u'\n', u' ')    # flatten line breaks
        ai = ai.replace(u'\xa0', u' ')  # replace non-breaking spaces
        str_temp += ai
    dict_lin[url_banki].append(str_temp)
# Turn the url -> text mapping into a two-column frame, attach it to the bank
# table, and save everything to a single Excel file.
data = list(dict_lin.items())
an_array = np.array(data, dtype=object)
df_why = pd.DataFrame(an_array)

df = pd.concat([df, df_why], axis=1)
df = df.set_index('Unnamed: 0')  # drop the site's row-number column from the output
df.rename(columns={0: 'link', 1: 'text'}, inplace=True)
df.to_excel("somth_12.xlsx", sheet_name='Sheet1', index=False, header=True)