import pandas as pd
import numpy as np
import lxml
import openpyxl as pxl
from datetime import datetime as dt
import requests
import bs4
from bs4 import BeautifulSoup as bs
from unicodedata import normalize
# For the new task
# url is https://cbr.ru/banking_sector/credit/coinfo/a2020/?regnum=600
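# A possible starting point for the new task noted above: fetch the CBR credit-organisation page
# for a given registration number. The year segment "a2020" and regnum=600 come from the URL in
# the comment; treating them as parameters is an assumption, not a verified layout of cbr.ru.
def GetCbrPage(regnum=600, year=2020):
    url_cbr = f"https://cbr.ru/banking_sector/credit/coinfo/a{year}/?regnum={regnum}"
    r_cbr = requests.get(url_cbr, timeout=15)
    return bs(r_cbr.text, "html.parser")  # the caller decides what to pull from the page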
headers = []             # list of links to bank pages
df = pd.DataFrame()      # output DataFrame
df_why = pd.DataFrame()  # DataFrame of links + text
dict_lin = {}            # dict used to build the DataFrame described above
for_iterate = []         # list of licence statuses: 'ликв.' (liquidated) or 'отозв.' (revoked)
k = 0                    # index for iterating over the status list
def Getlink(table, count, k):  # returns a tuple: (list of links, updated index)
    for_iterate_temp = []  # per-call list of licence statuses
    headers_temp = []      # per-call list of links
    # every 6th <td>, starting at offset `count`, holds the licence status
    for i, td in enumerate(table.find_all('td')):
        if i == count:
            for_iterate_temp.append(td.text)
            count += 6
    # each <strong> wraps the bank link; keep it only when the licence was revoked
    for td in table.find_all('strong'):
        if for_iterate_temp[k] == 'отозв.':
            headers_temp.append(td.a['href'])  # save the link for this bank
        k += 1
    return (headers_temp, k)
def GetDictoflinks(headers, dict_lin):  # build a dict {url: [press-release text]}
    for link in headers:
        str_temp = ''  # accumulates all release text for one bank
        url_banki = f"https://www.banki.ru{link}"  # page with the revocation details
        r_ = requests.get(url_banki)
        soup_ = bs(r_.text, "lxml")
        if url_banki not in dict_lin:
            dict_lin[url_banki] = []  # key by full URL; the site lists some banks more than once
        for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):  # blocks holding the release text
            ai = i.text  # plain text of the block
            ai = ai.replace(u'\n', u' ')    # drop line breaks
            ai = ai.replace(u'\xa0', u' ')  # drop non-breaking spaces
            str_temp += ai  # concatenate into one string
        dict_lin[url_banki].append(str_temp)  # one combined text per URL
    return dict_lin
def GetFullDF(df, dict_lin):  # build the final output DataFrame for export
    data = list(dict_lin.items())            # convert the dict to a list of (url, texts) pairs
    an_array = np.array(data, dtype=object)  # NumPy array so it converts cleanly to a DataFrame
    df_why = pd.DataFrame(an_array)          # DataFrame of links + texts
    df = pd.concat([df, df_why], axis=1)     # join with the scraped table
    return df
def DropReason(df):
    df = df.reset_index(drop=True)
    df = df[df['причина'] != 'ликв.']                   # keep only revoked licences, drop liquidated ones
    df = df.drop_duplicates(subset=['номер лицензии'])  # drop duplicate licence numbers
    df = df.reset_index(drop=True)
    return df
# TODO: parse the page to find the total number of pages instead of hard-coding the range below
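# A possible way to close the TODO above: scan the pagination links on the first page and take
# the largest page number. This is only a sketch; it assumes the pager links keep the page number
# in the same PAGEN_1 query parameter used by URL_TEMPLATE below.
def GetPageCount():
    import re
    first_page = requests.get("https://www.banki.ru/banks/memory/?PAGEN_1=1", timeout=15)
    soup_first = bs(first_page.text, "html.parser")
    page_numbers = []
    for a in soup_first.find_all("a", href=True):
        m = re.search(r"PAGEN_1=(\d+)", a["href"])  # pagination links carry the page number
        if m:
            page_numbers.append(int(m.group(1)))
    return max(page_numbers) if page_numbers else 1  # fall back to a single page if nothing matches
# If this holds for the live markup, the loop below could become: for g in range(1, GetPageCount() + 1):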
for g in range(1, 3):
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"  # page holding the table of closed banks
    r = requests.get(URL_TEMPLATE,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    # print(r.content)  # debug: dump the raw page if parsing breaks
    soup = bs(r.text, "html.parser")
    df_temp = pd.read_html(URL_TEMPLATE, encoding='utf8')  # list of DataFrames, one per <table> on the page
    for i in df_temp:
        df = pd.concat([df, i], axis=0)  # stack the page tables into one DataFrame
    table = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
    count = 3  # offset of the licence-status column among the <td> cells
    headers += Getlink(table, count, k)[0]  # collect links for banks with revoked licences
df = DropReason(df)
dict_lin = GetDictoflinks(headers, dict_lin)
df = GetFullDF(df, dict_lin)
df = df.set_index('Unnamed: 0')  # drop the useless auto-generated column
df.rename(columns={0: 'link', 1: 'text'}, inplace=True)  # give the new columns readable names
df.to_csv("name.csv", index=False, header=True)  # save as CSV because df.to_excel gave some trouble here
# TODO after the output CSV file is created:
#   open any .xlsx file in Excel (or create one)
#   go to the Data tab
#   choose "From Text/CSV"
#   pick UTF-8 as the encoding
#   load the table
# That's all.
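# The manual import above can be skipped if writing straight to .xlsx works in your environment.
# A minimal sketch using the openpyxl engine already imported at the top; "name.xlsx" is just an
# assumed output file name, and the CSV route above remains the fallback.
try:
    df.to_excel("name.xlsx", index=False, engine="openpyxl")
except Exception as exc:
    print(f"to_excel failed, keep using name.csv: {exc}")  # fall back to the CSV + manual import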