Danila_lipatov

Final_parse

Oct 12th, 2022 (edited)
import pandas as pd
import numpy as np
import lxml                 # needed by the "lxml" parser passed to BeautifulSoup below
import openpyxl as pxl      # not used directly; handy if the output is written to .xlsx instead of .csv
import requests
from bs4 import BeautifulSoup as bs

#### for_new_task
### the per-bank CBR page looks like https://cbr.ru/banking_sector/credit/coinfo/a2020/?regnum=600
### (a minimal fetch sketch for that page follows)

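# --- Hedged sketch, not part of the original flow: fetching the CBR page above for one
# --- registration number. The year segment ("a2020") and regnum=600 come from the example
# --- URL in the comment; how the returned page should be parsed is left open, since its
# --- layout is not described in this paste.
def fetch_cbr_page(regnum):
    url = f"https://cbr.ru/banking_sector/credit/coinfo/a2020/?regnum={regnum}"
    resp = requests.get(url, timeout=15)
    resp.raise_for_status()
    return bs(resp.text, "lxml")

# example usage: cbr_soup = fetch_cbr_page(600)
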
headers = []            # collected links to the per-bank pages
df = pd.DataFrame()     # output DataFrame
df_why = pd.DataFrame() # DataFrame of links + revocation texts
dict_lin = {}           # dict used to build df_why: {link: [text]}
for_iterate = []        # list of reasons ('ликв.' or 'отозв.')
k = 0                   # running index into the reasons list


def Getlink(table, count, k):               # returns a tuple: (links for revoked licences, final index k)
    for_iterate_temp = []       # reasons found on this page
    headers_temp = []           # links found on this page
    for i, td in enumerate(table.find_all('td')):
        if i == count:                          # the reason sits in every 6th cell,
            for_iterate_temp.append(td.text)    # starting from the cell at index `count`
            count += 6

    for td in table.find_all('strong'):          # one <strong> (bank name) per table row
        if for_iterate_temp[k] == 'отозв.':      # keep only revoked ('отозв.') licences
            headers_temp.append(td.a['href'])    # save the link to the bank's page
        k += 1
    return (headers_temp, k)
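
# --- Hedged alternative sketch, not used below: the same 'причина' (reason) values are
# --- already available as a column of the DataFrame that pd.read_html builds from this
# --- table, so the every-6th-cell walk can be skipped. Assumption: the DataFrame rows
# --- line up one-to-one with the <strong> cells of the table, which is what the indexing
# --- in Getlink already presumes.
def get_links_from_df(table, page_df):
    links = []
    for reason, strong in zip(page_df['причина'], table.find_all('strong')):
        if reason == 'отозв.':
            links.append(strong.a['href'])
    return links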


def GetDictoflinks(headers, dict_lin):              # build {full banki.ru URL: [press-release text]}
    for link in headers:
        str_temp = ''                                   # accumulates the whole release text
        url_banki = f"https://www.banki.ru{link}"       # page with the revocation details
        r_ = requests.get(url_banki)
        soup_ = bs(r_.text, "lxml")
        if url_banki not in dict_lin:                   # guard against duplicate links on the site
            dict_lin[url_banki] = []
        for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):    # blocks holding the release text
            ai = i.text                     # text of one block
            ai = ai.replace(u'\n', u' ')    # strip line breaks
            ai = ai.replace(u'\xa0', u' ')  # strip non-breaking spaces
            str_temp += ai                  # concatenate the pieces
        dict_lin[url_banki].append(str_temp)            # store the full text under its URL

    return dict_lin


def GetFullDF(df, dict_lin):                # build the final output DataFrame for export
    data = list(dict_lin.items())           # dict -> list of (link, text) tuples
    an_array = np.array(data, dtype=object) # object array so the strings survive the conversion
    df_why = pd.DataFrame(an_array)         # DataFrame of links + texts

    df = pd.concat([df, df_why], axis=1)    # glue it onto the table scraped earlier

    return df


def DropReason(df):
    df = df.reset_index(drop=True)

    df = df[df['причина'] != 'ликв.']                     # drop rows where the reason is 'ликв.' (liquidated)

    df = df.drop_duplicates(subset=['номер лицензии'])    # drop duplicate licence numbers
    df = df.reset_index(drop=True)
    return df

####### TODO: work out how to detect the total number of pages instead of hard-coding range(1, 3) below (still open; a hedged sketch follows)
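# --- Hedged sketch toward that TODO, not used below: keep requesting pages until the
# --- results table disappears or comes back empty. Assumption: banki.ru stops serving
# --- the "standard-table..." table (or serves an empty one) once PAGEN_1 runs past the
# --- last page; the site's actual out-of-range behaviour has not been verified here.
def count_pages(max_pages=100):
    total = 0
    for page in range(1, max_pages + 1):
        resp = requests.get(f"https://www.banki.ru/banks/memory/?PAGEN_1={page}", timeout=15)
        page_soup = bs(resp.text, "html.parser")
        page_table = page_soup.find(
            'table', class_="standard-table standard-table--row-highlight margin-bottom-default")
        if page_table is None or not page_table.find_all('td'):
            break
        total = page
    return total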

for g in range(1, 3):
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"                # page holding the table

    r = requests.get(URL_TEMPLATE,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    soup = bs(r.text, "html.parser")

    df_temp = pd.read_html(URL_TEMPLATE, encoding='utf8')          # list of DataFrames parsed from the page

    for i in df_temp:
        df = pd.concat([df, i], axis=0)             # stack every table found on the page

    table = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
    count = 3                                   # index of the first 'причина' (reason) cell
    headers += Getlink(table, count, k)[0]      # collect the links for revoked licences
    #k = Getlink(table, count, k)[1]            # not needed: k restarts from 0 on every page

df = DropReason(df)

dict_lin = GetDictoflinks(headers, dict_lin)

df = GetFullDF(df, dict_lin)

df = df.set_index('Unnamed: 0')                             # drop the useless numbering column
df.rename(columns={0: 'link', 1: 'text'}, inplace=True)     # give the new columns readable names
df.to_csv("name.csv", index=False, header=True)             # saved as CSV because df.to_excel gave some trouble

# TODO after the output CSV file is created (manual Excel import; a direct-to-xlsx sketch follows below):
#   open (or create) any xlsx file,
#   go to the Data tab,
#   choose "Get data from Text/CSV file",
#   pick UTF-8 as the encoding,
#   and load the table.
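
# --- Hedged sketch of the direct-to-xlsx alternative mentioned above, not part of the
# --- original flow. Assumptions: openpyxl (imported at the top) is installed, and the
# --- file name "name.xlsx" and sheet name "banks" are placeholder choices.
WRITE_XLSX = False          # flip to True to try the direct export
if WRITE_XLSX:
    df.to_excel("name.xlsx", sheet_name="banks", index=False, engine="openpyxl")

# If the CSV route is kept, writing it with a BOM lets Excel detect UTF-8 by itself:
#   df.to_csv("name.csv", index=False, header=True, encoding="utf-8-sig")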

# THAT'S ALL