Danila_lipatov

test_parse

Oct 11th, 2022
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# lxml and openpyxl are not imported directly, but both need to be installed:
# lxml backs the "lxml" parser and pd.read_html, openpyxl backs DataFrame.to_excel.


def getlink(table, count, k):
    """Collect links to the pages of banks whose licence status is 'отозв.' (revoked).

    `count` is the index of the first status cell in the flattened <td> list
    (the table has six columns per row); `k` is the running row counter.
    """
    for_iterate = []
    headers = []
    # every sixth <td>, starting at `count`, holds the licence status of one table row
    for i, td in enumerate(table.find_all('td')):
        if i == count:
            for_iterate.append(td.text)
            count += 6

    # the bank-name link of each row sits inside a <strong> tag
    for td in table.find_all('strong'):
        if for_iterate[k] == 'отозв.':
            headers.append(td.a['href'])
        print(for_iterate[k])
        print(k)
        k += 1
    return headers, k


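# Sketch only (not called anywhere in this script): an alternative to getlink() that
# walks the memory-book table row by row, so the status cell and the bank link of the
# same <tr> stay paired even if the column layout shifts. It assumes the current
# banki.ru markup, where the bank link is the first <a> inside a <strong> tag of the row.
def getlink_by_rows(table, status_text='отозв.'):
    links = []
    for row in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in row.find_all('td')]
        strong = row.find('strong')
        if strong is not None and strong.a is not None and status_text in cells:
            links.append(strong.a['href'])
    return links
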
headers = []
df = pd.DataFrame()
df_why = pd.DataFrame()
dict_lin = {}
for_iterate = []
k = 0
# TODO: detect the total number of pages instead of hard-coding the range below;
# one possible approach is sketched right after this block.
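# Sketch for the TODO above (not wired into the loop below): take the largest PAGEN_1
# value that appears among the pagination links of the first page. This assumes the
# pager keeps its ?PAGEN_1=<n> query parameter; if no such link is found, fall back to 1.
import re

def get_total_pages(first_page_html):
    soup_pages = bs(first_page_html, "html.parser")
    page_numbers = []
    for a in soup_pages.find_all('a', href=True):
        match = re.search(r'PAGEN_1=(\d+)', a['href'])
        if match:
            page_numbers.append(int(match.group(1)))
    return max(page_numbers) if page_numbers else 1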

for g in range(1, 3):  # TODO: replace the hard-coded upper bound with the detected page count
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"

    r = requests.get(URL_TEMPLATE,
                     headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    # print(r.content)  # debug: dump the raw response
    soup = bs(r.text, "html.parser")

    # parse every table on the page from the HTML that was already fetched
    # (passing r.text avoids the second request that pd.read_html(URL_TEMPLATE) would make)
    df_temp = pd.read_html(r.text)
    for i in df_temp:
        df = pd.concat([df, i], axis=0)

    # the memory-book table itself, used to pull the per-bank links
    vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
    count = 3  # index of the first licence-status <td> in the flattened cell list
    headers += getlink(vacancies_names, count, k)[0]
    # k = getlink(vacancies_names, count, k)[1]
df = df.reset_index(drop=True)

# keep only revoked licences: drop liquidation ('ликв.') rows, then drop duplicate licence numbers
df = df[df['причина'] != 'ликв.']
df = df.drop_duplicates(subset=['номер лицензии'])
df = df.reset_index(drop=True)

for link in headers:
    str_temp = ''
    url_banki = f"https://www.banki.ru{link}"
    r_ = requests.get(url_banki, timeout=15)
    soup_ = bs(r_.text, "lxml")
    if url_banki not in dict_lin:
        dict_lin[url_banki] = []
    # the text explaining the revocation sits in <dd class="margin-bottom-zero"> blocks;
    # newlines and non-breaking spaces are flattened into plain spaces
    for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):
        ai = i.text
        ai = ai.replace('\n', ' ')
        ai = ai.replace('\xa0', ' ')
        str_temp += ai
    dict_lin[url_banki].append(str_temp)


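# Optional refinement, sketched but not called above: unicodedata's NFKC normalisation
# folds non-breaking spaces (\xa0) and similar compatibility characters in one pass,
# instead of replacing each character individually. Hypothetical helper:
from unicodedata import normalize

def clean_text(raw):
    return normalize('NFKC', raw).replace('\n', ' ').strip()
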
data = list(dict_lin.items())
an_array = np.array(data, dtype=object)
print(an_array)
df_why = pd.DataFrame(an_array)  # column 0: bank page link, column 1: the scraped reason text (one-element list per bank)

# save to xlsx file
# note: the two frames are glued together by row position (RangeIndex), not matched by bank
df = pd.concat([df, df_why], axis=1)

df = df.set_index('Unnamed: 0')
df.rename(columns={0: 'link', 1: 'text'}, inplace=True)
# saved as .xlsx: to_excel writes a plain (macro-free) workbook, so the original
# .xlsm extension would produce a file Excel may refuse to open
df.to_excel("somth_12.xlsx", sheet_name='Sheet1', index=False, header=True)
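# Optional sanity check (assumes the file written above exists): read the sheet back
# and eyeball the first rows to confirm the link/text columns landed next to the table data.
check = pd.read_excel("somth_12.xlsx", sheet_name='Sheet1')
print(check.head())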