Advertisement
Danila_lipatov

Parser_cont_test

Oct 5th, 2022 (edited)
128
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.60 KB | None | 0 0
import io
import parser
from datetime import datetime as dt
from unicodedata import normalize

import bs4
import lxml
import numpy as np
import openpyxl as pxl
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
  11.  
  12. headers = []
  13. df = pd.DataFrame()
  14. df_why = pd.DataFrame()
  15. dict_lin = {}
  16. for_iterate = []
  17. #######TODO UNDERSTAND HOW TO PARSE WEBPAGE AND GET TOTAL COUNT OF PAGES
  18.  
  19. k = 0
  20. for g in range(1, 21):
  21. #URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
  22. URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"
  23. #r = requests.get(URL_TEMPLATE)
  24.  
  25. r = requests.get(URL_TEMPLATE, headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
  26. timeout=15)
  27. print(r.content)
  28. soup = bs(r.text, "html.parser")
  29.  
  30. df_temp = pd.read_html(URL_TEMPLATE, encoding= 'utf8')
  31.  
  32. for i in df_temp:
  33. df = pd.concat([df, i], axis=0)
  34. #df = i.drop(index=[i[i['причина'] == 'ликв.'].index])
  35.  
  36. vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
  37. count = 3
  38. #for i in vacancies_names.find_all('strong'):
  39. for i, td in enumerate(vacancies_names.find_all('td')):
  40. if i == count:
  41. for_iterate.append(td.text)
  42. count += 6
  43.  
  44. for td in (vacancies_names.find_all('strong')):
  45. if for_iterate[k] == 'отозв.':
  46. headers.append(td.a['href'])
  47. k += 1
  48. #if a == 'отозв.':
  49. #print(td.a['href'], count)
  50. #count += 1
  51. #print(driver.find_element(By.XPATH, "/html/body/div[2]/div[1]/table/tbody/tr[2]/td[4]"), count)
  52. #print(vacancies_names.find_all('td'))
  53. #if vacancies_names.find_element(By.XPATH, "/html/body/div[2]/div[1]/table/tbody/tr[2]/td[4]") == r'<td>ликв.</td>':
  54. #title = i.a['href']
  55. #headers.append(i.a['href'])
  56. stop = 0
  57.  
  58.  
  59. df = df.reset_index(drop=True)
  60.  
  61. for throw in df[df['причина'] == 'ликв.'].index:
  62. df = df.drop(index=[throw])
  63.  
  64. df = df.drop_duplicates(subset=['номер лицензии'])
  65. df = df.reset_index(drop=True)
  66.  
  67. for link in headers:
  68. last = []
  69. df_temp_2 = pd.DataFrame()
  70. str_temp = ''
  71. url_banki = f"https://www.banki.ru{link}"
  72. r_ = requests.get(url_banki)
  73. soup_ = bs(r_.text, "lxml")
  74. #vacancies_text = soup_.find_all('dl', class_='definition-list padding-default')
  75. #for i in vacancies_text.find_all('dd', class_='margin-bottom-zero'):
  76. if link not in dict_lin:
  77. dict_lin[f"https://www.banki.ru{link}"] = []
  78. #ar = i
  79. for i in soup_.find_all("dd", {"class": "margin-bottom-zero"}):
  80. #ar = i.text.strip()
  81. ai = i.text
  82. #print(str(i.text.strip()), last)
  83. """str_temp += ar
  84. stop = 0"""
  85. ai = ai.replace(u'\n', u' ')
  86. ai = ai.replace(u'\xa0', u' ')
  87. str_temp += ai
  88. dict_lin[f"https://www.banki.ru{link}"].append(str(str_temp))
  89. #df_temp_2 = pd.DataFrame(last)
  90. #df_why = pd.concat([df_why, df_temp_2], axis= 0)
  91.  
  92.  
  93.  
  94. stop = 0
  95. data = list(dict_lin.items())
  96. an_array = np.array(data, dtype=object)
  97. print(an_array)
  98. df_why= pd.DataFrame(an_array)
  99.  
  100. ## save to xlsx file
  101.  
  102. #filepath = 'my_excel_file.xlsx'
  103.  
  104. #df.to_excel(filepath, index=False)
  105. #df_why.to_excel("somth_6.xlsx", sheet_name='Sheet1', index=False, header=True)
  106. # headers.append(title)
  107. df = pd.concat([df, df_why], axis= 1)
  108.  
  109. df = df.set_index('Unnamed: 0')
  110. df.rename(columns={0:'link', 1: 'text'}, inplace=True)
  111. df.to_excel("somth_12.xlsm", sheet_name='Sheet1',index=False, header=True)
# ----- VBA (Excel macro) below; this part is not Python -----
  113.  
  114.  
  115. Sub check()
  116. Dim capital(3) As Variant
  117. capital(0) = "êàïèòàë"
  118. capital(1) = "ðåçåðâ"
  119. capital(2) = "äîñîçä"
  120. capital(3) = "íåäîñîçä"
  121. Dim j As Integer
  122.  
  123. Dim c As Range
  124. Dim firstAddress As String
  125.  
  126. With Worksheets(1).Range("G2:G754")
  127. Set c = .Find("êàïèòàë", LookIn:=xlValues)
  128. If Not c Is Nothing Then
  129. firstAddress = c.Address
  130. Do
  131. Cells(c.Row, 8).Value = 1
  132.  
  133. Set c = .FindNext(c)
  134. Loop While c.Row < 754
  135. ElseIf c Is Nothin Then
  136. firstAddress = c.Address
  137. Do
  138. Cells(c.Row, 8).Value = 0
  139.  
  140. Set c = .FindNext(c)
  141. Loop While c.Row < 754
  142. End If
  143. End With
  144.  
  145.  
  146.  
  147. For i = 2 To 754
  148. If Cells(i, 8).Value <> 1 Then
  149. Cells(i, 8).Value = 0
  150. End If
  151. Next i
  152.  
  153. End Sub
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement