Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import lxml
- import openpyxl as pxl
- import parser
- from datetime import datetime as dt
- import requests
- import bs4
- from bs4 import BeautifulSoup as bs
- from unicodedata import normalize
from io import BytesIO

# Paginated listing that this script scrapes; PAGEN_1 selects the page.
LISTING_URL = "https://www.banki.ru/banks/memory/?PAGEN_1={page}"
# TODO: understand how to parse the webpage and get the total count of pages
# instead of hard-coding the last page number.
LAST_PAGE = 20
REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}
LISTING_TABLE_CLASS = (
    "standard-table standard-table--row-highlight margin-bottom-default"
)
# Zero-based position of the cell compared against REVOKED_MARK in each row.
REASON_CELL = 3
# Rows whose reason cell equals this text contribute a link to `headers`.
REVOKED_MARK = 'отозв.'

# Module-level state shared with the later stages of the script.
headers = []        # hrefs taken from the <strong><a> of every matching row
df = pd.DataFrame()
df_why = pd.DataFrame()
dict_lin = {}
for_iterate = []    # text of the reason cell of every table row, in page order

page_frames = []
for g in range(1, LAST_PAGE + 1):
    URL_TEMPLATE = LISTING_URL.format(page=g)
    r = requests.get(URL_TEMPLATE, headers=REQUEST_HEADERS, timeout=15)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    # Debug aid kept from the original: dumps the raw response body.
    print(r.content)
    soup = bs(r.text, "html.parser")

    # Parse the bytes that were already downloaded. Handing the URL to
    # read_html would fetch the page a second time without the Accept header
    # or the timeout used above.
    page_frames.extend(pd.read_html(BytesIO(r.content), encoding='utf8'))

    listing_table = soup.find('table', class_=LISTING_TABLE_CLASS)
    if listing_table is None:
        raise RuntimeError(f"listing table not found on {URL_TEMPLATE}")

    # Pair the reason cell and the link row by row, so a row without a
    # <strong> link cannot shift the pairing for every row after it.
    for row in listing_table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) <= REASON_CELL:
            continue  # header row or malformed row: no reason cell
        reason = cells[REASON_CELL].text
        for_iterate.append(reason)
        if reason != REVOKED_MARK:
            continue
        strong = row.find('strong')
        anchor = strong.a if strong is not None else None
        if anchor is not None and anchor.has_attr('href'):
            headers.append(anchor['href'])

# One concat after the loop instead of re-copying the frame on every page.
df = pd.concat(page_frames, axis=0)
# Keep only rows whose 'причина' value is not 'ликв.', drop repeated
# 'номер лицензии' values (first occurrence wins) and renumber the rows.
# A boolean mask replaces the former one-row-at-a-time drop loop, which
# copied the whole frame for every removed row.
df = (
    df[df['причина'] != 'ликв.']
    .drop_duplicates(subset=['номер лицензии'])
    .reset_index(drop=True)
)
# Download every collected bank page once and store the concatenated text of
# its <dd class="margin-bottom-zero"> elements in dict_lin, keyed by the
# absolute URL. Each value is a one-element list holding that text.
for link in headers:
    url_banki = f"https://www.banki.ru{link}"
    # The dict is keyed by absolute URL, so the membership test must use the
    # absolute URL too; repeated links are skipped instead of re-downloaded.
    if url_banki in dict_lin:
        continue
    # The timeout keeps one stalled connection from hanging the whole run.
    r_ = requests.get(url_banki, timeout=15)
    soup_ = bs(r_.text, "lxml")
    fragments = [
        # Newlines and non-breaking spaces become plain spaces.
        dd.text.replace('\n', ' ').replace('\xa0', ' ')
        for dd in soup_.find_all("dd", {"class": "margin-bottom-zero"})
    ]
    dict_lin[url_banki] = ["".join(fragments)]
# Turn the {url: [text]} mapping into a two-column frame. The stored value is
# a list; its last element is the page text, and writing that string (not the
# list) keeps Python list syntax out of the spreadsheet cells.
df_why = pd.DataFrame(
    [(url, texts[-1] if texts else "") for url, texts in dict_lin.items()],
    columns=['link', 'text'],
)
print(df_why)

# Put the link/text columns next to the listing columns, matched by row number.
# NOTE(review): this assumes row i of df describes the same bank as row i of
# df_why; nothing in this section enforces that - confirm, or join on a key.
df = pd.concat([df, df_why], axis=1)

# 'Unnamed: 0' is not wanted in the output. The original moved it into the
# index and then wrote with index=False; dropping it says the same directly.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# NOTE(review): pandas writes a macro-free workbook; Excel may refuse to open
# it under the .xlsm extension - confirm the intended target file.
df.to_excel("somth_12.xlsm", sheet_name='Sheet1', index=False, header=True)
- #####vba
Sub check()
    ' Flags rows 2..754 of the first worksheet: column H is set to 1 where the
    ' text in column G contains the keyword capital(0); every other row whose
    ' column H is not already 1 is set to 0.

    Const FIRST_ROW As Long = 2
    Const LAST_ROW As Long = 754
    Const FLAG_COLUMN As Long = 8   ' column H

    ' Keyword list. The literals were cp1251 text mis-decoded as Latin-1 and
    ' are restored to Cyrillic here.
    ' NOTE(review): only capital(0) is searched, as in the original macro;
    ' confirm whether the remaining keywords should be searched as well.
    Dim capital(3) As Variant
    capital(0) = "капитал"
    capital(1) = "резерв"
    capital(2) = "досозд"
    capital(3) = "недосозд"

    Dim ws As Worksheet
    Dim c As Range
    Dim firstAddress As String
    Dim i As Long

    ' Qualify every cell access with the same sheet that is searched, so the
    ' result does not depend on which sheet happens to be active.
    Set ws = Worksheets(1)

    With ws.Range("G" & FIRST_ROW & ":G" & LAST_ROW)
        ' LookAt and MatchCase are stated explicitly because Find otherwise
        ' reuses whatever was last chosen in the Find dialog.
        Set c = .Find(capital(0), LookIn:=xlValues, LookAt:=xlPart, MatchCase:=False)
        If Not c Is Nothing Then
            firstAddress = c.Address
            Do
                ws.Cells(c.Row, FLAG_COLUMN).Value = 1
                Set c = .FindNext(c)
                If c Is Nothing Then Exit Do
                ' FindNext wraps around to the first hit; stop there instead
                ' of looping forever.
            Loop While c.Address <> firstAddress
        End If
        ' No match needs no branch of its own: the loop below writes the zeros.
    End With

    For i = FIRST_ROW To LAST_ROW
        If ws.Cells(i, FLAG_COLUMN).Value <> 1 Then
            ws.Cells(i, FLAG_COLUMN).Value = 0
        End If
    Next i
End Sub
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement