# Parser_cont_test

import io
import parser
from datetime import datetime as dt
from unicodedata import normalize

import bs4
import lxml
import numpy as np
import openpyxl as pxl
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# --- Stage 1: walk the paginated listing and collect table rows + bank links.
#
# Module-level accumulators used by the rest of the script:
#   headers     - relative hrefs of the rows whose 4th cell reads 'отозв.'
#   df          - every table pandas finds on the listing pages, stacked
#   df_why      - filled later with the per-bank texts
#   dict_lin    - filled later: absolute bank URL -> [text]
#   for_iterate - raw text of the 4th cell of every listing row, in page order
headers = []
df = pd.DataFrame()
df_why = pd.DataFrame()
dict_lin = {}
for_iterate = []
# TODO: read the total number of pages from the site instead of hard-coding 20.

k = 0  # number of listing rows inspected so far (across all pages)
page_frames = []
for g in range(1, 21):
    URL_TEMPLATE = f"https://www.banki.ru/banks/memory/?PAGEN_1={g}"

    r = requests.get(URL_TEMPLATE, headers={'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'},
                     timeout=15)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = bs(r.text, "html.parser")

    # Parse the tables out of the HTML that was just downloaded rather than
    # letting pandas fetch the same URL a second time (without the headers and
    # timeout used above).
    df_temp = pd.read_html(io.StringIO(r.text))
    page_frames.extend(df_temp)

    vacancies_names = soup.find('table', class_="standard-table standard-table--row-highlight margin-bottom-default")
    if vacancies_names is None:
        raise RuntimeError(f"listing table not found on {URL_TEMPLATE}")

    # Read the reason cell and the bank link from the SAME <tr>, so a row with
    # an unexpected number of cells or without a <strong> link cannot shift
    # the pairing for every row that follows it.
    for row in vacancies_names.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < 4:
            # Rows without a 4th <td> (e.g. a header row) carry no reason.
            continue
        reason = cells[3].text
        for_iterate.append(reason)
        k += 1
        if reason.strip() != 'отозв.':
            continue
        name_tag = row.find('strong')
        link_tag = name_tag.a if name_tag is not None else None
        if link_tag is None or not link_tag.get('href'):
            print(f"page {g}: 'отозв.' row without a bank link, skipped")
            continue
        headers.append(link_tag['href'])

# One concat over all pages instead of re-copying the growing frame per table.
if page_frames:
    df = pd.concat(page_frames, axis=0)
stop = 0


# --- Stage 2: clean the stacked listing table.

# Give every row a unique positional label; the per-page frames that were
# stacked into df may repeat index labels.
df = df.reset_index(drop=True)

# Discard the rows whose 'причина' column equals 'ликв.'; rows with any other
# value (including missing values) are kept.
is_liquidation = df['причина'] == 'ликв.'
df = df.loc[~is_liquidation]

# Keep the first row for each 'номер лицензии' value, then renumber from 0.
df = df.drop_duplicates(subset=['номер лицензии']).reset_index(drop=True)

# --- Stage 3: download each collected bank page and store its description text.
#
# For every href in `headers`, dict_lin maps the absolute URL to a one-element
# list holding the concatenated text of all <dd class="margin-bottom-zero">
# elements on that page.
for link in headers:
    url_banki = f"https://www.banki.ru{link}"
    if url_banki in dict_lin:
        # Same bank listed more than once: its page is already stored, so skip
        # the extra request. (The old guard compared the relative href against
        # absolute-URL keys and therefore never matched.)
        continue

    # A timeout keeps one stalled connection from hanging the whole run.
    r_ = requests.get(url_banki, timeout=15)
    soup_ = bs(r_.text, "lxml")

    # Replace newlines and non-breaking spaces with plain spaces in each
    # fragment, then join the fragments in document order.
    fragments = [
        i.text.replace(u'\n', u' ').replace(u'\xa0', u' ')
        for i in soup_.find_all("dd", {"class": "margin-bottom-zero"})
    ]
    str_temp = ''.join(fragments)
    dict_lin[url_banki] = [str_temp]


# --- Stage 4: attach the downloaded texts to the listing table and export.
stop = 0

# Turn the {url: [text]} mapping into a two-column object array
# (column 0 = URL, column 1 = the list holding the text).
data = [(url, texts) for url, texts in dict_lin.items()]
an_array = np.array(data, dtype=object)
print(an_array)
df_why = pd.DataFrame(an_array)

# NOTE(review): axis=1 concat aligns the two frames on their index labels, so
# listing row N is paired with link N - confirm both are in the same order.
combined = pd.concat([df, df_why], axis=1)
df = (
    combined
    .set_index('Unnamed: 0')
    .rename(columns={0: 'link', 1: 'text'})
)

# index=False: the 'Unnamed: 0' index set above is not written to the sheet.
# NOTE(review): the file gets an .xlsm name but no macro is written here -
# confirm Excel opens it as expected.
df.to_excel("somth_12.xlsm", sheet_name='Sheet1', index=False, header=True)
#####vba


' Marks rows 2..754 of the first worksheet: column H is set to 1 when the text
' in column G contains the keyword capital(0), and to 0 otherwise.
Sub check()
    ' Keyword literals restored from mis-decoded cp1251 text
    ' (e.g. "êàïèòàë" -> "капитал") so they can match Cyrillic cell contents.
    Dim capital(3) As Variant
    capital(0) = "капитал"
    capital(1) = "резерв"
    capital(2) = "досозд"
    capital(3) = "недосозд"
    ' NOTE(review): only capital(0) is searched, exactly as before; confirm
    ' whether the other three keywords were meant to be searched as well.

    Dim ws As Worksheet
    Dim c As Range
    Dim firstAddress As String
    Dim i As Long

    ' Use one explicit sheet for both the search range and the flag column
    ' instead of mixing Worksheets(1) with the active sheet.
    Set ws = Worksheets(1)

    With ws.Range("G2:G754")
        ' LookAt:=xlPart - substring match; without it Excel reuses whatever
        ' setting the previous Find (or the Find dialog) left behind.
        Set c = .Find(What:=capital(0), LookIn:=xlValues, LookAt:=xlPart)
        If Not c Is Nothing Then
            firstAddress = c.Address
            Do
                ws.Cells(c.Row, 8).Value = 1

                Set c = .FindNext(c)
                If c Is Nothing Then Exit Do
                ' FindNext wraps around to the first hit, so stop once we are
                ' back where we started (a row-number test never ends).
            Loop While c.Address <> firstAddress
        End If
    End With

    ' Every row that was not flagged above gets an explicit 0.
    For i = 2 To 754
        If ws.Cells(i, 8).Value <> 1 Then
            ws.Cells(i, 8).Value = 0
        End If
    Next i

End Sub