Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import logging
- from multiprocessing.resource_sharer import stop
- from bs4 import BeautifulSoup
- import requests
- import re
- import psycopg2
- from psycopg2 import sql
- import time
- import requests
- import json
- import time
- from api_connection_test import open_api_connection
- from api_connection_test import api_send_batch
def get_pages(doc):
    """Return the number of the last listing page advertised in *doc*.

    The page count is read from the link that immediately precedes the
    "next" pagination link.

    Args:
        doc: Parsed page (a BeautifulSoup document or any object exposing
            the same ``find`` API).

    Returns:
        The last page number as an int, or 1 when the page has no
        pagination block or no "next" link (i.e. there is only one page,
        or *doc* is already the last page).

    Raises:
        ValueError: If the link text before the "next" link is not a number.
    """
    pages = doc.find(attrs={"class": 'page-numbers'})
    if pages is None:
        # No pagination block at all: the listing fits on a single page.
        return 1

    end_page = pages.find(class_="next page-numbers")
    if end_page is None:
        # No "next" link, so there is nothing after the current page.
        return 1

    last_link = end_page.find_previous("a")
    if last_link is None:
        return 1
    return int(last_link.text)
def create_list(products, name, price, link, volume, stock, product_type, bottler):
    """Append one product record to *products* (modified in place).

    The record is a dict with a fixed ``currency_id`` of 32 and the
    ``auction`` flag set to the string ``'False'``; *volume* is stored
    under the ``size`` key. Nothing is returned.
    """
    products.append(
        {
            "name": name,
            "link": link,
            "price": price,
            "currency_id": 32,
            "size": volume,
            "stock": stock,
            "product_type": product_type,
            "bottler": bottler,
            "auction": 'False',
        }
    )
def _parse_volume(product):
    """Return the size parsed from *product*'s capacity box, or 0 on failure.

    All digits of the first ``<span>`` inside the ``product__capacity``
    element are joined into one integer and multiplied by 100
    (presumably turning a value such as "0,7 l" into millilitres -- TODO
    confirm against the shop's markup).
    """
    try:
        capacity_text = product.find(class_='product__capacity').find('span').text
        digits = ''.join(filter(str.isdigit, capacity_text))
        return int(digits) * 100
    except (AttributeError, ValueError):
        # AttributeError: element missing; ValueError: no digits in the text.
        print("error volume")
        return 0


def _parse_price(product):
    """Return *product*'s listed price as a float, or None if unparsable.

    Dots are dropped and the comma becomes the decimal point, then every
    character other than digits, commas and dots is stripped.
    """
    try:
        raw_price = product.find(class_='woocommerce-Price-amount amount').text
        normalized = raw_price.replace(".", "").replace(",", ".")
        return float(re.sub(r'[^0-9,.]+', '', normalized))
    except (AttributeError, ValueError):
        # AttributeError: price element missing; ValueError: nothing numeric left.
        return None


def scrap_data(base_url, json_item_list):
    """Scrape every page of the rum category and collect the products.

    Pages are fetched from ``{base_url}kategoria/rum/page/{n}/?currency=EUR``
    starting at page 1; the total page count is read from the first page
    via ``get_pages``. Each product is appended to *json_item_list* through
    ``create_list``.

    Products whose price cannot be parsed are skipped (after printing
    "error price") so that an undefined or stale price from the previous
    product is never recorded. A volume that cannot be parsed is stored as 0.

    Args:
        base_url: Shop root URL, including the trailing slash.
        json_item_list: List that receives the product dicts (mutated in place).

    Returns:
        None. If a page answers with an HTTP status above 202 the function
        returns early; items collected so far remain in *json_item_list*.
    """
    page = 1
    max_pages = 1
    request_timeout = 30  # seconds; without it a stalled server hangs the run
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip,deflate',
        'User-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
        'referer': 'https://www.google.com/',
    }

    while page <= max_pages:
        url = f'{base_url}kategoria/rum/page/{page}/?currency=EUR'
        print(f'Processing page:{page}')
        result = requests.get(url, headers=headers, timeout=request_timeout)
        if result.status_code > 202:
            print('Shit issue with loading page')
            print(result.status_code)
            return

        doc = BeautifulSoup(result.text, 'html5lib')

        # The page count is only looked up while it is still the initial 1.
        if max_pages <= 1:
            max_pages = get_pages(doc)

        products = doc.find_all(class_=re.compile('product__info_box'))
        print(f'Product per page:{len(products)}')

        for product in products:
            name_link = product.find(class_='product__name').a
            name = name_link['title']
            link = name_link['href']

            volume = _parse_volume(product)

            price = _parse_price(product)
            if price is None:
                print("error price")
                continue

            bottler = product.find(class_='product__excerpt').text.replace("\n", "").replace("\t", "")
            product_type = product.find(class_='product__category').text.replace("\n", "").replace("\t", "")
            # TODO: stock is hard-coded; the shop greys out unavailable
            # products with CSS, which is not detected yet.
            stock = 1
            create_list(json_item_list, name, price, link, volume, stock, product_type, bottler)

        page += 1
if __name__ == "__main__":
    # Shop to scrape and the shop id passed along with the upload.
    shop_url = 'https://dunder.store/'
    shop_id = 5

    # Obtain the API token first, then scrape, print and upload the batch.
    api_token = open_api_connection()
    scraped_items = []
    scrap_data(shop_url, scraped_items)
    print(scraped_items)
    api_send_batch(api_token, shop_id, scraped_items)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement