Advertisement
Zgragselus

dunder.py

Feb 19th, 2023
869
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.64 KB | None | 0 0
  1. import logging
  2. from multiprocessing.resource_sharer import stop
  3. from bs4 import BeautifulSoup
  4. import requests
  5. import re
  6. import psycopg2
  7. from psycopg2 import sql
  8. import time
  9. import requests
  10. import json
  11. import time
  12. from api_connection_test import open_api_connection
  13. from api_connection_test import api_send_batch
  14.  
  15.  
  16. def get_pages(doc):
  17.     pages = doc.find(attrs={"class":'page-numbers'})
  18.     end_page = pages.find(class_="next page-numbers")
  19.     last_page = end_page.find_previous("a").text
  20.  
  21.     return int(last_page)
  22.  
  23.  
  24. def create_list(products,name,price,link,volume,stock,product_type,bottler):
  25.     prod_dict={"name": name,"link": link,"price": price,"currency_id": 32,"size":volume,"stock":stock,"product_type":product_type,"bottler":bottler,"auction":'False'}
  26.     products.append(prod_dict)
  27.  
  28.  
  29.  
  30.  
  31. def scrap_data(base_url,json_item_list):
  32.     page = 1
  33.     max_pages=1
  34.     headers = {
  35.     'Accept': '*/*',
  36.     'Accept-Encoding':'gzip,deflate',  
  37.     'User-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)',
  38.     'referer': 'https://www.google.com/',
  39.     # 'DNT':'1'
  40.     }
  41.     while page <= max_pages:
  42.         #scrapper sleep between requests
  43.         #time.sleep(11)
  44.        
  45.         #set url with paging
  46.         url = f'{base_url}kategoria/rum/page/{page}/?currency=EUR'
  47.         print(f'Processing page:{page}')
  48.         result = requests.get(url,headers=headers)
  49.         if int(result.status_code) > 202:
  50.             print('Shit issue with loading page')
  51.             print(result.status_code)
  52.             return
  53.         doc = BeautifulSoup(result.text,'html5lib')
  54.         #update maximum pages - check only once
  55.         if max_pages <= 1:
  56.             max_pages=get_pages(doc)
  57.  
  58.         products = doc.find_all(class_=re.compile('product__info_box'))
  59.         print(f'Product per page:{len(products)}')
  60.         for product in products:
  61.  
  62.             name = product.find(class_='product__name').a['title']
  63.             link = product.find(class_='product__name').a['href']
  64.             try:
  65.                 volume = product.find(class_='product__capacity').find('span').text
  66.                 volume = list(filter(str.isdigit, volume))
  67.                 volume = int(''.join(volume)) * 100
  68.             except:
  69.                 volume = 0
  70.                 print("error volume")
  71.  
  72.             try:
  73.                 voltage =  product.find(class_='product__capacity').find_all('span')[1].text.replace("\n", "").replace("\t", "")
  74.                 #voltage = list(filter(str.isdigit, volume))
  75.                 #volume = int(''.join(volume)) * 100
  76.             except:
  77.                 print("error voltage")
  78.  
  79.             try:
  80.                 price =  product.find(class_='woocommerce-Price-amount amount').text.replace(".","")
  81.                 price = re.sub(r'[^0-9'+ ','+'.'+r']+', '', str(price).replace(",","."))
  82.                 price = float(price)
  83.             except:
  84.                 print("error price")
  85.  
  86.             bottler = product.find(class_='product__excerpt').text.replace("\n", "").replace("\t", "")
  87.  
  88.             product_type = product.find(class_='product__category').text.replace("\n", "").replace("\t", "")
  89.  
  90.             stock = 1          #!!!!!!!!! needs to be fixed in future ... they using css to grey unaviable products
  91.  
  92.  
  93.  
  94.             create_list(json_item_list,name,price,link,volume,stock,product_type,bottler)
  95.  
  96.         page += 1
  97.          
  98.  
  99.  
  100.  
  101.  
  102. if __name__ == "__main__":
  103.    base_url='https://dunder.store/'
  104.    eshop_id=5
  105.    #open_page(url)
  106.    token=open_api_connection()
  107.    data = []
  108.    scrap_data(base_url,data)
  109.    print(data)
  110.    api_send_batch(token,eshop_id,data)
  111.    #conn.commit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement