Advertisement
dzocesrce

[VNP] Scraping Web Sites

Nov 8th, 2024
145
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.27 KB | None | 0 0
  1. #Lab1: Scraping
  2. # using Jupyter Notebook
  3. #WarmUp Mode (2 points)
  4.  
  5. import requests
  6. from bs4 import BeautifulSoup
  7. import pandas as pd
  8.  
  9. base_url = "https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/page/"
  10. add_to_cart_base_url ="https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/"
  11.  
  12. def extract_product(product):
  13.     title = product.select_one('.wd-entities-title').text.strip()
  14.     product_URL = product.select_one('.product-image-link').get("href")
  15.     add_to_cart = add_to_cart_base_url + product.select_one('.add_to_cart_button').get("href")
  16.     prices = product.select('.woocommerce-Price-amount')
  17.  
  18.     if prices:
  19.         regular_price = prices[0].text.strip()
  20.         discount_price = prices[1].text.strip() if len(prices) > 1 else None
  21.  
  22.         if discount_price is None or discount_price == "":
  23.             discount_price = 0
  24.  
  25.         else:
  26.             regular_price = None
  27.             discount_price = 0
  28.  
  29.         product_dict = {
  30.             "ProductTitle": title,
  31.             "ProductRegularPrice": regular_price,
  32.             "ProductDiscountPrice": discount_price,
  33.             "ProductURL": product_URL,
  34.             "AddToCartURL": add_to_cart
  35.         }
  36.  
  37.         return product_dict
  38.  
  39. all_products = []
  40. for i in range(1,15):
  41.     url = base_url + str(i)
  42.     response = requests.get(url)
  43.     soup = BeautifulSoup(response.text, "html.parser")
  44.     products = soup.select('.product')
  45.    
  46.     for product in products:
  47.         result = extract_product(product)
  48.         all_products.append(result)
  49.  
  50. df = pd.DataFrame(all_products)
  51. df
  52. df.to_csv('Lab1_201182_WarmUp.csv', index=False)
  53.  
  54. #-----------------------------------------------------
  55.  
  56. #Run Mode (8 points)
  57.  
  58. !pip install selenium
  59.  
  60. from selenium import webdriver
  61. from selenium.webdriver.common.by import By
  62. import pandas as pd
  63. import matplotlib.pyplot as plt
  64. import seaborn as sns
  65. import plotly.express as px
  66. import statistics
  67. import time
  68.  
  69. options = webdriver.ChromeOptions()
  70. options.add_argument('--headless')
  71. options.add_argument('--no-sandbox')
  72. options.add_argument('--disable-dev-shm-usage')
  73.  
  74. browser = webdriver.Chrome(options=options)
  75.  
  76. base_url = "https://sandbox.oxylabs.io/products?page="
  77.  
  78. def extract_product(product):
  79.     name = product.find_element(By.TAG_NAME,"h4").text
  80.     price = product.find_element(By.CLASS_NAME,'price-wrapper').text
  81.     div = product.find_element(By.CLASS_NAME,'rating')
  82.     svg_list = div.find_elements(By.TAG_NAME,"svg")
  83.     rating = len(svg_list)
  84.     #rating = product.find_element(By.CLASS_NAME,'rating').text
  85.     time.sleep(1)
  86.     availability = product.find_element(By.CSS_SELECTOR,'p:last-of-type').text
  87.    
  88.     product_dict={
  89.         "ProductName":name,
  90.         "ProductPrice":float(price[:-1].replace(',','.')),
  91.         "ProductRating":float(rating),
  92.         "ProductAvailability":availability,
  93.        
  94.     }
  95.    
  96.     return product_dict
  97.  
  98. all_products = []
  99. for i in range(1,2):
  100.     url = base_url + str(i)
  101.     browser.get(url)
  102.     products = browser.find_elements(By.CLASS_NAME,'product-card')
  103.     for product in products:
  104.         result= extract_product(product)
  105.         all_products.append(result)
  106.  
  107. df = pd.DataFrame(all_products)
  108. df
  109.  
  110. plt.figure(figsize=(10,7))
  111. plt.hist(df['ProductPrice'],bins=12)
  112. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement