Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #Lab1: Scraping
- # using Jupyter Notebook
- #WarmUp Mode (2 points)
- import requests
- from bs4 import BeautifulSoup
- import pandas as pd
# Root of the category listing; numbered listing pages live under "page/".
add_to_cart_base_url = "https://clevershop.mk/product-category/mobilni-laptopi-i-tableti/"
base_url = add_to_cart_base_url + "page/"
def extract_product(product):
    """Build a flat record for one product card of the category listing.

    Args:
        product: A BeautifulSoup element for a single product card.

    Returns:
        dict with the keys ``ProductTitle``, ``ProductRegularPrice``,
        ``ProductDiscountPrice``, ``ProductURL`` and ``AddToCartURL``.
        A value whose element is absent from the card is ``None``, except
        ``ProductDiscountPrice``, which falls back to ``0``.
    """
    # Function-scope import keeps this block self-contained (stdlib only).
    from urllib.parse import urljoin

    # select_one() returns None when nothing matches, so every lookup is
    # guarded instead of chaining straight into .text / .get().
    title_tag = product.select_one('.wd-entities-title')
    title = title_tag.text.strip() if title_tag is not None else None

    link_tag = product.select_one('.product-image-link')
    product_URL = link_tag.get("href") if link_tag is not None else None

    cart_tag = product.select_one('.add_to_cart_button')
    cart_href = cart_tag.get("href") if cart_tag is not None else None
    # urljoin resolves a relative href against the category URL and leaves
    # an already-absolute href intact; plain concatenation would glue two
    # absolute URLs together.
    add_to_cart = urljoin(add_to_cart_base_url, cart_href) if cart_href else None

    # The first matched amount is taken as the regular price and the second,
    # when present, as the discounted one.
    price_texts = [
        tag.text.strip()
        for tag in product.select('.woocommerce-Price-amount')
    ]
    regular_price = price_texts[0] if price_texts else None
    discount_price = price_texts[1] if len(price_texts) > 1 else None
    if not discount_price:
        # A missing or empty discount is recorded as 0.
        discount_price = 0

    return {
        "ProductTitle": title,
        "ProductRegularPrice": regular_price,
        "ProductDiscountPrice": discount_price,
        "ProductURL": product_URL,
        "AddToCartURL": add_to_cart,
    }
# Collect one record per product card across listing pages 1..14.
all_products = []
for i in range(1, 15):
    url = base_url + str(i)
    # requests has no default timeout; without one a stalled connection
    # would block the whole run indefinitely.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error response is not a product listing, so do not parse it.
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    products = soup.select('.product')
    for product in products:
        result = extract_product(product)
        all_products.append(result)

df = pd.DataFrame(all_products)
df
df.to_csv('Lab1_201182_WarmUp.csv', index=False)
- #-----------------------------------------------------
- #Run Mode (8 points)
- !pip install selenium
- from selenium import webdriver
- from selenium.webdriver.common.by import By
- import pandas as pd
- import matplotlib.pyplot as plt
- import seaborn as sns
- import plotly.express as px
- import statistics
- import time
# Command-line switches handed to Chrome when the driver starts it.
options = webdriver.ChromeOptions()
for _flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(_flag)
browser = webdriver.Chrome(options=options)

# Listing URL prefix; the page number is appended to it.
base_url = "https://sandbox.oxylabs.io/products?page="
def extract_product(product):
    """Read name, price, rating and availability from one product card.

    Args:
        product: Selenium element for a single product card.

    Returns:
        dict with ``ProductName`` (str), ``ProductPrice`` (float),
        ``ProductRating`` (float) and ``ProductAvailability`` (str).
    """
    name = product.find_element(By.TAG_NAME, "h4").text
    price_text = product.find_element(By.CLASS_NAME, 'price-wrapper').text

    # The rating is the number of <svg> elements inside the rating container.
    rating_box = product.find_element(By.CLASS_NAME, 'rating')
    star_count = len(rating_box.find_elements(By.TAG_NAME, "svg"))

    # NOTE(review): fixed one-second pause per card; its purpose is not
    # evident from this code -- confirm before changing or removing it.
    time.sleep(1)
    availability = product.find_element(By.CSS_SELECTOR, 'p:last-of-type').text

    # Drop the trailing character (presumably a currency sign -- verify) and
    # switch the decimal comma to a dot so float() can parse the value.
    numeric_price = float(price_text[:-1].replace(',', '.'))

    return {
        "ProductName": name,
        "ProductPrice": numeric_price,
        "ProductRating": float(star_count),
        "ProductAvailability": availability,
    }
# Scrape the listing (currently only page 1) and plot the price distribution.
all_products = []
try:
    for i in range(1, 2):
        url = base_url + str(i)
        browser.get(url)
        products = browser.find_elements(By.CLASS_NAME, 'product-card')
        for product in products:
            result = extract_product(product)
            all_products.append(result)
finally:
    # Always shut the headless Chrome session down, even when scraping fails;
    # otherwise the browser process keeps running after the script is done.
    # The collected records are plain dicts, so they stay usable afterwards.
    browser.quit()

df = pd.DataFrame(all_products)
df

# Histogram of the scraped prices in 12 bins.
plt.figure(figsize=(10, 7))
plt.hist(df['ProductPrice'], bins=12)
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement