Advertisement
GeorgiLukanov87

scrapy-that-shoe

Feb 6th, 2025 (edited)
49
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.62 KB | None | 0 0
  # scrapy-that-shoe: scrape one Academy.com product page into product_data.json.
import json
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium_stealth import stealth

# Path to ChromeDriver
chrome_driver_path = "chromedriver-win64/chromedriver.exe"
# My Chrome version: 133.0.6943.54
# ChromeDriver Download link: https://storage.googleapis.com/chrome-for-testing-public/133.0.6943.53/win64/

# Words the site appends to a swatch label to describe its state; they are
# not part of the colour name and are removed from every label.
_SWATCH_STATE_WORDS = ("selected", "unavailable", "Clearance")


def _parse_number(text, cast, default):
    """Convert scraped text such as "$1,299.99" or "(1,234)" using *cast*.

    Surrounding whitespace and parentheses, a leading "$" and thousands
    separators are removed before conversion.  When the remainder is not a
    valid number a warning is printed and *default* is returned, so a single
    oddly formatted field does not abort the whole scrape.
    """
    cleaned = text.strip().strip("()").strip().lstrip("$").replace(",", "")
    try:
        return cast(cleaned)
    except ValueError:
        print(f"Could not parse {text!r} as a number; using {default}.")
        return default


# Browser configuration
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--incognito")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-geolocation")  # Disable location requests

# Start WebDriver
service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=options)

# Everything that touches the live browser runs inside the try block so the
# Chrome process is always shut down by the finally clause.
try:
    # Apply stealth settings
    stealth(driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )

    # Fake geolocation (San Francisco).  The override is installed before
    # navigating so that it is already active while the page loads.
    driver.execute_cdp_cmd("Emulation.setGeolocationOverride", {
        "latitude": 37.7749,
        "longitude": -122.4194,
        "accuracy": 100
    })

    # Load the page
    url = "https://www.academy.com/p/nike-womens-court-legacy-next-nature-shoes"
    # url = "https://www.academy.com/p/nike-womens-air-max-sc-se-running-shoes-139654084?sku=dark-red-10-b"
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

    # Check for the press-and-hold CAPTCHA button.  This step is best-effort:
    # any failure is reported and scraping continues with whatever was loaded.
    try:
        captcha_button = driver.find_element(By.ID, "px-captcha")  # ID may change
        action = ActionChains(driver)
        action.click_and_hold(captcha_button).perform()
        time.sleep(15)  # Hold button for CAPTCHA
        action.release().perform()
        print("CAPTCHA successfully bypassed.")
    except Exception as e:
        print(f"CAPTCHA not found or already bypassed. Error: {e}")

    # Parse the rendered HTML
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Product name: the part of <title> before the first "|"
    product_name_element = soup.find('title')
    product_name = product_name_element.text.split('|')[0].strip() if product_name_element else "No name"

    # Price
    price_element = soup.find('span', class_='pricing nowPrice lg')
    price = _parse_number(price_element.text, float, 0.0) if price_element else 0.0

    # Number of reviews (rendered as "(N)")
    reviews_count_element = soup.find('button', class_='ratingCount linkBtn focusable smallLink')
    reviews_count = _parse_number(reviews_count_element.text, int, 0) if reviews_count_element else 0

    # Average rating
    average_rating_element = soup.find('span', class_='ratingAvg textCaption')
    average_rating = _parse_number(average_rating_element.text, float, 0.0) if average_rating_element else 0.0

    # Extract available colors from the swatch buttons
    available_colours = []
    for button in soup.find_all('button', class_='buttonWrapper--S9sgu'):
        label = button.get('aria-label', '').strip() or button.get('id', '').strip()
        for state_word in _SWATCH_STATE_WORDS:
            label = label.replace(state_word, "")
        label = label.strip()
        # Skip labels that consisted only of state words.
        if label:
            available_colours.append(label)

    # Set the first available color as the main color.  With no swatch
    # buttons, fall back to the swatch-name label:
    # pdpContentWrapper > section:nth-child(2) > div > div.headerWrapper--ycKHq > div > div > span > span.swatchName--KWu4Q
    if available_colours:
        main_colour = available_colours[0]
    else:
        swatch_name_element = soup.select_one(".swatchName--KWu4Q")
        # select_one returns None when nothing matches; keep the placeholder then.
        if swatch_name_element is not None:
            main_colour = swatch_name_element.text.strip()
        else:
            main_colour = "No color"

    # Create product data dictionary
    product_data = {
        "name": product_name,
        "price": price,
        "colour": main_colour,
        "availableColours": available_colours[1:],
        "reviews_count": reviews_count,
        "reviews_score": average_rating
    }

    # Save data to a JSON file
    with open('product_data.json', 'w', encoding='utf-8') as f:
        json.dump(product_data, f, indent=4)

    print("Data successfully saved to product_data.json!")
    print(json.dumps(product_data, indent=4))

except Exception as e:
    print(f"Error: {e}")

finally:
    driver.quit()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement