# Instagram_scrape.py

import time
import json
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

# Script flow: prompt for a target profile, log in to Instagram through a
# headless Chrome session, save the profile summary and the first few posts
# to JSON files, and always shut the browser down afterwards.

# Imported here so this section is self-contained; used to read the password
# without echoing it to the terminal.
from getpass import getpass

CHROMEDRIVER_PATH = "/usr/local/bin/chromedriver"  # Correct path to your chromedriver
WAIT_SECONDS = 10  # Upper bound, in seconds, for every explicit wait below
MAX_POSTS = 5  # Number of posts opened from the profile grid


def _wait_for(browser, locator, timeout=WAIT_SECONDS):
    """Block until *locator* is present in the DOM and return the element.

    Args:
        browser: The WebDriver instance to poll.
        locator: A ``(By.<strategy>, value)`` tuple.
        timeout: Maximum number of seconds to wait.

    Raises:
        selenium.common.exceptions.TimeoutException: If the element does not
            appear within *timeout* seconds.
    """
    return WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located(locator)
    )


def _text_or_none(root, xpath):
    """Return the text of the first element matching *xpath*, or None.

    Uses a single ``find_elements`` call, so a missing element yields None
    instead of raising.
    """
    matches = root.find_elements(By.XPATH, xpath)
    return matches[0].text if matches else None


def _save_json(path, payload):
    """Write *payload* to *path* as indented JSON."""
    with open(path, "w", encoding="utf-8") as outfile:
        json.dump(payload, outfile, indent=4)


# Prompt user for Instagram username
username = input("Enter the Instagram username you want to scrape: ").strip()
if not username:
    # Bail out before a browser is started: an empty name would point the
    # profile URL at the Instagram home page.
    raise SystemExit("No Instagram username entered.")

# Set up Chrome options for headless operation
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")

# Specify path to your ChromeDriver
service = Service(CHROMEDRIVER_PATH)

# Initialize WebDriver
driver = webdriver.Chrome(service=service, options=chrome_options)

# Everything that uses the browser runs inside try/finally so the Chrome and
# chromedriver processes are released even when a wait times out or an
# element lookup fails.
try:
    # Instagram login URL
    login_url = "https://www.instagram.com/accounts/login/"

    # Go to the Instagram login page and wait until the login form is present
    driver.get(login_url)
    _wait_for(driver, (By.NAME, "username"))

    # Prompt user for Instagram credentials; the password is not echoed
    insta_username = input("Enter your Instagram username: ")
    insta_password = getpass("Enter your Instagram password: ")

    # Log in to Instagram
    driver.find_element(By.NAME, "username").send_keys(insta_username)
    password_field = driver.find_element(By.NAME, "password")
    password_field.send_keys(insta_password)
    password_field.send_keys(Keys.RETURN)

    # The '/accounts/edit/' link is used as the signal that login completed
    _wait_for(driver, (By.XPATH, "//a[contains(@href, '/accounts/edit/')]"))

    print("Login successful.")

    # Navigate to the user's profile and wait for the header to load
    driver.get(f"https://www.instagram.com/{username}/")
    _wait_for(driver, (By.XPATH, "//header//section//h2"))

    # Scrape the profile details (bio, name, followers, following, posts
    # count). Only the bio is optional; the other lookups raise
    # NoSuchElementException when their element is missing.
    profile_data = {
        'username': username,
        'bio': _text_or_none(driver, "//div[@class='-vDIg']//span"),
        'full_name': driver.find_element(By.XPATH, "//h1").text,
        'followers': driver.find_element(
            By.XPATH, "//a[contains(@href,'/followers')]//span"
        ).text,
        'following': driver.find_element(
            By.XPATH, "//a[contains(@href,'/following')]//span"
        ).text,
        'posts': driver.find_element(By.XPATH, "//span[@class='-nal3']//span").text,
    }

    # Save profile data to a JSON file
    _save_json(f"{username}_profile_data.json", profile_data)

    print(f"Profile data for {username} has been saved to {username}_profile_data.json.")

    # Scrape the latest posts from the profile (up to MAX_POSTS)
    posts_data = []
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # Let the page load more posts

    post_elements = driver.find_elements(By.XPATH, "//div[@class='v1Nh3 kIKUG  _bz0w']")
    for post in post_elements[:MAX_POSTS]:
        post.click()
        time.sleep(1)

        # Wait for post details to load
        _wait_for(driver, (By.XPATH, "//article//header//div[@class='tD3Xx']"))

        # Text fields fall back to None so that one post lacking a caption,
        # like counter or comment list does not abort the run and discard the
        # posts already collected.
        post_data = {
            'caption': _text_or_none(driver, "//div[@class='C4VMK']/span"),
            'likes': _text_or_none(driver, "//button[@class='wpO6b']"),
            'comments': _text_or_none(driver, "//ul[@class='Mr508']"),
        }

        # Scrape the post image/video URL: prefer an <img>, else use <video>
        media = driver.find_element(By.XPATH, "//div[@class='KL4Bh']")
        images = media.find_elements(By.TAG_NAME, "img")
        media_element = images[0] if images else media.find_element(By.TAG_NAME, "video")
        post_data['media_url'] = media_element.get_attribute("src")

        posts_data.append(post_data)

        # Close the post modal
        driver.find_element(By.XPATH, "//div[@class='Igw0E IwRSH eGOV_ _4EzTm']").click()
        time.sleep(1)

    # Save post data to a JSON file
    _save_json(f"{username}_posts_data.json", posts_data)

    print(f"Post data for {username} has been saved to {username}_posts_data.json.")
finally:
    # Close the browser
    driver.quit()