# MdC

import os
import subprocess
import mimetypes
import time
import re

# Leading "magic" bytes of common image formats, mapped to the extension to use.
_IMAGE_SIGNATURES = (
    (b"\xff\xd8\xff", ".jpg"),
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"BM", ".bmp"),
    (b"II*\x00", ".tiff"),
    (b"MM\x00*", ".tiff"),
)


def _sniff_image_extension(path):
    """Return the extension implied by the file's leading bytes, or None if unrecognized."""
    with open(path, "rb") as handle:
        header = handle.read(16)

    for signature, extension in _IMAGE_SIGNATURES:
        if header.startswith(signature):
            return extension

    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return ".webp"
    return None


def _remove_quietly(path):
    """Delete *path* if present; a cleanup failure must not mask the original error."""
    try:
        os.remove(path)
    except OSError:
        pass


# Function to download the file from the given URL, detect its type, and save it with the correct file extension
def download_image_with_correct_extension(image_url, output_dir, filename_without_ext, row_num):
    """Download *image_url* with curl and store it in *output_dir* under a fitting extension.

    The payload is first written to ``<filename_without_ext>.temp``. Its type is
    then determined from the file content (magic bytes); if that is inconclusive,
    an ``image/*`` Content-Type reported by curl is used; otherwise ``.jpg`` is
    the fallback. The temp file is finally moved to its permanent name.

    Args:
        image_url: URL handed to curl.
        output_dir: Existing directory that receives the file.
        filename_without_ext: Base name of the saved file, without extension.
        row_num: Row number of the originating command; used only in messages.

    Returns:
        True when the file was downloaded and renamed, False on any failure.
        Failures are printed, never raised, and no ``.temp`` file is left behind.
    """
    temp_output_file = os.path.join(output_dir, f"{filename_without_ext}.temp")

    # An argument list (no shell) keeps quotes or metacharacters in the URL or
    # filename from being interpreted as shell syntax.
    curl_command = [
        "curl",
        "--location",
        # Without --fail an HTTP error page would be saved as if it were the image.
        "--fail",
        "--output", temp_output_file,
        # Makes curl print the response Content-Type on stdout after the transfer.
        "--write-out", "%{content_type}",
        "--url", image_url,
    ]

    try:
        result = subprocess.run(curl_command, capture_output=True, text=True)

        # If curl fails (returns a non-zero status), we log an error
        if result.returncode != 0:
            print(f"Failed to download {image_url} at row {row_num}")
            _remove_quietly(temp_output_file)
            return False

        # mimetypes.guess_type() only inspects the file *name*, which always ends
        # in ".temp" here, so the type has to come from the content instead.
        extension = _sniff_image_extension(temp_output_file)
        if extension is None:
            content_type = result.stdout.split(";", 1)[0].strip().lower()
            if content_type.startswith("image/"):
                # May still be None for MIME types unknown to mimetypes.
                extension = mimetypes.guess_extension(content_type)
        if not extension:
            # If the type cannot be determined, default to .jpg as a fallback
            extension = '.jpg'

        final_output_file = os.path.join(output_dir, f"{filename_without_ext}{extension}")
        # os.replace overwrites an existing target on every platform.
        os.replace(temp_output_file, final_output_file)

        print(f"Downloaded and saved {final_output_file}")
        return True

    # Per-row boundary: any error (curl missing, I/O failure, ...) is reported and
    # turned into a False result so the caller can continue with the next row.
    except Exception as e:
        print(f"Exception occurred while downloading {image_url} at row {row_num}: {e}")
        _remove_quietly(temp_output_file)
        return False

# Function to iterate through the extracted curl commands and download images accordingly
def download_images(output_file, output_dir, log_file, delay_seconds=2):
    """Download every image referenced by the curl commands listed in *output_file*.

    Each non-blank line is expected to look like
    ``curl -o "<name>[.<ext>]" ... "https://drive.google.com/open?id=<id>"``.
    The name (without extension) and the URL are extracted and passed to
    ``download_image_with_correct_extension``. Lines that do not match are
    reported as malformed; blank lines are ignored.

    Args:
        output_file: Path of the text file holding one curl command per line.
        output_dir: Directory for the downloaded images; created if missing.
        log_file: Path that receives one line per failed or malformed row.
            It is only written when at least one failure occurred.
        delay_seconds: Pause between two consecutive download attempts, to
            avoid hammering the server. Defaults to 2.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Compiled once instead of being looked up for every row.
    filename_pattern = re.compile(r'curl -o "([\w_\-\d]+)(\.[\w]+)?"')
    url_pattern = re.compile(r'"(https://drive\.google\.com/open\?id=[\w\d_-]+)"')

    # List to keep track of failed downloads
    failed_downloads = []
    download_attempted = False

    with open(output_file, 'r') as file:
        # Row numbers follow the physical line numbers of the input file.
        for row_num, raw_line in enumerate(file, start=1):
            command = raw_line.strip()
            if not command:
                # A blank line (e.g. a trailing newline) is not a malformed command.
                continue

            filename_match = filename_pattern.search(command)
            image_url_match = url_pattern.search(command)

            if not (filename_match and image_url_match):
                # If the curl command is malformed, log the issue
                print(f"Skipping malformed row {row_num}: {command}")
                failed_downloads.append(f"Row {row_num}: Malformed command")
                continue

            filename_without_ext = filename_match.group(1)
            image_url = image_url_match.group(1)

            # Throttle only between real requests: no pause before the first
            # download, after the last one, or for rows that never hit the server.
            if download_attempted and delay_seconds > 0:
                time.sleep(delay_seconds)
            download_attempted = True

            success = download_image_with_correct_extension(
                image_url, output_dir, filename_without_ext, row_num
            )

            # If the download fails, log the failed URL
            if not success:
                failed_downloads.append(f"Row {row_num}: {image_url}")

    # Log any failed downloads to the log file
    if failed_downloads:
        with open(log_file, 'w') as log:
            log.writelines(f"{failure}\n" for failure in failed_downloads)
        print(f"Failed downloads logged in {log_file}")
    else:
        # If all images were downloaded successfully, print a success message
        print("All images downloaded successfully!")

if __name__ == "__main__":
    # Input file: one extracted curl command per line. This is the file that
    # download_images() reads.
    # NOTE(review): presumably produced from "bases.txt" by a separate extraction
    # step that is not part of this script — confirm.
    output_file = "extracted_commands.txt"

    # Directory to save the downloaded images
    output_directory = "./downloaded_images"
    # Log file to record any failed downloads
    log_file_path = "./failed_downloads.log"

    # Start downloading images based on the extracted curl commands
    download_images(output_file, output_directory, log_file_path)