Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import os
- import subprocess
- import mimetypes
- import time
- import re
# Leading "magic" bytes of common image formats, mapped to the extension to use.
_IMAGE_SIGNATURES = (
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"\xff\xd8\xff", ".jpg"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"BM", ".bmp"),
    (b"II*\x00", ".tiff"),
    (b"MM\x00*", ".tiff"),
)


def _sniff_image_extension(path, default=".jpg"):
    """Return the extension matching the leading bytes of ``path``, else ``default``."""
    with open(path, "rb") as handle:
        header = handle.read(16)
    for signature, extension in _IMAGE_SIGNATURES:
        if header.startswith(signature):
            return extension
    # WebP is a RIFF container: "RIFF" <4-byte size> "WEBP".
    if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
        return ".webp"
    return default


def _remove_if_exists(path):
    """Best-effort removal of a leftover temporary file."""
    try:
        os.remove(path)
    except OSError:
        # Nothing to clean up (or it cannot be removed); the download has
        # already been reported as failed, so this is deliberately ignored.
        pass


# Download a file with curl, detect its type from its content, and save it
# under the matching file extension.
def download_image_with_correct_extension(image_url, output_dir, filename_without_ext, row_num):
    """Download ``image_url`` into ``output_dir`` and name it by detected type.

    The file is first written to ``<filename_without_ext>.temp`` and then moved
    to ``<filename_without_ext><ext>``, where ``<ext>`` is chosen from the
    file's leading bytes (PNG, JPEG, GIF, BMP, TIFF, WebP). Unrecognised
    content falls back to ``.jpg``.

    Args:
        image_url: URL handed to curl.
        output_dir: Existing directory that receives the file.
        filename_without_ext: Base name of the saved file, without extension.
        row_num: Row number of the originating command; used only in messages.

    Returns:
        True if the file was downloaded and renamed, False otherwise. Failures
        are printed, never raised.
    """
    temp_output_file = os.path.join(output_dir, f"{filename_without_ext}.temp")
    # Pass an argument list (no shell) so quotes or shell metacharacters in the
    # URL or file name cannot alter the command. --fail makes curl exit
    # non-zero on HTTP errors instead of saving the server's error page.
    curl_command = [
        "curl",
        "--location",
        "--fail",
        "--output", temp_output_file,
        "--url", image_url,
    ]
    try:
        result = subprocess.run(curl_command, capture_output=True)
        if result.returncode != 0:
            print(f"Failed to download {image_url} at row {row_num}")
            _remove_if_exists(temp_output_file)
            return False
        # mimetypes.guess_type() only inspects the file *name*, and the
        # temporary name always ends in ".temp", so it can never identify the
        # type; look at the downloaded bytes instead.
        extension = _sniff_image_extension(temp_output_file)
        final_output_file = os.path.join(output_dir, f"{filename_without_ext}{extension}")
        # os.replace overwrites an existing target on every platform, so
        # re-running the script does not fail on files from a previous run.
        os.replace(temp_output_file, final_output_file)
        print(f"Downloaded and saved {final_output_file}")
        return True
    except (OSError, subprocess.SubprocessError) as e:
        # Covers a missing curl executable and file-system errors.
        print(f"Exception occurred while downloading {image_url} at row {row_num}: {e}")
        _remove_if_exists(temp_output_file)
        return False
# Read the extracted curl commands, download each referenced image, and record
# every row that could not be processed.
def download_images(output_file, output_dir, log_file, delay=2):
    """Download every image referenced in a file of curl commands.

    Each non-blank line of ``output_file`` is expected to contain
    ``curl -o "<name>[.<ext>]"`` and a quoted
    ``https://drive.google.com/open?id=...`` URL. Matching rows are passed to
    ``download_image_with_correct_extension``; rows that do not match are
    reported as malformed. Blank lines are skipped.

    Args:
        output_file: Path of the text file holding one curl command per line.
        output_dir: Directory for the downloaded files; created if missing.
        log_file: Path that receives one line per failed row. It is written
            only when at least one row failed.
        delay: Seconds to wait after each download attempt.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(output_dir, exist_ok=True)

    # Compile once, outside the per-row loop.
    filename_pattern = re.compile(r'curl -o "([\w_\-\d]+)(\.[\w]+)?"')
    url_pattern = re.compile(r'"(https://drive\.google\.com/open\?id=[\w\d_-]+)"')

    failed_downloads = []

    # Undecodable bytes are replaced rather than raised; such a row simply
    # fails to match and is reported as malformed.
    with open(output_file, "r", encoding="utf-8", errors="replace") as file:
        # Row numbers count every line of the file, including blank ones, so
        # they match what an editor shows.
        for row_num, raw_line in enumerate(file, start=1):
            command = raw_line.strip()
            if not command:
                continue

            filename_match = filename_pattern.search(command)
            image_url_match = url_pattern.search(command)
            if not (filename_match and image_url_match):
                print(f"Skipping malformed row {row_num}: {command}")
                failed_downloads.append(f"Row {row_num}: Malformed command")
                # No request was made, so there is nothing to throttle.
                continue

            filename_without_ext = filename_match.group(1)
            image_url = image_url_match.group(1)
            success = download_image_with_correct_extension(
                image_url, output_dir, filename_without_ext, row_num
            )
            if not success:
                failed_downloads.append(f"Row {row_num}: {image_url}")

            # Pause between requests to avoid hammering the server.
            if delay > 0:
                time.sleep(delay)

    if failed_downloads:
        with open(log_file, "w", encoding="utf-8") as log:
            log.writelines(f"{failure}\n" for failure in failed_downloads)
        print(f"Failed downloads logged in {log_file}")
    else:
        print("All images downloaded successfully!")
if __name__ == "__main__":
    # Text file with one extracted curl command per line; this is the input
    # that download_images() reads.
    # NOTE(review): presumably produced by a separate extraction step from a
    # raw source file — confirm where it comes from.
    output_file = "extracted_commands.txt"
    # Directory to save the downloaded images.
    output_directory = "./downloaded_images"
    # Log file that records any failed downloads.
    log_file_path = "./failed_downloads.log"
    # Download every image referenced by the extracted curl commands.
    download_images(output_file, output_directory, log_file_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement