Advertisement
A_God

MdC

Oct 15th, 2024
55
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.11 KB | None | 0 0
  1. import os
  2. import subprocess
  3. import mimetypes
  4. import time
  5. import re
  6.  
  7. # Function to download the file from the given URL, detect its MIME type, and save it with the correct file extension
  8. def download_image_with_correct_extension(image_url, output_dir, filename_without_ext, row_num):
  9.     # Create a temporary file path (with .temp extension), we'll rename this after determining the correct file type
  10.     temp_output_file = os.path.join(output_dir, f"{filename_without_ext}.temp")
  11.    
  12.     # Prepare the curl command to download the image from the URL
  13.     curl_command = f'curl -L -o "{temp_output_file}" "{image_url}"'
  14.    
  15.     try:
  16.         # Execute the curl command using subprocess to download the image
  17.         result = subprocess.run(curl_command, shell=True, capture_output=True, text=True)
  18.  
  19.         # If curl command fails (returns a non-zero status), we log an error
  20.         if result.returncode != 0:
  21.             print(f"Failed to download {image_url} at row {row_num}")
  22.             return False
  23.        
  24.         # Use the Python `mimetypes` module to detect the MIME type of the downloaded file
  25.         mime_type, _ = mimetypes.guess_type(temp_output_file)
  26.  
  27.         # If the MIME type is detected, we use it to determine the appropriate file extension
  28.         if mime_type:
  29.             extension = mimetypes.guess_extension(mime_type)
  30.         else:
  31.             # If no MIME type is detected, default to .jpg as a fallback
  32.             extension = '.jpg'
  33.  
  34.         # Create the final file path with the correct extension
  35.         final_output_file = os.path.join(output_dir, f"{filename_without_ext}{extension}")
  36.         # Rename the temp file to the new file name with the correct extension
  37.         os.rename(temp_output_file, final_output_file)
  38.  
  39.         print(f"Downloaded and saved {final_output_file}")
  40.         return True
  41.    
  42.     # Catch any exception during the file download or renaming process and log the error
  43.     except Exception as e:
  44.         print(f"Exception occurred while downloading {image_url} at row {row_num}: {e}")
  45.         return False
  46.  
  47. # Function to iterate through the extracted curl commands and download images accordingly
  48. def download_images(output_file, output_dir, log_file):
  49.     # Ensure that the output directory exists, if not, create it
  50.     if not os.path.exists(output_dir):
  51.         os.makedirs(output_dir)
  52.  
  53.     # List to keep track of failed downloads
  54.     failed_downloads = []
  55.  
  56.     # Open the file containing the curl commands and read each line
  57.     with open(output_file, 'r') as file:
  58.         curl_commands = file.readlines()
  59.  
  60.         # Iterate over each curl command
  61.         for row_num, command in enumerate(curl_commands, start=1):
  62.             # Use regex to extract the filename without the extension and the URL from the curl command
  63.             filename_match = re.search(r'curl -o "([\w_\-\d]+)(\.[\w]+)?"', command)
  64.             image_url_match = re.search(r'"(https://drive\.google\.com/open\?id=[\w\d_-]+)"', command)
  65.  
  66.             # If both filename and URL are successfully extracted
  67.             if filename_match and image_url_match:
  68.                 # Extract the filename without extension
  69.                 filename_without_ext = filename_match.group(1)
  70.                 # Extract the image URL
  71.                 image_url = image_url_match.group(1)
  72.  
  73.                 # Call the function to download the image and save it with the correct extension
  74.                 success = download_image_with_correct_extension(image_url, output_dir, filename_without_ext, row_num)
  75.  
  76.                 # If the download fails, log the failed URL
  77.                 if not success:
  78.                     failed_downloads.append(f"Row {row_num}: {image_url}")
  79.             else:
  80.                 # If the curl command is malformed, log the issue
  81.                 print(f"Skipping malformed row {row_num}: {command}")
  82.                 failed_downloads.append(f"Row {row_num}: Malformed command")
  83.  
  84.             # Add a delay to prevent overloading the server by downloading too fast
  85.             time.sleep(2)  # Increase this delay if necessary
  86.  
  87.     # Log any failed downloads to the log file
  88.     if failed_downloads:
  89.         with open(log_file, 'w') as log:
  90.             for failure in failed_downloads:
  91.                 log.write(f"{failure}\n")
  92.         print(f"Failed downloads logged in {log_file}")
  93.     else:
  94.         # If all images were downloaded successfully, print a success message
  95.         print("All images downloaded successfully!")
  96.  
  97. if __name__ == "__main__":
  98.     # Path to the input file containing the curl commands (this file contains the curl download commands)
  99.     log_file = "bases.txt"  # This is the input file with curl commands, adjust the filename as needed
  100.     output_file = "extracted_commands.txt"  # File to store the extracted curl commands
  101.  
  102.     # Directory to save the downloaded images
  103.     output_directory = "./downloaded_images"
  104.     # Log file to record any failed downloads
  105.     log_file_path = "./failed_downloads.log"
  106.  
  107.     # Call the function to start downloading images based on the extracted curl commands
  108.     download_images(output_file, output_directory, log_file_path)
  109.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement