Advertisement
here2share

# webscrape_images2.py

Aug 22nd, 2023 (edited)
954
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.98 KB | None | 0 0
  1. # webscrape_images2.py
  2.  
  3. import urllib.request
  4. import os
  5. import re
  6. import time
  7. import random
  8. from PIL import Image
  9. import subprocess
  10.  
  11. internal = []
  12. keep = []
  13. links_only = 1
  14.  
  15. def redirect(html):
  16.     image_links = [target + t.split('"')[0] for t in html.split(r'***/')] ### substring needed to be changed now and then
  17.     return image_links
  18.  
  19. def simulate_button_click(html_file_name, button_name):
  20.     # Open the HTML file and read its contents.
  21.     with open(html_file_name, "rb") as f:
  22.         html = f.read()
  23.  
  24.     # Create an HTTP POST request with the button's name as the value of the "name" parameter.
  25.     data = "name={}".format(button_name).encode("utf-8")
  26.     headers = {"Content-Type": "application/x-www-form-urlencoded"}
  27.     process = subprocess.run(["curl", "-X", "POST", "-d", data, "-H", str(headers), "file://{}".format(html_file_name)], capture_output=True)
  28.  
  29.     # Return the HTTP response from the server, which should contain the contents of the next page.
  30.     return process.stdout.decode("utf-8")
  31.  
  32. def get_data(link):
  33.     req = urllib.request.Request(url=link, headers=headers)
  34.     return str(urllib.request.urlopen(req).read())
  35.  
  36. def addlinks(keyURL):
  37.     html = get_data(keyURL)
  38.     html = html.split('<img src="')
  39.  
  40.     for z in html:
  41.         z = z.split('"')[0]
  42.         keep.append(z)
  43.         internal.append(z)
  44.  
  45. base_folder = r'c:\Z-DL-TEMP'
  46. if not os.path.exists(base_folder):
  47.     os.makedirs(base_folder)
  48.  
  49. #fixed duplicate checker function
  50. dupl = set()
  51.  
  52. def download_image(url):
  53.     if url in dupl:
  54.         return
  55.     file_name = url.split(target)[1]
  56.     file_name = file_name.replace('/', '_')
  57.     file_name = file_name.split('.')
  58.     file_name = '_'.join(file_name[:-1]) + '.' + file_name[-1]
  59.        
  60.     # Download the image first
  61.     urllib.request.urlretrieve(url, os.path.join(base_folder, file_name))
  62.    
  63.     # Open and process the image using Pillow
  64.     img = Image.open(os.path.join(base_folder, file_name))
  65.     width, height = img.size
  66.    
  67.     print([width, height])
  68.     if width > 500 and height > 500:
  69.         dupl.add(url)
  70.         if img.mode != 'RGB':
  71.             img = img.convert('RGB')
  72.         img.save(os.path.join(base_folder, file_name), format='JPEG', quality=90, optimize=True)
  73.         print('+'*20, 'IMAGE SAVED', '+'*20, '\n')
  74.     elif width < 200 and height < 200:
  75.         print('')
  76.         return 1
  77.     else:
  78.         print('__too small__')
  79.     print('')
  80.  
  81. def from_page(u):
  82.     print('='*20)
  83.     print("+++", u)
  84.     print('='*20)
  85.     print('')
  86.  
  87.     in_page = get_data(u)
  88.     image_links = redirect(in_page)
  89.     for link in image_links[1:]:
  90.         print("Trying:", link)
  91.         download_image(link)
  92.  
  93. def main(z):
  94.     internal.append(z)
  95.     keep.append(z)
  96.     while internal:
  97.         random.shuffle(internal)
  98.         link = internal.pop(0)
  99.         if '***.' in link:
  100.             addlinks(link)
  101.             from_page(link)
  102.  
  103. url = 'https://***'
  104.  
  105. target = 'https://***'
  106.  
  107. headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
  108.  
  109. main(url)
  110.  
  111. print('\n','_'*10,'DONE!','_'*10, '\n')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement