Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# webscrape_images2.py
import os
import random
import re
import subprocess
import time
import urllib.parse
import urllib.request
from pathlib import Path

from PIL import Image
# Crawl frontier: links still waiting to be visited by main().
internal = []
# Every link that has ever been queued (filled by main() and addlinks()).
keep = []
# Flag that is not read anywhere in the visible code.
links_only = 1
def redirect(html):
    """Build candidate image URLs from a page's source text.

    The text is cut at every occurrence of the marker ``'***/'``. For each
    piece, everything up to the first double quote is appended to the
    module-level ``target`` prefix.

    The first piece is whatever precedes the first marker, so the first
    returned entry is not a real link; ``from_page`` skips it.

    Args:
        html: Page source as text.

    Returns:
        list: One candidate URL per piece, in document order.
    """
    # NOTE: the marker substring needs to be changed now and then.
    return [target + piece.split('"')[0] for piece in html.split(r'***/')]
def simulate_button_click(html_file_name, button_name):
    """POST the form field ``name=<button_name>`` at a local HTML file via curl.

    Args:
        html_file_name: Path of the HTML file on disk.
        button_name: Value sent for the ``name`` form field.

    Returns:
        str: curl's standard output decoded as UTF-8.

    Raises:
        OSError: If the HTML file cannot be opened or the ``curl``
            executable cannot be started.
    """
    # The contents are not needed; opening the file keeps the fail-fast
    # behaviour when the path is missing or unreadable.
    with open(html_file_name, "rb"):
        pass

    # URL-encode the value so spaces, '&' and '=' cannot corrupt the form body.
    data = urllib.parse.urlencode({"name": button_name})
    # curl expects a header as a literal "Name: value" line, not a dict repr.
    content_type = "Content-Type: application/x-www-form-urlencoded"
    # as_uri() yields a well-formed file:// URL, also for Windows paths.
    file_url = Path(html_file_name).resolve().as_uri()

    # NOTE(review): a file:// target has no server behind it, so the POST
    # body is presumably ignored by curl -- confirm this does what is intended.
    process = subprocess.run(
        ["curl", "-X", "POST", "-d", data, "-H", content_type, file_url],
        capture_output=True,
    )
    return process.stdout.decode("utf-8")
def get_data(link):
    """Fetch *link* and return the response body as text.

    The request carries the module-level ``headers`` (browser User-Agent).

    Args:
        link: URL to fetch.

    Returns:
        str: The decoded response body. The charset announced by the server
        is used when present and known, otherwise UTF-8; undecodable bytes
        are replaced rather than raising.

    Raises:
        urllib.error.URLError: If the request fails.
        ValueError: If *link* is not a valid URL.
    """
    req = urllib.request.Request(url=link, headers=headers)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(req) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"
    # Decode instead of str(bytes): str() would return the "b'...'" repr with
    # escape sequences in place of the real characters.
    try:
        return raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know.
        return raw.decode("utf-8", errors="replace")
def addlinks(keyURL):
    """Queue the ``<img src="...">`` targets found on a page.

    Each new target is appended to both module-level lists: ``keep`` (every
    link ever seen) and ``internal`` (links still to be visited).

    Args:
        keyURL: URL of the page to scan.
    """
    page = get_data(keyURL)
    # Segment 0 is the markup before the first <img> tag, not a link.
    for segment in page.split('<img src="')[1:]:
        src = segment.split('"')[0]
        # Skip empty attributes and links already queued once, so pages that
        # reference each other cannot keep the crawl running forever.
        if not src or src in keep:
            continue
        keep.append(src)
        internal.append(src)
# Directory that receives every downloaded image.
base_folder = r'c:\Z-DL-TEMP'
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(base_folder, exist_ok=True)

# Duplicate checker: URLs whose image has already been saved.
dupl = set()
def download_image(url):
    """Download *url* into ``base_folder`` and re-save it as JPEG if large.

    The file name is the part of *url* after the module-level ``target``
    prefix, with ``/`` and every ``.`` except the last replaced by ``_``.

    Images wider and taller than 500 px are converted to RGB, re-saved in
    place as JPEG (quality 90) and their URL is recorded in ``dupl``.

    Args:
        url: Image URL; must contain ``target``.

    Returns:
        None when the URL was already saved, when the image was saved, or
        when it was rejected as too small; 1 when both dimensions are below
        200 px.

    Raises:
        IndexError: If *url* does not contain ``target``.
        OSError: If the download fails or the file is not a readable image.
    """
    if url in dupl:
        return None

    relative = url.split(target)[1].replace('/', '_')
    # Keep only the last dot as the extension separator.
    stem, _, extension = relative.rpartition('.')
    file_name = stem.replace('.', '_') + '.' + extension
    destination = os.path.join(base_folder, file_name)

    # Download the image first, sending the same browser headers as
    # get_data() so the two kinds of request are treated alike by the site.
    request = urllib.request.Request(url=url, headers=headers)
    with urllib.request.urlopen(request) as response, open(destination, 'wb') as out:
        out.write(response.read())

    # Open and inspect the image using Pillow; the context manager releases
    # the file handle before the file is overwritten below.
    with Image.open(destination) as img:
        width, height = img.size
        large_enough = width > 500 and height > 500
        # convert() returns a fully loaded copy, so the pixels stay usable
        # after the source file is closed.
        rgb = img.convert('RGB') if large_enough else None
    print([width, height])

    if rgb is not None:
        dupl.add(url)
        rgb.save(destination, format='JPEG', quality=90, optimize=True)
        print('+'*20, 'IMAGE SAVED', '+'*20, '\n')
        return None

    # NOTE(review): rejected downloads are left on disk, as before -- confirm
    # whether they should be deleted.
    if width < 200 and height < 200:
        print('')
        return 1

    print('__too small__')
    print('')
    return None
def from_page(u):
    """Fetch page *u* and try to download every image link found on it.

    Prints a banner naming the page, then one ``Trying:`` line per link
    before handing the link to ``download_image``.

    Args:
        u: URL of the page to process.
    """
    rule = '=' * 20
    print(rule)
    print("+++", u)
    print(rule)
    print('')

    # The first entry built by redirect() comes from the text before the
    # first marker and is not a link, so it is skipped.
    for link in redirect(get_data(u))[1:]:
        print("Trying:", link)
        download_image(link)
def main(z):
    """Crawl from the start URL *z* until no queued link is left.

    Links are taken from the module-level ``internal`` list in random order.
    A link containing the site marker ``'***.'`` has its image links queued
    via ``addlinks`` and its images downloaded via ``from_page``.

    Args:
        z: URL the crawl starts from.
    """
    internal.append(z)
    keep.append(z)
    while internal:
        # Removing one random entry visits links in random order without
        # reshuffling the whole list on every iteration.
        link = internal.pop(random.randrange(len(internal)))
        # NOTE(review): the pasted source lost its indentation; from_page is
        # assumed to sit inside this domain check, so links that do not match
        # the site marker are never fetched -- confirm against the original.
        if '***.' in link:
            addlinks(link)
            from_page(link)
# Start page of the crawl ('***' looks like a placeholder for the real site).
url = 'https://***'
# Prefix used to build image URLs in redirect() and to derive file names in
# download_image().
target = 'https://***'
# Browser-like request headers sent with page requests.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}

# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main(url)
    print('\n','_'*10,'DONE!','_'*10, '\n')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement