Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # webscrape_images.py
- import urllib, os, re, time
- import urllib2
- import ImageFile
- import random
# Crawl state shared by addlinks() and main().
# internal: links still waiting to be visited.
# keep: every link queued so far, consulted to avoid queueing one twice.
internal, keep = [], []

# When truthy, download_image() only prints qualifying URLs instead of
# saving the files.
links_only = 1
def addlinks(keyURL):
    """Queue the not-yet-seen links found on the page at ``keyURL``.

    The page is fetched, every ``href="..."`` value is extracted, unwanted
    targets are dropped, and each remaining link that is not already in the
    module-level ``keep`` list is appended to both ``keep`` and ``internal``.

    Returns ``None``.  If the page cannot be fetched or read, the function
    returns without touching either list.
    """
    def trim(target):
        # Trailing slashes/backslashes are removed so that the same URL
        # written with and without them compares equal.
        return target.rstrip('/\\')

    # Best-effort fetch: any failure skips this page.  ``Exception`` (rather
    # than a bare except) lets Ctrl-C / SystemExit stop the crawl.
    try:
        page = urllib.urlopen(trim(keyURL))
    except Exception:
        return
    try:
        html = page.read()
    except Exception:
        return
    finally:
        page.close()

    hrefs = re.findall(r'href="([^"]+)"', html)
    # deviantart links are only interesting when they point at an /art/ page;
    # links to other hosts are kept as they are.
    candidates = sorted(set(
        href for href in hrefs
        if not ('deviantart.' in href and '/art/' not in href)))

    # NOTE(review): 'java.' and 'mailto.' do not match 'javascript:' or
    # 'mailto:' hrefs -- confirm whether ':' was intended.
    avoid = 'java. mailto. about. about. shop. welcome. adverti /apply/'.split()
    blocked = list('''"'#=''') + avoid

    # Every candidate is examined (the previous index loop stopped one short
    # and never looked at the last link).
    for link in candidates:
        if any(token in link for token in blocked):
            continue
        link = trim(link)
        # A link made only of slashes trims to '' and is not worth queueing.
        if link and link not in keep:
            keep.append(link)
            internal.append(link)
- #
# URLs whose target file name was already present in base_folder.
dupl = []

# Folder that receives the downloaded images; created up front when missing.
base_folder = r'c:\Z-DL-TEMP\\'
if not os.path.exists(base_folder):
    os.makedirs(base_folder)
def _probe_image(url):
    """Return ``(content_length, dimensions)`` for the resource at ``url``.

    ``content_length`` is the raw Content-Length header value (``None`` when
    the header is absent).  ``dimensions`` is the ``(width, height)`` tuple
    reported by ``ImageFile.Parser`` or ``None`` when no image was recognised.
    The body is read in small chunks and reading stops as soon as the parser
    has identified the image.
    """
    remote = urllib.urlopen(url)
    try:
        content_length = remote.headers.get("content-length")
        parser = ImageFile.Parser()
        while True:
            try:
                chunk = remote.read(16)
            except Exception:
                break
            if not chunk:
                # End of stream without a recognisable image; without this
                # check the loop would keep feeding empty reads forever.
                break
            parser.feed(chunk)
            if parser.image:
                return content_length, parser.image.size
        return content_length, None
    finally:
        remote.close()


def _handle_image_url(url):
    """Report on one image URL and save it if it passes the size filter."""
    print(url)
    print('')
    file_name = re.sub('^https?://', '', url).split('/').pop()

    try:
        content_length, dimensions = _probe_image(url)
    except IOError:
        # One unreachable image should not abort the whole crawl.
        print("### Skipped: Could Not Open URL ###")
        return

    # Single pre-formatted arguments keep the output identical under the
    # Python 2 print statement.
    if dimensions is None:
        width = height = 0
        print(content_length)
    else:
        width, height = dimensions
        print('%s %s' % (content_length, [width, height]))
        print('')

    # A missing or malformed Content-Length counts as 0 bytes instead of
    # raising from int().
    try:
        byte_count = int(content_length)
    except (TypeError, ValueError):
        byte_count = 0

    if height >= 750 and width >= 240:
        prefix = str(width).zfill(4) + 'x' + str(height).zfill(4)
    elif byte_count >= 100000:
        prefix = str(byte_count).zfill(9)
    else:
        print("### Skipped Regarding Image Size ###")
        return

    target = os.path.join(base_folder, prefix + file_name)
    if os.path.isfile(target):
        print("### Filename Already Exists ###")
        dupl.append(url)
    elif links_only:
        print(url)
    else:
        # Download under a short temporary name, then move into place.
        temp = os.path.join(base_folder, file_name[:4])
        urllib.urlretrieve(url, temp)
        os.rename(temp, target)
        print('%s IMAGE SAVED %s' % ('_' * 10, '_' * 10))


def download_image(urls_list):
    """Check every URL in ``urls_list`` and save the images that qualify.

    For each non-empty URL the Content-Length header and the pixel size are
    probed.  An image qualifies when it is at least 240 wide and 750 high, or
    when its Content-Length is at least 100000 bytes; anything else is
    skipped.  For a qualifying image:

    * if the target file already exists in ``base_folder`` the URL is
      appended to ``dupl``;
    * otherwise, if ``links_only`` is truthy, the URL is only printed;
    * otherwise the file is downloaded into ``base_folder`` under its
      original name prefixed with ``WWWWxHHHH`` (or the zero-padded byte
      count when it qualified by size in bytes).

    Two blank lines are printed after every entry.  Returns ``None``.
    """
    for url in urls_list:
        if url:
            _handle_image_url(url)
        print('')
        print('')
- #
def from_page(u):
    """Fetch page ``u`` and hand its ``full-img`` JPEG links to download_image.

    A banner with the URL is printed first.  If the page cannot be fetched or
    read the function returns ``None`` without further output.  Otherwise
    every distinct ``full-img="..."`` attribute value ending in ``.jpg`` or
    ``.jpeg`` is passed to ``download_image``.
    """
    banner = '=' * 20
    print(banner)
    # Single pre-formatted argument: same output as the Python 2 statement
    # form ``print "+++", u``.
    print('+++ %s' % (u,))
    print(banner)
    print("Please Wait...")
    print('')

    # Best-effort fetch; ``Exception`` rather than a bare except so that
    # Ctrl-C / SystemExit still stop the crawl.
    try:
        page = urllib.urlopen(u)
    except Exception:
        return
    try:
        in_page = page.read()
    except Exception:
        return
    finally:
        page.close()

    # The dot is escaped so only real .jpg/.jpeg extensions match, and
    # .jpeg is matched directly instead of rewriting it to .jpg in the page
    # text, which also rewrote the URL that is later requested.
    links = re.findall(r'full-img="([^"]+\.jpe?g)"', in_page)
    download_image(sorted(set(links)))
- #
- []
def main(z):
    """Crawl outward from the start URL ``z`` until the queue is empty.

    ``z`` is added to the module-level ``internal`` queue and ``keep`` list.
    Links are then taken from ``internal`` in random order: deviantart links
    are expanded with ``addlinks`` (which may queue more links) and each link
    is passed to ``from_page``.  Returns ``None`` once ``internal`` is empty.
    """
    internal.append(z)
    keep.append(z)
    while internal:
        # Draw one random entry directly instead of reshuffling the whole
        # queue on every iteration just to take its first element.
        link = internal.pop(random.randrange(len(internal)))
        if '.deviantart.com' in link:
            addlinks(link)
        # NOTE(review): the pasted source lost its indentation; from_page is
        # assumed to run for every dequeued link, not only deviantart ones --
        # confirm against the original script.
        from_page(link)
- #
# Run the crawl only when executed as a script, so importing this module
# does not start network activity.
if __name__ == '__main__':
    z = 'https://loopydave.deviantart.com/'
    main(z)
    # NOTE: the crawl might never finish while it keeps finding new links
    # across various websites, so the summary below may never be reached.
    # Single pre-formatted argument: same output as the Python 2 statement
    # form of this print.
    print('\n%s DONE! %s' % ('_' * 10, '_' * 10))
    print('')
    # URLs skipped because their target file already existed.
    for duplicate_url in dupl:
        print(duplicate_url)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement