# b_webscrape_images.py

Feb 8th, 2018
by here2share

A Python 2 crawler that walks a DeviantArt gallery's internal links and downloads (or, with links_only set, just lists) any full-size images that pass a minimum-size check.
# webscrape_images.py
# Python 2 script: uses urllib.urlopen and print statements throughout.

import os
import random
import re
import urllib

from PIL import ImageFile  # incremental parser, used to probe image dimensions

internal = []   # queue of internal links still to crawl
keep = []       # every link seen so far, so pages are not revisited
links_only = 1  # when true, print image URLs instead of saving files
def addlinks(keyURL):
    # Fetch keyURL and queue any internal links found in its HTML.
    def trim(target):
        # strip trailing slashes and backslashes
        try:
            while target[-1] in '/\\':
                target = target[:-1]
        except IndexError:
            pass
        return target
    url = trim(keyURL)
    try:
        html = urllib.urlopen(url).read()
    except IOError:
        return
    def filterHTML():
        # harvest every href, dropping deviantart links that are not art pages
        links = re.findall(r'href="([^"]+)"', html)
        links = [z for z in links
                 if not ('deviantart.' in z and '/art/' not in z)]
        return sorted(set(links))
    links = filterHTML()
    avoid = 'java. mailto. about. shop. welcome. adverti /apply/'.split()
    for link in links:
        # reject links containing quote/fragment/query characters or avoided words
        for i in ['"', "'", '#', '='] + avoid:
            if i in link:
                link = None
                break
        if link:
            # internal link worth keeping
            link = trim(link)
            if link not in keep:
                keep.append(link)
                internal.append(link)
#
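# A quick sanity check of the href-harvesting pattern used above; the sample
# HTML snippet here is made up for illustration:
#
#   >>> re.findall(r'href="([^"]+)"', '<a href="/art/Foo-123">x</a>')
#   ['/art/Foo-123']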
base_folder = r'c:\Z-DL-TEMP'  # download target, created on first run
if not os.path.exists(base_folder):
    os.makedirs(base_folder)
dupl = []  # URLs whose size-prefixed filename already existed on disk
def download_image(urls_list):
    for url in urls_list:
        print url
        print
        path_components = re.sub('^https?://', '', url).split('/')
        file_name = path_components.pop()
        try:
            f = urllib.urlopen(url)
        except IOError:
            continue
        size = f.headers.get("content-length")
        print size,
        # Feed the response to PIL's incremental parser a chunk at a time;
        # the dimensions are usually known after the first few kilobytes.
        p = ImageFile.Parser()
        w = h = 0
        while 1:
            try:
                data = f.read(1024)
            except IOError:
                break
            if not data:
                break
            p.feed(data)
            if p.image:
                w, h = p.image.size
                print [w, h]
                break
        f.close()
        if h >= 750 and w >= 240:
            hxw = str(w).zfill(4) + 'x' + str(h).zfill(4)
        elif size and int(size) >= 100000:
            # dimensions unknown, but the file is large enough to keep anyway
            hxw = str(size).zfill(9)
        else:
            print "### Skipped Regarding Image Size ###"
            continue
        if os.path.isfile(os.path.join(base_folder, hxw + file_name)):
            print "### Filename Already Exists ###"
            dupl.append(url)
        elif links_only:
            print url
        else:
            # download under a short temporary name, then rename with the size prefix
            tmp_path = os.path.join(base_folder, file_name[:4])
            urllib.urlretrieve(url, tmp_path)
            os.rename(tmp_path, os.path.join(base_folder, hxw + file_name))
            print '_' * 10, 'IMAGE SAVED', '_' * 10
        print
        print
#
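# The loop above leans on PIL's incremental ImageFile.Parser: feeding it the
# opening bytes of a file is normally enough to learn the dimensions without
# downloading the whole image. A minimal standalone sketch (the local
# 'sample.jpg' path is hypothetical):
#
#   p = ImageFile.Parser()
#   fh = open('sample.jpg', 'rb')
#   while not p.image:
#       chunk = fh.read(1024)
#       if not chunk:
#           break
#       p.feed(chunk)
#   fh.close()
#   if p.image:
#       print p.image.size  # (width, height)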
def from_page(u):
    # Scrape one page for full-size image URLs and hand them to download_image.
    print '=' * 20
    print "+++", u
    print '=' * 20
    print "Please Wait..."
    print
    try:
        in_page = urllib.urlopen(u).read()
    except IOError:
        return
    in_page = in_page.replace('.jpeg', '.jpg')
    # deviantart marked the full-resolution copy with a full-img attribute
    links = re.findall(r'full-img="([^"]+\.jpg)"', in_page)
    download_image(list(set(links)))
#
def main(z):
    # Crawl outward from the seed URL until the queue of internal links is empty.
    internal.append(z)
    keep.append(z)
    while internal:
        random.shuffle(internal)  # visit queued pages in random order
        link = internal.pop(0)
        if '.deviantart.com' in link:
            addlinks(link)
            from_page(link)
#
z = 'https://loopydave.deviantart.com/'
main(z)

### note: might not ever get past this point on various websites ###

print '\n', '_' * 10, 'DONE!', '_' * 10
print
for z in dupl:
    print z
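# Python 3 note: urllib.urlopen and the print statements above are Python 2
# only. A rough sketch of the fetch step under Python 3 (untested against
# deviantart.com, whose markup has since changed):
#
#   import urllib.request
#   html = urllib.request.urlopen(url).read().decode('utf-8', 'replace')
#   links = re.findall(r'href="([^"]+)"', html)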