Advertisement
here2share

# web_scraping_images.py

Apr 6th, 2015
441
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.17 KB | None | 0 0
  1. # web_scraping_images.py
  2. import urllib, os, re
  3.  
  4. def download_image(urls_list):
  5.     base_folder = r'c:\Z-DL-TEMP\\'
  6.     if not os.path.exists(base_folder):
  7.         os.makedirs(base_folder)
  8.     print
  9.     print
  10.     for url in urls_list:
  11.         path_components = re.sub('^https?://', '', url).split('/')
  12.         file_name = path_components.pop()
  13.         folder_name = '..'.join(path_components)
  14.         full_path = base_folder + folder_name
  15.         if not os.path.exists(full_path):
  16.             os.makedirs(full_path)
  17.             print 'Made a new folder:', folder_name
  18.         print 'saving ->', url
  19.         urllib.urlretrieve(url, os.path.join(base_folder,
  20.                                              folder_name,
  21.                                              file_name[:4])) # ***
  22.         os.rename(full_path+'\\'+file_name[:4],full_path+'\\'+file_name) ### for os.rename() demo ref
  23.  
  24. def all_links(in_page):
  25.     links = re.findall(r'href="([^"]+)"',in_page)
  26.     if '.jpg' not in links:
  27.         links = re.findall(r'src="([^"]+)"',in_page)
  28.     links = [links[i].replace('.jpeg','.jpg') for i in xrange(0,len(links)) if ('http:' or 'https:') and '.jpg' in links[i].lower()]
  29.     return list(set(links))
  30.  
  31. def from_page(u):
  32.     return urllib.urlopen(u).read()
  33.  
  34. download_image(all_links(from_page("http://pizzot.com/en/for-android")))
  35. print 'Done'
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement