Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# web_scraping_images.py
import contextlib
import os
import re
import urllib

try:  # Python 3 moved the fetch helpers into urllib.request
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve
def _split_url(url):
    """Return the ``(folder_name, file_name)`` pair used to store *url* locally."""
    # Drop any query string / fragment first: characters such as '?' are not
    # valid in Windows file names and are not part of the resource's name.
    bare = re.split(r'[?#]', url, maxsplit=1)[0]
    path_components = re.sub('^https?://', '', bare).split('/')
    file_name = path_components.pop()
    # Host and directories are flattened into one folder name joined by '..'.
    return '..'.join(path_components), file_name


def download_image(urls_list, base_folder=r'c:\Z-DL-TEMP\\'):
    """Download every URL in *urls_list* into a folder under *base_folder*.

    For each URL the ``http://`` / ``https://`` prefix and any query string
    or fragment are removed; the last path component becomes the file name
    and the remaining components, joined with ``'..'``, become the name of a
    single folder created directly under *base_folder*.  Two URLs that
    differ only in their query string therefore map to the same local file,
    and the later download replaces the earlier one.

    Args:
        urls_list: Iterable of URL strings to fetch.
        base_folder: Directory that receives the per-URL folders.  It is
            created when missing.  Defaults to the location that was
            previously hard-coded.

    Returns:
        None.  Progress is reported with ``print``.

    Raises:
        Errors from ``urlretrieve`` and from the ``os`` file operations are
        not caught and propagate to the caller.
    """
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)
    # Two blank lines of output before the progress messages.
    print('')
    print('')
    for url in urls_list:
        folder_name, file_name = _split_url(url)
        if not file_name:
            # A URL ending in '/' has no file name to save under.
            print('skipping (no file name) -> %s' % url)
            continue
        full_path = os.path.join(base_folder, folder_name)
        if not os.path.exists(full_path):
            os.makedirs(full_path)
            print('Made a new folder: %s' % folder_name)
        print('saving -> %s' % url)
        final_path = os.path.join(full_path, file_name)
        # Download under a temporary name and rename once the transfer has
        # finished, so an interrupted download never sits under the final name.
        temp_path = final_path + '.part'
        urlretrieve(url, temp_path)
        if os.path.exists(final_path):
            # os.rename() refuses to overwrite an existing file on Windows,
            # which made a second run of the script fail.
            os.remove(final_path)
        os.rename(temp_path, final_path)
def all_links(in_page):
    """Return the unique absolute JPEG links found in the HTML text *in_page*.

    Double-quoted ``href="..."`` values are examined first.  When none of
    them is a usable image link, double-quoted ``src="..."`` values are used
    instead.  A link is usable when it starts with ``http://`` or
    ``https://`` and contains ``.jpg``; both checks ignore case.

    Args:
        in_page: HTML source as a text string.

    Returns:
        A sorted list of the distinct matching links (empty when there are
        none).
    """
    def _is_image_url(link):
        lowered = link.lower()
        return lowered.startswith(('http://', 'https://')) and '.jpg' in lowered

    links = re.findall(r'href="([^"]+)"', in_page)
    # The old test, "'.jpg' not in links", looked for a list element equal to
    # '.jpg', so the src fallback always replaced the href matches.
    if not any(_is_image_url(link) for link in links):
        links = re.findall(r'src="([^"]+)"', in_page)
    # NOTE(review): the '.jpeg' -> '.jpg' rewrite is kept from the original;
    # it changes the URL text itself -- confirm this is really wanted.
    unique_links = set(
        link.replace('.jpeg', '.jpg') for link in links if _is_image_url(link)
    )
    # Sorted so the result does not depend on set iteration order.
    return sorted(unique_links)
def from_page(u):
    """Fetch the URL *u* and return the response body as a native ``str``.

    Args:
        u: URL to open.

    Returns:
        The full response body.  When the body is read as bytes that are not
        already ``str`` (Python 3), it is decoded as UTF-8 with undecodable
        bytes replaced, so the ``str`` regular expressions used on the page
        text can be applied to it.

    Raises:
        Errors from ``urlopen`` / ``read`` are not caught and propagate.
    """
    # closing() guarantees the response handle is released even if read()
    # fails; the handle was previously never closed.
    with contextlib.closing(urlopen(u)) as response:
        body = response.read()
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body
# Script body: fetch the page, collect its image links, download them all,
# then report completion.
page_source = from_page("http://pizzot.com/en/for-android")
image_urls = all_links(page_source)
download_image(image_urls)
print('Done')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement