- """
- 百度图片批量下载(翻页版)
- 默认的百度图片由下拉网页载入新的图片,一定程度给爬虫增大了难度
- 这里采用其可翻页版本爬取下载图片
- 百度图片: https://image.baidu.com
- 可翻页版网址: https://image.baidu.com/search/flip?xxx
- 使用方法:
- search = BaiduImage('上海对外经贸大学')
- search.crawler()
- 图片会保存在: 当前目录/baidu_image_上海对外经贸大学/
- 其他参数见初始化函数
- 注:百度图片搜索结果不会产生 404 页,所以搜索第一页总是可下载
- """
from requests_html import HTMLSession
from urllib import parse
import requests_html
import datetime
import hashlib
import sys
import os
import re


class BaiduImage(object):
    def __init__(self, keyword,
                 downdir=None, start=1, maxpage=None,
                 timeout=1, retry=True, maxtry=3):
        # keyword:  search keyword
        # downdir:  directory for downloaded images; defaults to baidu_image_<keyword>
        # start:    result page to start from (1-based)
        # maxpage:  maximum number of pages to crawl; defaults to 1e8 (keep flipping)
        # timeout:  per-request timeout in seconds
        # retry:    whether to retry images that failed to download
        # maxtry:   maximum number of retry rounds for failed images
        # Other attributes:
        # flip_root:   base URL of the flip-page Baidu image search
        # url_params:  URL query parameters of the current page
        # session:     shared HTML session
        # response:    HTML response of the current page
        # last_urls:   image URLs found on the current page
        # failed_urls: URLs that failed to download
        # counter:     number of images downloaded so far
        self.headers = {
            'Referer': 'https://image.baidu.com'
        }
        self.flip_root = r'https://image.baidu.com/search/flip?'
        self.keyword = keyword
        self.downdir = downdir
        self.maxpage = maxpage
        self.timeout = timeout
        self.retry = retry
        self.maxtry = maxtry
        if downdir is None:
            self.downdir = 'baidu_image_' + keyword
        self.url = ''
        self.url_params = {
            'tn': 'baiduimage',
            'ie': 'utf-8',
            'word': self.keyword,
            'pn': (start - 1) * 20
        }
        self.last_urls = list()
        self.failed_urls = set()
        self.session = HTMLSession()
        self.response: requests_html.HTMLResponse  # annotation only; set in get_page()
        self.counter = 0
        self.make_dir()

    def now_time(self):
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def make_dir(self):
        # Create the download directory if needed and chdir into it.
        try:
            if not os.path.exists(self.downdir):
                os.mkdir(self.downdir)
            os.chdir(self.downdir)
        except Exception as e:
            print(f'[{self.now_time()}] [ERR - dir] {e}')
            sys.exit(1)

    def save_single_image(self, url) -> bool:
        """
        Download and save a single image from the given URL.
        If the download fails, print the offending URL and record it in
        self.failed_urls.
        """
        try:
            r = self.session.get(url, headers=self.headers,
                                 timeout=self.timeout)
            r.raise_for_status()
            content = r.content
            # Name the file by the SHA-256 of its content: the name is
            # filesystem-safe and identical images collapse into one file.
            filename = hashlib.sha256(content).hexdigest()
            with open(filename + '.jpg', 'wb') as f:
                f.write(content)
            print(f'[{self.now_time()}] [INFO - Downloaded] [URL] {url}')
            self.counter += 1
            return True
        except Exception as e:
            self.failed_urls.add(url)
            print(
                f'[{self.now_time()}] [ERR - Download] [URL] {url} [ERR in Detail] {e}')
            return False

    def update_url(self):
        self.url = self.flip_root + parse.urlencode(self.url_params)
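
    # update_url() builds flip-version request URLs of roughly this form
    # (the keyword gets percent-encoded and pn advances by 20 per page):
    #   https://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=<keyword>&pn=0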

    def get_page(self):
        """
        Fetch the HTML response of the current page.
        """
        self.update_url()
        try:
            self.response = self.session.get(
                self.url, headers=self.headers, timeout=self.timeout)
            self.response.raise_for_status()
        except Exception as e:
            print(f'[{self.now_time()}] [ERR - Get Single Page] [ERR in Detail] {e}')

    def next_page(self):
        """
        Flip to the next page by updating self.url; set self.url to None if
        this is the last page.
        """
        self.response.encoding = self.response.apparent_encoding
        # A pager link (class="n") is present while there are more pages;
        # its absence means we are on the last page.
        if 'class="n"' in self.response.html.html:
            self.url_params['pn'] += 20
            self.update_url()
        else:
            self.url = None

    def download_images(self):
        self.get_page()
        html = self.response.html.html
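        # The flip-version result page inlines the URL of each result image in
        # JavaScript as "objURL":"..." fields, so a regex over the raw HTML is
        # enough; no JavaScript rendering is needed.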
        image_urls = re.findall(r'"objURL":"(.*?)"', html)
        for url in image_urls:
            if url not in self.last_urls:
                self.save_single_image(url)
        self.last_urls = image_urls

    def crawler(self):
        # Dirty: if self.maxpage is not set, use a large number to emulate an
        # endless loop.
        self.maxpage = self.maxpage if self.maxpage is not None else 10**8
        for _ in range(self.maxpage):
            self.download_images()
            self.next_page()
            if self.url is None:
                break
        # Retry the images in the failed list.
        if self.retry:
            print(f'[{self.now_time()}] [INFO] Retrying previously failed downloads...')
            print(self.failed_urls)
            for _ in range(self.maxtry):
                new_failed_urls = self.failed_urls.copy()
                for url in new_failed_urls:
                    if self.save_single_image(url):
                        self.failed_urls.remove(url)
        print(f'[{self.now_time()}] [INFO] Done. Downloaded {self.counter} images in total.')


if __name__ == '__main__':
    search = BaiduImage('上海对外经贸大学', start=1, maxpage=1)
    search.crawler()
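
# Further usage sketch (illustrative; parameter values here are placeholders,
# the parameters themselves are defined in __init__ above):
#
#     search = BaiduImage('上海对外经贸大学',
#                         downdir='my_images',   # custom download directory
#                         start=3,               # begin from result page 3
#                         maxpage=2,             # crawl at most 2 pages
#                         timeout=5,             # per-request timeout in seconds
#                         retry=True, maxtry=5)  # up to 5 retry rounds for failures
#     search.crawler()
#     print(search.failed_urls)                  # URLs that still failed after retrying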