Pandaaaa906

netbian

Apr 23rd, 2021
import asyncio
import shelve
from os import path
from pathlib import Path

from aiofile import async_open
from httpx import AsyncClient
from loguru import logger
from lxml.etree import HTML
from more_itertools import first

url_tmpl = 'http://www.netbian.com/index{}.htm'
downloads_dir = Path('downloads')
# Strip characters that are invalid in Windows filenames from image titles.
trans = str.maketrans('', '', r'\/:*?"<>|')
logger.add('netbian.log', rotation='1 weeks')
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/89.0.4389.128 Safari/537.36'
}
# Persistent cache so an interrupted run can resume from the last finished page.
cache = shelve.open('cache', writeback=True)


async def download_img(client: AsyncClient, url):
    """Fetch a wallpaper detail page and save the full-size image to disk."""
    r = await client.get(url)
    r.encoding = 'gbk'  # the site serves GBK-encoded pages
    html = HTML(r.text)
    img_url = first(html.xpath('//div[@class="pic"]//img/@src'), None)
    title = first(html.xpath('//div[@class="pic"]//img/@title'), None)
    if not img_url:
        return
    *_, ext = path.splitext(img_url)
    r = await client.get(img_url)
    if r.status_code != 200:
        logger.warning(f'{img_url=} got wrong status code: {r.status_code}')
        return
    # Sanitize the title for use as a filename and write the image bytes asynchronously.
    async with async_open(downloads_dir / f'{title.translate(trans)}{ext}', 'wb') as f:
        async for chunk in r.aiter_bytes():
            await f.write(chunk)


async def main(start_over=False):
    logger.info('Starting')
    cur_page = cache.get('cur_page', 1) if not start_over else 1
    # Page 1 is index.htm; later pages are index_2.htm, index_3.htm, ...
    cur_url = url_tmpl.format(f'_{cur_page}' if cur_page != 1 else '')
    async with AsyncClient(headers=headers, timeout=30) as client:
        while cur_url:
            r = await client.get(cur_url)
            html = HTML(r.text)
            # Download every wallpaper linked from the current listing page concurrently.
            tasks = (
                asyncio.create_task(download_img(client, r.url.join(rel_url)))
                for rel_url in html.xpath('//div[@class="list"]//li/a[img]/@href')
            )
            await asyncio.gather(*tasks)
            # Move on to the next listing page (the last @class="prev" anchor); stop when absent.
            cur_url = (tmp := first(html.xpath('//a[@class="prev"][last()]/@href'), None)) and r.url.join(tmp)
            logger.info(f'page: {cur_page} finish download')
            cur_page += 1
            cache['cur_page'] = cur_page
            cache.sync()  # flush progress so an abrupt exit does not lose the resume point
    logger.info('Finished downloading')


if __name__ == '__main__':
    downloads_dir.mkdir(exist_ok=True)
    asyncio.run(main())
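Usage note: by default the script resumes from the page number stored in the cache shelf. A minimal sketch of forcing a fresh crawl from page 1, using the existing start_over parameter (nothing new is assumed beyond what the script already defines):

# Alternative entry point: ignore the cached page and re-crawl from index.htm.
if __name__ == '__main__':
    downloads_dir.mkdir(exist_ok=True)
    asyncio.run(main(start_over=True))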