Pandaaaa906

tmall master worker version

May 15th, 2021
Python
import asyncio
from asyncio import Queue
from os import getenv

from loguru import logger
from lxml.etree import HTML
from more_itertools import first
from pyppeteer import launch
# pyppeteer raises its own TimeoutError (a subclass of asyncio.TimeoutError),
# not the builtin one, so import it for the except clauses below
from pyppeteer.errors import TimeoutError
from pyppeteer.page import Page
from pyppeteer_stealth import stealth


# Example product page (not used below; product URLs are collected from the shop listing)
product_url = 'https://detail.tmall.com/item.htm?' \
              'spm=a1z10.5-b-s.w4011-21229599754.159.a43d3ab8lMMAyo&' \
              'id=636149087216&rn=6d43f66ab34e0ad782135f76e059ddc7&abbucket=1'
url_all_prd = 'https://skecherstx.tmall.com/?search=y'
# Set by the master once every listing page has been queued
exit_flag = False


async def parse_page(page: Page):
    # Close the login dialog if it shows up
    try:
        elem = await page.waitForXPath('//div[@class="baxia-dialog-close"]')
        await elem.click()
    except TimeoutError:
        pass
    # Click the "累计评价" (accumulated reviews) tab
    review = await page.waitForXPath('//a[text()="累计评价 "]/parent::li')
    # magic sleep
    await asyncio.sleep(1)
    await review.click()

    while True:
        await page.waitForXPath('//div[@class="rate-grid"]//tr')
        # Grab the whole page HTML and parse it with lxml
        html = HTML(await page.content())
        rows = html.xpath('//div[@class="rate-grid"]//tr')
        for row in rows:
            d = {
                'rate_content': first(row.xpath('.//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]//text()'), None),
                'rate_date': first(row.xpath('.//div[@class="tm-rate-date"]/text()'), None),
                'rate_reply': first(row.xpath('.//div[@class="tm-rate-reply"]//text()'), None),
                'rate_sku': ';'.join(row.xpath('.//div[@class="rate-sku"]/p/text()')),
                'rate_user': ''.join(row.xpath('.//div[@class="rate-user-info"]//text()')),
            }
            logger.info(d)

        # Go to the next review page; waitForXPath raises TimeoutError when there is none
        try:
            next_page = await page.waitForXPath('//div[@class="rate-paginator"]/span[not(@class)]/following-sibling::a')
        except TimeoutError:
            break
        if not next_page:
            break
        await next_page.click()


async def init_page(page: Page):
    # Apply stealth patches and a desktop-sized viewport to every new page
    await stealth(page)
    await page.setViewport({
        'width': 1200,
        'height': 960
    })
    return page


@logger.catch
async def worker(browser, queue):
    page = await browser.newPage()
    page = await init_page(page)
    logger.info('worker initialized')
    while not exit_flag or not queue.empty():
        try:
            # Use a timeout so the worker re-checks exit_flag instead of
            # blocking forever on an empty queue
            url = await asyncio.wait_for(queue.get(), timeout=5)
        except asyncio.TimeoutError:
            continue
        logger.debug(f'going to url: {url}')
        await page.goto(url)
        await parse_page(page)


@logger.catch
async def master(browser, queue):
    global exit_flag
    page = await browser.newPage()
    page = await init_page(page)

    # Open the page listing all products
    await page.goto(url_all_prd)
    while True:
        await page.waitForXPath('//div[contains(@class, "item") and contains(@class, "line")]')

        # Extract the product links and queue them for the workers
        html = HTML(await page.content())
        urls = html.xpath('//div[contains(@class, "item") and contains(@class, "line")]//dt/a/@href')

        for prd_url in urls:
            await queue.put(prd_url)

        # Go to the next listing page
        try:
            next_page = await page.waitForXPath('//div[@class="pagination"]/a[@class="page-cur"]/following-sibling::a')
        except TimeoutError:
            logger.info('Might be detected')
            break
        if not next_page:
            break
        await next_page.click()
    exit_flag = True


async def main(headless: bool = True, n_workers: int = 1):
    logger.info('Starting')
    browser = await launch(
        headless=headless,
        args=[
            '--no-sandbox',
        ]
    )
    queue = Queue()
    # One master producing URLs, several workers consuming them
    try:
        await asyncio.gather(
            master(browser, queue),
            *(worker(browser, queue) for _ in range(n_workers))
        )
    finally:
        await browser.close()


if __name__ == '__main__':
    n = int(getenv('N_WORKERS', 1))
    asyncio.run(main(headless=False, n_workers=n))

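# How it might be run (an assumption; the paste itself only defines the script):
# save it e.g. as tmall_reviews.py, install pyppeteer, pyppeteer_stealth, lxml,
# loguru and more_itertools, then:
#
#   N_WORKERS=3 python tmall_reviews.py
#
# N_WORKERS sets how many worker pages scrape in parallel (default 1); the
# browser is launched with headless=False, so a visible Chromium window opens.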