Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import asyncio
- from lxml.etree import HTML
- from more_itertools import first
- from pyppeteer import launch
- from pyppeteer_stealth import stealth
# Tmall product detail page whose review tab is scraped by main().
# Written as one parenthesized literal (implicit string concatenation) so the
# value does not depend on fragile backslash line continuations.
url = (
    'https://detail.tmall.com/item.htm?'
    'spm=a1z10.5-b-s.w4011-21229599754.159.a43d3ab8lMMAyo&'
    'id=636149087216&rn=6d43f66ab34e0ad782135f76e059ddc7&abbucket=1'
)
def _parse_review_rows(page_source):
    """Return one dict per review row found in the rendered page HTML.

    Each dict has the keys ``rate_content``, ``rate_date``, ``rate_reply``,
    ``rate_sku`` and ``rate_user``. The first three are ``None`` when the
    corresponding node is absent; the last two are joined strings (possibly
    empty).
    """
    document = HTML(page_source)
    reviews = []
    for row in document.xpath('//div[@class="rate-grid"]//tr'):
        reviews.append({
            'rate_content': first(row.xpath('.//div[@class="tm-rate-content"]/div[@class="tm-rate-fulltxt"]//text()'), None),
            'rate_date': first(row.xpath('.//div[@class="tm-rate-date"]/text()'), None),
            'rate_reply': first(row.xpath('.//div[@class="tm-rate-reply"]//text()'), None),
            'rate_sku': ';'.join(row.xpath('.//div[@class="rate-sku"]/p/text()')),
            'rate_user': ''.join(row.xpath('.//div[@class="rate-user-info"]//text()')),
        })
    return reviews


async def main():
    """Open the product page, switch to the review tab and print every review.

    Launches a visible (non-headless) browser, dismisses the login dialog if
    one appears, clicks the review tab, then walks the review paginator,
    printing one dict per review row. The browser is closed on every exit
    path, including errors.
    """
    browser = await launch(
        headless=False,
        args=[
            '--no-sandbox',
        ],
    )
    try:
        page = await browser.newPage()
        await stealth(page)
        await page.goto(url)

        # Close the login dialog. pyppeteer signals a wait timeout with its
        # own TimeoutError, which derives from asyncio.TimeoutError; the
        # builtin TimeoutError only matches it on Python 3.11+, so catch the
        # asyncio name to be correct on every version.
        try:
            close_button = await page.waitForXPath('//div[@class="baxia-dialog-close"]')
        except asyncio.TimeoutError:
            pass  # no dialog appeared; nothing to dismiss
        else:
            await close_button.click()

        # Click the reviews tab.
        review = await page.waitForXPath('//a[text()="累计评价 "]/parent::li')
        # Magic sleep: clicking immediately after the element appears was
        # apparently unreliable in the original script.
        await asyncio.sleep(1)
        await review.click()

        while True:
            await page.waitForXPath('//div[@class="rate-grid"]//tr')
            # NOTE(review): after a page turn the previous page's rows may
            # still satisfy this wait, so a page could be read before it
            # refreshes -- confirm against the live site.

            # Grab the HTML of the whole page and parse it offline with lxml.
            for d in _parse_review_rows(await page.content()):
                print(d)

            # waitForXPath raises on timeout instead of returning a falsy
            # value, so a missing "next page" link (last page) must be
            # detected via the exception.
            try:
                next_page = await page.waitForXPath('//div[@class="rate-paginator"]/span[not(@class)]/following-sibling::a')
            except asyncio.TimeoutError:
                break
            if not next_page:
                break
            await next_page.click()
    finally:
        # Always release the Chromium process, even when scraping fails.
        await browser.close()
# Script entry point: drive the scraper coroutine to completion on a fresh
# event loop when this file is executed directly (not when imported).
if __name__ == "__main__":
    asyncio.run(main())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement