Advertisement
shinemic

猫眼电影榜单

Nov 11th, 2019
405
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 0.96 KB | None | 0 0
  1. from requests_html import HTMLSession
  2. from collections import namedtuple
  3. import pandas as pd
  4. import re
  5.  
  6.  
  7. MovieRecord = namedtuple('MovieRecord', ['rank', 'title', 'star', 'releasetime', 'integer', 'fraction'])
  8.  
  9.  
  10. def nth_page(n):
  11.     url = f'https://maoyan.com/board/4?offset={10*(n-1)}'
  12.     pat = re.compile(r'''
  13.       <i\ class="board-index.*?>(?P<{}>.*?)</i>
  14.       .*?title="(?P<{}>.*?)"
  15.       .*?star">\s*主演:(?P<{}>.*?)\s*</p>
  16.       .*?releasetime">上映时间:(?P<{}>.*?)</p>
  17.       .*?integer">(?P<{}>.*?)</i>
  18.       .*?fraction">(?P<{}>.*?)</i>'''.format(*MovieRecord._fields), re.X | re.S)
  19.     req = HTMLSession().get(url)
  20.     return [MovieRecord(*x) for x in pat.findall(req.text)]
  21.  
  22.  
  23. if __name__ == '__main__':
  24.     movie_list = []
  25.     max_pages = 10
  26.     for i in range(1, max_pages):
  27.         movie_list.extend(nth_page(i))
  28.         print(f'{i}/{max_pages} finished.')
  29.     print(pd.DataFrame(movie_list, columns=MovieRecord._fields))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement