Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import re
from collections import namedtuple

import pandas as pd
from requests_html import HTMLSession
# One movie entry scraped from a board page. The field names are also used,
# in this order, as the named groups of the regex in nth_page, so the order
# here must match the order in which those groups appear in the pattern.
# All values are stored as the raw captured strings.
# NOTE(review): 'integer' and 'fraction' are presumably the two halves of the
# displayed score -- confirm against the page markup.
MovieRecord = namedtuple(
    'MovieRecord',
    ['rank', 'title', 'star', 'releasetime', 'integer', 'fraction'],
)
def nth_page(n, timeout=10):
    """Fetch page *n* of the Maoyan board and return the movies found on it.

    Args:
        n: 1-based page number; page ``n`` is requested with
            ``offset=10*(n-1)``.
        timeout: Seconds to wait for the HTTP response before giving up, so
            a stalled connection cannot block the script forever.

    Returns:
        A list of MovieRecord tuples, one per regex match in the response
        body, in document order. Every field is the raw captured string.
        The list is empty when nothing in the response matches.

    Raises:
        ValueError: If *n* is less than 1 (it would yield a negative offset).
    """
    if n < 1:
        raise ValueError(f'page number must be >= 1, got {n!r}')

    url = f'https://maoyan.com/board/4?offset={10*(n-1)}'
    # Verbose mode (re.X) ignores the layout whitespace below, which is why
    # the one literal space is escaped; re.S lets '.*?' span line breaks
    # between the tags. The '{}' placeholders are filled with the MovieRecord
    # field names, in order, to name the capture groups.
    pat = re.compile(r'''
        <i\ class="board-index.*?>(?P<{}>.*?)</i>
        .*?title="(?P<{}>.*?)"
        .*?star">\s*主演:(?P<{}>.*?)\s*</p>
        .*?releasetime">上映时间:(?P<{}>.*?)</p>
        .*?integer">(?P<{}>.*?)</i>
        .*?fraction">(?P<{}>.*?)</i>'''.format(*MovieRecord._fields), re.X | re.S)

    # Use the session as a context manager so its connections are released
    # even if the request raises.
    with HTMLSession() as session:
        req = session.get(url, timeout=timeout)

    # findall yields one tuple of group values per match, already in
    # MovieRecord field order.
    return [MovieRecord(*x) for x in pat.findall(req.text)]
# Script entry point: scrape every board page and print the combined table.
if __name__ == '__main__':
    movie_list = []
    max_pages = 10
    # Pages are 1-based and max_pages is inclusive, hence the +1; without it
    # the last page is never fetched and progress stops at 9/10.
    for i in range(1, max_pages + 1):
        movie_list.extend(nth_page(i))
        print(f'{i}/{max_pages} finished.')
    print(pd.DataFrame(movie_list, columns=MovieRecord._fields))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement