Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
import os
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
# Download the book page and reduce it to plain text, one non-empty phrase
# per line.
url = "https://ae-lib.org.ua/texts-c/tolkien__the_lord_of_the_rings_1__en.htm"
# Close the HTTP response deterministically and never wait forever on a
# stalled connection.
with urlopen(url, timeout=30) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")

# Remove all script and style elements so their contents do not end up in
# the extracted text.
for script in soup(["script", "style"]):
    script.extract()

text = soup.get_text()
# Break into lines and strip leading and trailing whitespace on each.
lines = (line.strip() for line in text.splitlines())
# Break multi-headlines (phrases separated by a double space) into a line
# each. Splitting on a single space would put every word on its own line,
# and the multi-word chapter titles and character names searched for later
# could then never be found.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop blank lines.
text = "\n".join(chunk for chunk in chunks if chunk)
# Drop the table of contents right away. The offset 1100 is a hard-coded
# character count taken from the original script.
text = text[1100:]
# Chapter headings as they are searched for in the cleaned text: the chapter
# number and the chapter name on consecutive lines. The numbering restarts
# at "Chapter 1" after the twelfth entry, so only the name makes an entry
# unique.
titles = [
    "Chapter 1\nA Long-expected Party",
    "Chapter 2\nThe Shadow of the Past",
    "Chapter 3\nThree is Company",
    "Chapter 4\nA Short Cut to Mushrooms",
    "Chapter 5\nA Conspiracy Unmasked",
    "Chapter 6\nThe Old Forest",
    "Chapter 7\nIn the House of Tom Bombadil",
    "Chapter 8\nFog on the Barrow-Downs",
    "Chapter 9\nAt the Sign of The Prancing Pony",
    "Chapter 10\nStrider",
    "Chapter 11\nA Knife in the Dark",
    "Chapter 12\nFlight to the Ford",
    "Chapter 1\nMany Meetings",
    "Chapter 2\nThe Council of Elrond",
    "Chapter 3\nThe Ring Goes South",
    "Chapter 4\nA Journey in the Dark",
    "Chapter 5\nThe Bridge of Khazad-dûm",
    "Chapter 6\nLothlórien",
    "Chapter 7\nThe Mirror of Galadriel",
    "Chapter 8\nFarewell to Lórien",
    "Chapter 9\nThe Great River",
    "Chapter 10\nThe Breaking of the Fellowship",
]
# Find the start and end index of every chapter in the book text.
# The later per-chapter processing relies on these spans.
indexes = [text.find(title) for title in titles]

# str.find returns -1 for a heading that is absent. Used as a slice bound,
# -1 would silently distort both that chapter's span and the previous
# chapter's end, so fail loudly instead.
missing_titles = [title for title, idx in zip(titles, indexes) if idx == -1]
if missing_titles:
    raise ValueError(
        "chapter titles not found in text: %r" % (missing_titles,)
    )

# Each chapter runs up to the start of the next one; the last chapter runs
# to the end of the text.
chapter_ends = indexes[1:] + [len(text)]
chapters_idx_in_text = {
    title: (start, end)
    for title, start, end in zip(titles, indexes, chapter_ends)
}
# Make an API request to get the names of all characters from
# "The Lord of the Rings".
from requests.structures import CaseInsensitiveDict

url = 'https://the-one-api.dev/v2/character'
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
# NOTE(review): an API token is committed in the source. It can be
# overridden through the THE_ONE_API_TOKEN environment variable; the literal
# is kept only as the backward-compatible default and should be rotated.
api_token = os.environ.get("THE_ONE_API_TOKEN", "7mbNq6GVWo0LiEQS7GCc")
headers["Authorization"] = "Bearer " + api_token

# Bound the wait for the server, and stop with a clear HTTP error on a
# failed request rather than hitting an undefined name further down.
get_characters = requests.get(url, headers=headers, timeout=30)
get_characters.raise_for_status()
characters_names = get_characters.json()

# Keep a copy of the raw API response on disk.
with open("characters.json", 'w', encoding='utf-8') as fl:
    json.dump(characters_names, fl, ensure_ascii=False, indent=4)

# The response keeps the character records under "docs"; only the names are
# needed.
all_characters = [one["name"] for one in characters_names["docs"]]
# Character names that occur at least once anywhere in the book text
# (plain substring match).
heroes_in_text = [hero for hero in all_characters if hero in text]
# Count how many DIFFERENT character names occur in each chapter.
# Deduplicate first: a name listed more than once in all_characters must
# still count as a single character.
unique_characters = set(all_characters)
chapters_num_of_heroes = {}
for chapter, (from_, to) in chapters_idx_in_text.items():
    num_of_heroes = sum(
        1
        for character in unique_characters
        if text.find(character, from_, to) != -1
    )
    # Chapters that mention no known character are left out of the mapping.
    if num_of_heroes:
        chapters_num_of_heroes[chapter] = num_of_heroes
# Sorted list of (number of characters, chapter title) pairs, ascending.
# Pairs with the same count are ordered by title.
rating_by_num_of_heroes = sorted(
    (num_of_heroes, chapter)
    for chapter, num_of_heroes in chapters_num_of_heroes.items()
)
if not rating_by_num_of_heroes:
    # Without this check the indexing below fails with a bare IndexError.
    raise ValueError("no known character is mentioned in any chapter")
answer_chapter = rating_by_num_of_heroes[-1]  # answer to question 2
# (from_index, to_index) of the winning chapter within the text string.
chapter_idx = chapters_idx_in_text[answer_chapter[1]]
from_, to = chapter_idx

# Count how many times each hero's name is mentioned in the text of the
# chapter -- the answer to question 3. Heroes with no mention are omitted.
heroes_mentioned = {}
for hero in heroes_in_text:
    cnt = text.count(hero, from_, to)
    if cnt > 0:
        heroes_mentioned[hero] = cnt

# Build the result only after the counts are complete, so it does not depend
# on a dict that is filled in later.
answer = {
    "chapter title": answer_chapter[1],
    "number of different characters mentioned at the chapter": answer_chapter[0],
    "the number of occurrences of the name of the heroes in the text of the chapter": heroes_mentioned,
}

# Write the answer.
with open("N2-3_FINAL.json", 'w', encoding='utf-8') as fh:
    json.dump(answer, fh, ensure_ascii=False, indent=4)
print("FINISHED")
Add Comment
Please, Sign In to add comment