# LORD_2-3

from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import requests

# Download the HTML page that contains the book text.
url = "https://ae-lib.org.ua/texts-c/tolkien__the_lord_of_the_rings_1__en.htm"
# Use a context manager so the HTTP response is always closed, and a timeout
# so a stalled server cannot hang the script forever.
with urlopen(url, timeout=30) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")

# Remove all <script> and <style> elements so their contents do not leak
# into the extracted text.
for script in soup(["script", "style"]):
    script.extract()

# Extract the text of the page.
text = soup.get_text()

# Strip leading and trailing whitespace from every line.
lines = (line.strip() for line in text.splitlines())
# Split each line on double spaces so that multi-part lines become separate chunks.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop empty chunks and join the rest back together, one chunk per line.
text = '\n'.join(chunk for chunk in chunks if chunk)

# Get rid of the table of contents right away by cutting off the first
# 1100 characters of the cleaned text.
text = text[1100:]
# Chapter headings to look for in the text. Every heading has the form
# "Chapter <number>\n<name>"; the numbering restarts at 1 after the first
# twelve chapters.
first_part_names = (
    "A Long-expected Party",
    "The Shadow of the Past",
    "Three is Company",
    "A Short Cut to Mushrooms",
    "A Conspiracy Unmasked",
    "The Old Forest",
    "In the House of Tom Bombadil",
    "Fog on the Barrow-Downs",
    "At the Sign of The Prancing Pony",
    "Strider",
    "A Knife in the Dark",
    "Flight to the Ford",
)
second_part_names = (
    "Many Meetings",
    "The Council of Elrond",
    "The Ring Goes South",
    "A Journey in the Dark",
    "The Bridge of Khazad-dûm",
    "Lothlórien",
    "The Mirror of Galadriel",
    "Farewell to Lórien",
    "The Great River",
    "The Breaking of the Fellowship",
)
titles = [
    "Chapter {}\n{}".format(number, name)
    for part_names in (first_part_names, second_part_names)
    for number, name in enumerate(part_names, start=1)
]
# Find the start offset of every chapter heading in the book text. These
# offsets delimit the chapters for the per-chapter processing further below.
# str.find returns -1 for a heading that does not occur in the text.
indexes = [text.find(title) for title in titles]

# A heading that was not found cannot delimit a chapter: a -1 offset would be
# treated by later find/count calls as "one character before the end of the
# text" and silently corrupt the ranges of the neighbouring chapters.
found_chapters = [
    (title, idx) for title, idx in zip(titles, indexes) if idx != -1
]
if len(found_chapters) != len(titles):
    print("WARNING: chapter headings not found in the text:",
          [title for title, idx in zip(titles, indexes) if idx == -1])

# Map every located heading to its (start, end) offsets in the text. A chapter
# ends where the next located heading starts; the last one runs to the end of
# the text.
chapters_idx_in_text = {}
for position, (title, start) in enumerate(found_chapters):
    if position + 1 < len(found_chapters):
        end = found_chapters[position + 1][1]
    else:
        end = len(text)
    chapters_idx_in_text[title] = (start, end)
# Make an API request to get the names of all characters from "The Lord of the Rings".
url = 'https://the-one-api.dev/v2/character'
from requests.structures import CaseInsensitiveDict

headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
# NOTE(review): the bearer token is hard-coded in the source; consider loading
# it from the environment or a config file that is not committed.
headers["Authorization"] = "Bearer 7mbNq6GVWo0LiEQS7GCc"
# The timeout keeps the script from hanging forever on an unresponsive server.
get_characters = requests.get(url, headers=headers, timeout=30)
# Stop here with a clear message when the request did not succeed; otherwise
# characters_names would stay undefined and the code below would fail with an
# unrelated NameError.
if get_characters.status_code != requests.codes.ok:
    raise RuntimeError(
        "Character request failed with HTTP status {}".format(
            get_characters.status_code
        )
    )
characters_names = get_characters.json()

# Save the raw API response to characters.json.
with open("characters.json", 'w', encoding='utf-8') as fl:
    fl.write(json.dumps(characters_names, ensure_ascii=False, indent=4))

# Collect the "name" field of every entry under "docs" in the API response.
all_characters = [entry["name"] for entry in characters_names["docs"]]

# Keep only the names that occur as a substring somewhere in the book text.
heroes_in_text = [name for name in all_characters if name in text]

# Count, for every chapter, how many names from all_characters occur at least
# once between the chapter's start and end offsets. Chapters without a single
# match get no entry at all.
chapters_num_of_heroes = {}
for chapter_title, (start, end) in chapters_idx_in_text.items():
    mentioned = sum(
        1 for name in all_characters if text.find(name, start, end) != -1
    )
    if mentioned:
        chapters_num_of_heroes[chapter_title] = mentioned

# Build the list [(number of characters, chapter)] and sort it in place.
# Tuples compare by count first and by title second, so the last element is
# the chapter with the most characters.
rating_by_num_of_heroes = [
    (count, chapter_title)
    for chapter_title, count in chapters_num_of_heroes.items()
]
rating_by_num_of_heroes.sort()
answer_chapter = rating_by_num_of_heroes[-1]  # Answer to question 2

# (from_index, to_index) of the winning chapter inside the text string.
chapter_idx = chapters_idx_in_text[answer_chapter[1]]
from_, to = chapter_idx

# Count how many times each hero's name occurs in the text of that chapter --
# the answer to question 3. Names that never occur there are left out.
occurrences = ((hero, text.count(hero, from_, to)) for hero in heroes_in_text)
heroes_mentioned = {hero: cnt for hero, cnt in occurrences if cnt > 0}

# Assemble the final answer: chapter title, number of different characters,
# and the per-name occurrence counts.
answer = {
    "chapter title": answer_chapter[1],
    "number of different characters mentioned at the chapter": answer_chapter[0],
    "the number of occurrences of the name of the heroes in the text of the chapter": heroes_mentioned,
}

# Write the answer to disk as pretty-printed JSON.
with open("N2-3_FINAL.json", 'w', encoding='utf-8') as fh:
    serialized = json.dumps(answer, ensure_ascii=False, indent=4)
    fh.write(serialized)

print("FINISHED")