# LORD_4

import json
from bisect import bisect_left
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

url = "https://ae-lib.org.ua/texts-c/tolkien__the_lord_of_the_rings_1__en.htm"
# Download the page. The context manager closes the HTTP response as soon as
# the body has been read instead of leaving the connection open.
with urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, features="html.parser")

# Remove all <script> and <style> elements so only readable text is left.
for script in soup(["script", "style"]):
    script.extract()

# Raw page text and its whitespace-separated tokens. Note that txt_list is
# built from the raw text, i.e. before the clean-up and trimming below.
text = soup.get_text()
txt_list = text.split()

# Strip leading and trailing whitespace from every line.
lines = (line.strip() for line in text.splitlines())
# Break multi-headlines (separated by two spaces) into one chunk each.
chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
# Drop blank chunks and put every remaining chunk on its own line.
text = '\n'.join(chunk for chunk in chunks if chunk)

# Skip the table of contents at the start of the text.
# NOTE(review): 1100 is a fixed character offset that depends on the current
# page layout; confirm it still lands after the table of contents.
text = text[1100::]
# Chapter headings in reading order. Each entry is the chapter number line and
# the chapter name line joined by a newline. The numbering restarts at 1 for
# the second group of chapters.
titles = [
    # First group: chapters 1-12.
    "Chapter 1\nA Long-expected Party",
    "Chapter 2\nThe Shadow of the Past",
    "Chapter 3\nThree is Company",
    "Chapter 4\nA Short Cut to Mushrooms",
    "Chapter 5\nA Conspiracy Unmasked",
    "Chapter 6\nThe Old Forest",
    "Chapter 7\nIn the House of Tom Bombadil",
    "Chapter 8\nFog on the Barrow-Downs",
    "Chapter 9\nAt the Sign of The Prancing Pony",
    "Chapter 10\nStrider",
    "Chapter 11\nA Knife in the Dark",
    "Chapter 12\nFlight to the Ford",
] + [
    # Second group: chapters 1-10.
    "Chapter 1\nMany Meetings",
    "Chapter 2\nThe Council of Elrond",
    "Chapter 3\nThe Ring Goes South",
    "Chapter 4\nA Journey in the Dark",
    "Chapter 5\nThe Bridge of Khazad-dûm",
    "Chapter 6\nLothlórien",
    "Chapter 7\nThe Mirror of Galadriel",
    "Chapter 8\nFarewell to Lórien",
    "Chapter 9\nThe Great River",
    "Chapter 10\nThe Breaking of the Fellowship",
]
"""
The loops BELOW find the start and end index of every chapter in the book text.
This is needed for further processing of the text.
After the first loop, the list `indexes` holds the start index of every chapter."""
# Start offset of every chapter heading inside `text` (-1 when str.find does
# not locate the heading).
indexes = [text.find(heading) for heading in titles]

# A chapter ends where the next one starts; the last chapter runs to the end
# of the text.
chapter_ends = indexes[1:] + [len(text)]

# Maps each chapter heading to its (start, end) offsets in `text`.
chapters_idx_in_text = {
    heading: (start, end)
    for heading, start, end in zip(titles, indexes, chapter_ends)
}

#print(chapters_idx_in_text)
# Ask the-one-api.dev for the list of characters.
url = 'https://the-one-api.dev/v2/character'
from requests.structures import CaseInsensitiveDict
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
# NOTE(review): the bearer token is hard-coded in the source. Consider loading
# it from the environment or a config file and rotating this value.
headers["Authorization"] = "Bearer 7mbNq6GVWo0LiEQS7GCc"
get_characters = requests.get(url, headers=headers)
# Fail right here with the HTTP error. Previously an unsuccessful response was
# skipped silently, api_characters stayed undefined and the script crashed
# later with an unrelated NameError.
get_characters.raise_for_status()
api_characters = get_characters.json()

# Collect the name of every character listed in the API response.
all_characters = [one["name"] for one in api_characters["docs"]]

# Keep only the names that occur in the text. A name counts as present when it
# is exactly equal to one element of txt_list. The set is built once so each
# lookup is O(1) instead of a scan over the whole word list.
known_words = set(txt_list)
heroes_in_text = [hero for hero in all_characters if hero in known_words]


#We call a HERO INDEX every position at which the hero's name occurs in txt_list -- the list of all whitespace-separated words
#For example:
# hero name -- 'a'
#txt_list = ['a','b','c','a']
#Then 0 and 3 are the hero's INDEXES


#Functions
# Distance between two indexes.
def dst(a, b):
    """Return the non-negative difference between the indexes ``a`` and ``b``."""
    return max(a - b, b - a)


# Distance from one index of hero A to the nearest index of hero B.
def binP_dst(x, array):
    """Return the distance from ``x`` to the closest value in ``array``.

    The previous hand-written binary search moved both bounds on every step
    and could jump over the closest element (for example ``x = 0`` in
    ``[0, 1, ..., 8]`` returned 2). The search is now done with
    ``bisect_left``.

    Args:
        x: The index to measure from.
        array: Non-empty list of indexes sorted in ascending order.

    Returns:
        The smallest ``abs(x - y)`` over all ``y`` in ``array``.

    Raises:
        IndexError: If ``array`` is empty.
    """
    if not array:
        raise IndexError("binP_dst() needs a non-empty list of indexes")
    # First position whose value is >= x; the nearest value is either that
    # element or the one immediately before it.
    pos = bisect_left(array, x)
    if pos == 0:
        return array[0] - x
    if pos == len(array):
        return x - array[-1]
    return min(x - array[pos - 1], array[pos] - x)


# Maps a hero name to the list of its indexes in txt_list; starts out empty.
heroes_idx = dict()


# Minimum distance between the names of two heroes in the text.
def min_dist(hero_A, hero_B):
    """Return the smallest distance between any index of the two heroes.

    Both names are looked up in the module-level ``heroes_idx`` mapping. For
    every index of ``hero_A`` the distance to ``hero_B`` is obtained from
    ``binP_dst`` and the smallest result is kept.

    Returns ``pow(10, 20)`` when ``hero_A`` has no indexes at all.
    """
    positions_a = heroes_idx[hero_A]
    positions_b = heroes_idx[hero_B]
    best = pow(10, 20)  # sentinel larger than any realistic distance
    for position in positions_a:
        best = min(best, binP_dst(position, positions_b))
    return best


# List of all indexes of one character in the word list.
def get_idx(hero_name, words=None):
    """Return every position at which ``hero_name`` occurs in a word list.

    The list is scanned once. The previous version re-sliced and re-searched
    the list after every hit, which is quadratic for frequent names.

    Args:
        hero_name: The word to look for; elements are compared with ``==``.
        words: List to search. Defaults to the module-level ``txt_list``.

    Returns:
        The matching positions in ascending order (empty if there are none).
    """
    if words is None:
        words = txt_list
    return [position for position, word in enumerate(words) if word == hero_name]


# Compute all indexes for every hero found in the text.
for hero in heroes_in_text:
    heroes_idx[hero] = get_idx(hero)

# heroes_dist[a][b] is the minimum distance between heroes a and b.
heroes_dist = {hero: {} for hero in heroes_in_text}

for hero_A in heroes_idx:
    for hero_B in heroes_idx:
        # Skip self-pairs and pairs already filled in from the mirrored
        # iteration. The old check looked the *name* up in a list of integer
        # indexes, which never matched, so every pair was computed twice.
        if hero_A == hero_B or hero_B in heroes_dist[hero_A]:
            continue
        minimum_dist = min_dist(hero_A, hero_B)
        heroes_dist[hero_A][hero_B] = minimum_dist
        heroes_dist[hero_B][hero_A] = minimum_dist


# Save the distance table as pretty-printed JSON, keeping non-ASCII names
# readable.
with open("heroes_dist.json", mode='w', encoding='utf-8') as out_file:
    serialized = json.dumps(heroes_dist, ensure_ascii=False, indent=4)
    out_file.write(serialized)

# Two characters are treated as unrelated when more than N words lie between
# them. N is the arithmetic-mean number of words per chapter divided by 80.
words_per_chapter = len(txt_list) / len(titles)
N = words_per_chapter / 80
print("N = ", N)
# Rationale for the choice of N (task 4).
# Call two characters "connected" if they take part together in at least one
# episode. A chapter is a unit of meaning in which a certain number of episodes
# take place. So if two characters appear in the same chapter, there is some
# chance that they take part together in some episode.
#
# To reflect the connections between characters as accurately as possible, the
# average chapter size is divided by 80, on the assumption that one chapter may
# contain up to 80 episodes. After dividing by 80, the probability that two
# characters take part in the same episode is considerably higher.
#
# This gives N = 105 (approximately).
# Larger values of N are best avoided: the larger N is, the more tangled and
# visually unclear the graph becomes.

# Build the graph: heroes are nodes, and two heroes are linked when the
# minimum distance between their names does not exceed N.
import networkx as nx
import matplotlib.pyplot as plt
from random import randint
G = nx.Graph()
# Unordered hero pairs that already have an edge. A set of frozensets replaces
# the old concatenated-name key, which could collide for different pairs
# ("Ab" + "c" == "A" + "bc") and needed a linear list scan per lookup.
marked_edges = set()
colors = ['red', 'green', 'pink',  'brown', 'yellow', 'purple', 'blue']

for hero_A in heroes_in_text:
    for hero_B in heroes_dist[hero_A]:
        distance = heroes_dist[hero_A][hero_B]
        if distance > N:
            continue
        pair = frozenset((hero_A, hero_B))
        if pair not in marked_edges:
            marked_edges.add(pair)
            G.add_edge(hero_A, hero_B, weight=distance)

# Cycle through the palette so neighbouring items get different colours.
# Edge colours: one entry per EDGE. The old loop iterated over G, which yields
# nodes, so the list length did not match the number of edges.
color_map = [colors[i % len(colors)] for i in range(G.number_of_edges())]
# Node colours: one entry per node, to make the heroes easier to tell apart.
color_map_2 = [colors[i % len(colors)] for i in range(G.number_of_nodes())]

# Differently coloured edges make it easier to follow the link between two
# characters.
nx.draw(G, with_labels=True, node_color=color_map_2, edge_color=color_map)
# To save the picture to a file, call plt.savefig("N4_22.png") before show().
plt.show()
print("FINISHED")