Korotkodul

LORD_2-3

Oct 31st, 2021 (edited)
99
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.11 KB | None | 0 0
  1. from urllib.request import urlopen
  2. from bs4 import BeautifulSoup
  3. import json
  4. import requests
  5.  
  6. url = "https://ae-lib.org.ua/texts-c/tolkien__the_lord_of_the_rings_1__en.htm"
  7. html = urlopen(url).read()
  8. soup = BeautifulSoup(html, features="html.parser")
  9.  
  10. # kill all script and style elements
  11. for script in soup(["script", "style"]):
  12.     script.extract()    # rip it out
  13.  
  14. # get text
  15. text = soup.get_text()
  16.  
  17. # break into lines and remove leading and trailing space on each
  18. lines = (line.strip() for line in text.splitlines())
  19. # break multi-headlines into a line each
  20. chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
  21. # drop blank lines
  22. text = '\n'.join(chunk for chunk in chunks if chunk)
  23.  
  24. text = text[1100::] #Сразу избавляемся от оглавления
  25. titles = [
  26.     "Chapter 1\nA Long-expected Party",
  27.  
  28.     "Chapter 2\nThe Shadow of the Past",
  29.  
  30.     "Chapter 3\nThree is Company",
  31.  
  32.     "Chapter 4\nA Short Cut to Mushrooms",
  33.  
  34.     "Chapter 5\nA Conspiracy Unmasked",
  35.  
  36.     "Chapter 6\nThe Old Forest",
  37.  
  38.     "Chapter 7\nIn the House of Tom Bombadil",
  39.  
  40.     "Chapter 8\nFog on the Barrow-Downs",
  41.  
  42.     "Chapter 9\nAt the Sign of The Prancing Pony",
  43.  
  44.     "Chapter 10\nStrider",
  45.  
  46.     "Chapter 11\nA Knife in the Dark",
  47.  
  48.     "Chapter 12\nFlight to the Ford",
  49.  
  50.     "Chapter 1\nMany Meetings",
  51.  
  52.     "Chapter 2\nThe Council of Elrond",
  53.  
  54.     "Chapter 3\nThe Ring Goes South",
  55.  
  56.     "Chapter 4\nA Journey in the Dark",
  57.  
  58.     "Chapter 5\nThe Bridge of Khazad-dûm",
  59.  
  60.     "Chapter 6\nLothlórien",
  61.  
  62.     "Chapter 7\nThe Mirror of Galadriel",
  63.  
  64.     "Chapter 8\nFarewell to Lórien",
  65.  
  66.     "Chapter 9\nThe Great River",
  67.  
  68.     "Chapter 10\nThe Breaking of the Fellowship",
  69.  
  70. ]
  71. """
  72. В цикле НИЖЕ программа находит индексы начала и конца каждой главы в тексте книги.
  73. Это необходимо для дальнейшей обработки текста.
  74. """
  75. indexes = []
  76. for title in titles:
  77.     idx = text.find(title)
  78.     indexes.append(idx)
  79. #print(indexes)
  80.  
  81. chapters_idx_in_text = {}
  82. for i in range(0, len(titles)):
  83.     title = titles[i]
  84.     if i == len(titles)-1:
  85.         chapters_idx_in_text[title] = (indexes[i], len(text))
  86.     else:
  87.         chapters_idx_in_text[title] = (indexes[i], indexes[i + 1])
  88. # Делаем API запрос, чтобы узнать все имена персонажей из "Вдастелина колец"
  89. url = 'https://the-one-api.dev/v2/character'
  90. from requests.structures import CaseInsensitiveDict
  91.  
  92. headers = CaseInsensitiveDict()
  93. headers["Accept"] = "application/json"
  94. headers["Authorization"] = "Bearer 7mbNq6GVWo0LiEQS7GCc"
  95. get_characters = requests.get(url, headers=headers)
  96. #print(get_characters.status_code)
  97. if get_characters.status_code == requests.codes.ok:
  98.     characters_names = get_characters.json()
  99.  
  100. with open("characters.json", 'w', encoding='utf-8') as fl:
  101.     fl.write(json.dumps(characters_names, ensure_ascii=False, indent=4))
  102.  
  103. all_characters = []
  104. for one in characters_names["docs"]:
  105.     all_characters.append(one["name"])
  106.  
  107. heroes_in_text = []
  108. for hero in all_characters:
  109.     if hero in text:
  110.         heroes_in_text.append(hero)
  111.  
  112. # Считаем количество имён РАЗЛИЧНЫХ персонажей, которые встречаются в каждой главе
  113. chapters_num_of_heroes = {}
  114. for chapter in chapters_idx_in_text:
  115.     from_to = chapters_idx_in_text[chapter]  # from_to_in_text
  116.     from_ = from_to[0]
  117.     to = from_to[1]
  118.     for character in all_characters:
  119.         if text.find(character, from_, to) != -1:
  120.             if chapter in chapters_num_of_heroes:
  121.                 chapters_num_of_heroes[chapter] += 1
  122.             else:
  123.                 chapters_num_of_heroes[chapter] = 1
  124. rating_by_num_of_heroes = []
  125. for chapter in chapters_num_of_heroes:
  126.     rating_by_num_of_heroes.append((chapters_num_of_heroes[chapter], chapter))
  127.  
  128. # Сортируем список [ (кол-во персонажей, глава) ]
  129. rating_by_num_of_heroes = sorted(rating_by_num_of_heroes)
  130. answer_chapter = rating_by_num_of_heroes[-1]  # Ответ на вопрос 2
  131. #print(answer_chapter)
  132.  
  133. heroes_mentioned = {}
  134. answer = {"chapter title": answer_chapter[1],
  135.           "number of different characters mentioned at the chapter": answer_chapter[0],
  136.           "the number of occurrences of the name of the heroes in the text of the chapter": heroes_mentioned}
  137. chapter_idx = chapters_idx_in_text[answer_chapter[1]]  # (from_index, to_index) in text-string
  138. #print(chapter_idx)
  139. from_ = chapter_idx[0]
  140. to = chapter_idx[1]
  141. #print(text[50567-50:50567+200])
  142. #print(text[to-200:to])
  143.  
  144. # Считаем, сколько раз имя каждого героя упоминалось в тексте главы -- ответ на вопрос 3
  145. for hero in heroes_in_text:
  146.     cnt = text.count(hero, from_, to)
  147.     if cnt > 0:
  148.         heroes_mentioned[hero] = cnt
  149.  
  150. # Записываем ответ
  151. with open("N2-3_FINAL.json", 'w', encoding='utf-8') as fh:
  152.     fh.write(json.dumps(answer, ensure_ascii=False, indent=4))
  153.  
  154. print("FINISHED")
Add Comment
Please, Sign In to add comment