# Pastebin paste "LORD_corrected" by Korotkodul, Oct 31st, 2021 (Python, 8.09 KB).
import json
import os
import unicodedata

import requests
from pdfminer.high_level import extract_text
from requests.structures import CaseInsensitiveDict
"""
To count how many characters are mentioned in each chapter, I downloaded the
text of the book "The Fellowship of the Ring" (Russian title: "Братство кольца").
The format is PDF.
"""
  8. #МетодЭ чтобы избавиться от странных букв
  9. letters = "qwertyuiopasdfghjklzxcvbnm"
  10. strange_letters = {}
  11. def recycle_text(string):
  12.     for i in range(len())
  13.    
  14. file = "j-r-r-tolkien-lord-of-the-rings-01-the-fellowship-of-the-ring-retail-pdf.pdf"
  15. text = extract_text(file)
  16. text = recycle(text)
  17. #Так пишутся названия глав в скачанном тексте:
  18. chapters_titles_in_pfd = [
  19.     "A LONG-EXPECTED PARTY",
  20.     "THE SHADOW OF THE PAST",
  21.     "THREE IS COMPANY",
  22.     "A SHORT CUT TO MUSHROOMS",
  23.     "A CONSPIRACY UNMASKED",
  24.     "THE OLD FOREST",
  25.     "IN THE HOUSE OF TOM BOMBADIL",
  26.     "FOG ON THE BARROW-DOWNS",
  27.     "AT THE SIGN OF THE PRANCING PONY",
  28.     "STRIDER",
  29.     "A KNIFE IN THE DARK",
  30.     "FLIGHT TO THE FORD", #Chapter 12
  31.     "MANY MEETINGS",
  32.     "THE COUNCIL OF ELROND",
  33.     "THE RING GOES SOUTH",
  34.     "A JOURNEY IN THE DARK",
  35.     "THE BRIDGE OF KHAZAD-DUˆ M",
  36.     "LOTHLO´ RIEN",
  37.     "THE MIRROR OF GALADRIEL",
  38.     "FAREWELL TO LO´ RIEN",
  39.     "THE GREAT RIVER",
  40.     "THE BREAKING OF THE FELLOWSHIP",
  41. ]
  42.  
  43. chapters_beginning = {
  44.     "A LONG-EXPECTED PARTY":
  45.     "When Mr. Bilbo Baggins of Bag End announced",
  46.  
  47.     "THE SHADOW OF THE PAST":
  48.     "The talk did not die down in nine or even ninety",
  49.  
  50.     "THREE IS COMPANY":
  51.     "You ought to go quietly, and you ought to go soon",
  52.  
  53.     "A SHORT CUT TO MUSHROOMS":
  54.     "In the morning Frodo woke refreshed.",
  55.  
  56.     "A CONSPIRACY UNMASKED":
  57.     "Now we had better get home ourselves",
  58.  
  59.     "THE OLD FOREST":
  60.     "Frodo woke suddenly. It was still dark in the room.",
  61.  
  62.     "IN THE HOUSE OF TOM BOMBADIL":
  63.     "The four hobbits stepped over the wide stone threshold",
  64.  
  65.     "FOG ON THE BARROW-DOWNS":
  66.     "That night they heard no noises.",
  67.  
  68.     "AT THE SIGN OF THE PRANCING PONY":
  69.     "Bree was the chief village of the Bree-land",
  70.  
  71.     "STRIDER":
  72.     "Frodo, Pippin, and Sam made their way back to the parlour.",
  73.  
  74.     "A KNIFE IN THE DARK":
  75.     "As they prepared for sleep in the inn at Bree, darkness lay",
  76.  
  77.     "FLIGHT TO THE FORD":
  78.     "When Frodo came to himself he was still clutching the Ring",
  79.  
  80.     "MANY MEETINGS":
  81.     "Frodo woke and found himself lying in bed.",
  82.  
  83.     "THE COUNCIL OF ELROND":
  84.     "Next day Frodo woke early, feeling refreshed and well.",
  85.  
  86.     "THE RING GOES SOUTH":
  87.     "Later that day the hobbits held a meeting of their own",
  88.  
  89.     "A JOURNEY IN THE DARK":
  90.     "It was evening, and the grey light was again waning fast",
  91.  
  92.     "THE BRIDGE OF KHAZAD-DUˆ M":
  93.     "The Company of the Ring stood silent beside the tomb of",
  94.  
  95.     "LOTHLO´ RIEN":
  96.     "Alas! I fear we cannot stay here longer",
  97.  
  98.     "THE MIRROR OF GALADRIEL":
  99.     "The sun was sinking behind the mountains, and the shadows",
  100.  
  101.     "FAREWELL TO LO´ RIEN":
  102.     "and there the Lord and Lady greeted",
  103.  
  104.     "THE GREAT RIVER":
  105.     "Frodo was roused by Sam. He found that he was lying",
  106.  
  107.     "THE BREAKING OF THE FELLOWSHIP":
  108.     "Aragorn led them to the right arm of the River. Here upon"
  109. }
  110.  
  111. """
  112. В цикле НИЖЕ программа находит индексы начала и конца каждой главы в тексте книги.
  113. Это необходимо для дальнейшей обработки текста.
  114. Ниже этого цикла есть готовый список indexes с индексами начала  каждой главы."""
  115. """indexes = []
  116. for title in chapters_titles_in_pfd:
  117.    beginning = chapters_beginning[title]
  118.    idx = text.find(beginning)
  119.    indexes.append(idx)
  120. print(indexes)"""
  121. #[81878, 136751, 196408, 248094, 279204, 306929, 342151, 372128, 408266, 442252, 473158, 524145, 571240, 619854, 705847, 762924, 824744, 853924, 904185, 940415, 973280, 1011852]
  122. indexes = [81878, 136751, 196408, 248094, 279204, 306929, 342151, 372128, 408266, 442252, 473158, 524145, 571240, 619854, 705847, 762924, 824744, 853924, 904185, 940415, 973280, 1011852]
  123.  
  124. chapters_idx_in_text = {}
  125. for i in range(0, len(chapters_titles_in_pfd)):
  126.     title = chapters_titles_in_pfd[i]
  127.     if i == len(chapters_titles_in_pfd)-1:
  128.         chapters_idx_in_text[title] = (indexes[i], len(text))
  129.     else:
  130.         chapters_idx_in_text[title] = (indexes[i], indexes[i + 1])
  131.  
  132.  
  133. #Делаем API запрос, чтобы узнать все имена персонажей из "Вдастелина колец"
  134. url = 'https://the-one-api.dev/v2/character'
  135. from requests.structures import CaseInsensitiveDict
  136. headers = CaseInsensitiveDict()
  137. headers["Accept"] = "application/json"
  138. headers["Authorization"] = "Bearer 7mbNq6GVWo0LiEQS7GCc"
  139. get_characters = requests.get(url, headers=headers)
  140. if get_characters.status_code == requests.codes.ok:
  141.     api_characters = get_characters.json()
  142.  
  143.  
  144. with open("characters.json", 'w', encoding = 'utf-8') as fl:
  145.     fl.write(json.dumps(api_characters, ensure_ascii=False,indent=4))
  146. all_characters = []
  147. for one in api_characters["docs"]:
  148.     one = recycle(one) #избавиться от "странных букв"
  149.     all_characters.append(one["name"])
  150.  
  151.  
  152.  
  153. #debug
  154. all_heroes = {}
  155. for i in range(len(all_characters)):
  156.     all_heroes[i] = all_characters[i]
  157.  
  158. heroes_in_text = []
  159. for hero in all_characters:
  160.     if hero in text:
  161.         heroes_in_text.append(hero)
  162. in_txt = {}
  163. for i in range(len(heroes_in_text)):
  164.     in_txt[i] = heroes_in_text[i]
  165. with open("heroes_text.json", 'w', encoding = 'utf-8') as fh:
  166.     fh.write(json.dumps(in_txt, ensure_ascii = False, indent = 4))
  167. with open("all_heroes.json", 'w', encoding = 'utf-8') as fh:
  168.     fh.write(json.dumps(all_heroes, ensure_ascii = False, indent = 4))
  169. #debug finish
  170.  
  171. #Считаем количество имён РАЗЛИЧНЫХ персонажей, которые встречаются в каждой главе
  172. chapters_num_of_heroes = {}
  173. for chapter in chapters_idx_in_text:
  174.     from_to = chapters_idx_in_text[chapter] #from_to_in_text
  175.     from_ = from_to[0]
  176.     to = from_to[1]
  177.     for character in all_characters:
  178.         if text.find(character, from_, to)  != -1:
  179.             if chapter in chapters_num_of_heroes:
  180.                 chapters_num_of_heroes[chapter] += 1
  181.             else:
  182.                 chapters_num_of_heroes[chapter] = 1
  183. rating_by_num_of_heroes = []
  184. for chapter in chapters_num_of_heroes:
  185.     rating_by_num_of_heroes.append((chapters_num_of_heroes[chapter], chapter))
  186.  
  187. #Сортируем список [ (кол-во персонажей, глава) ]
  188. rating_by_num_of_heroes = sorted(rating_by_num_of_heroes)
  189. answer_chapter = rating_by_num_of_heroes[-1]#Ответ на вопрос 2
  190.  
  191.  
  192.  
  193. heroes_mentioned = {}
  194. answer = {"chapter title": answer_chapter[1],"number of different characters mentioned at the chapter": answer_chapter[0], "the number of occurrences of the name of the heroes in the text of the chapter": heroes_mentioned}
  195. chapter_idx = chapters_idx_in_text[answer_chapter[1]] #(from_index, to_index) in text-string
  196. from_ = chapter_idx[0]
  197. to = chapter_idx[1]
  198.  
  199. #debug
  200. debug = {"chapter": text[chapter_idx[0]:chapter_idx[1]],
  201.          "start": text[chapter_idx[0]:chapter_idx[0]+500],
  202.          "end": text[chapter_idx[1] - 1000:chapter_idx[1]]
  203.          }
  204. print(type(text[chapter_idx[0]:chapter_idx[1]]))
  205. with open("chapter.json", 'w', encoding = 'utf-8') as fh:
  206.     fh.write(json.dumps(debug, ensure_ascii=False, indent = 1))
  207. #debug finish
  208.  
  209.  
  210. #Считаем, сколько раз имя каждого героя упоминалось в тексте главы -- ответ на вопрос 3
  211. for hero in all_characters:
  212.     cnt = text.count(hero, from_, to)
  213.     if cnt > 0:
  214.         heroes_mentioned[hero] = cnt
  215.  
  216. #Записываем ответ
  217. with open("N2-3.json", 'w', encoding = 'utf-8') as fh:
  218.     fh.write(json.dumps(answer, ensure_ascii=False, indent = 4))
  219.  
  220. print("FINISHED")
  221.  
# End of paste (Pastebin page footer: advertisement and comment sign-in prompt).