Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
import os
import unicodedata

import requests
from pdfminer.high_level import extract_text
"""
To count how many characters are mentioned in each chapter, I downloaded the
text of the book "The Fellowship of the Ring" (Russian title: "Братство кольца").
The format is PDF.
"""
# Helper for getting rid of "strange" (accented) letters.
# Plain letters that a cleaned-up character is allowed to turn into.
letters = "qwertyuiopasdfghjklzxcvbnm"
# Explicit overrides: {strange character: replacement string}. Looked up
# before the automatic accent stripping; empty by default.
strange_letters = {}


def recycle_text(string):
    """Return *string* with accented Latin letters replaced by plain ones.

    NOTE(review): the original body was an unfinished stub, so this is the
    reviewer's reading of the intent ("get rid of strange letters") -- confirm.

    Every character is handled on its own:

    * a character listed in ``strange_letters`` becomes its mapped value;
    * a character whose canonical Unicode decomposition starts with one of
      the plain ``letters`` (in either case) becomes that base letter;
    * anything else is kept as is.

    Without overrides the result has the same length as the input, so
    character offsets into the text remain valid.

    The script also passes whole API records to this helper, so dicts and
    lists are walked recursively and returned as cleaned copies; values of
    any other type are returned unchanged.
    """
    if isinstance(string, dict):
        return {key: recycle_text(value) for key, value in string.items()}
    if isinstance(string, list):
        return [recycle_text(item) for item in string]
    if not isinstance(string, str):
        return string

    cleaned = []
    for char in string:
        if char in strange_letters:
            cleaned.append(strange_letters[char])
            continue
        # The first code point of the NFD form is the base letter; any
        # accents follow it as separate combining marks.
        base = unicodedata.normalize("NFD", char)[0]
        if base != char and base.lower() in letters:
            cleaned.append(base)
        else:
            cleaned.append(char)
    return "".join(cleaned)


# The rest of the script calls the helper by this shorter name.
recycle = recycle_text
# Source PDF of the book; the relative path is resolved against the
# current working directory.
file = "j-r-r-tolkien-lord-of-the-rings-01-the-fellowship-of-the-ring-retail-pdf.pdf"
# Whole book as one string.
text = extract_text(file)
# Clean up "strange" letters. The helper is defined as `recycle_text`;
# the original call used the undefined name `recycle`.
text = recycle_text(text)
# One row per chapter, in book order: (title, opening words).
# Titles are spelled the way they are written in the downloaded text, which
# is why some of them carry a detached accent mark followed by a space.
# The opening words are the phrase used to locate each chapter in the text.
# Keeping both in one table means a title is typed only once.
_CHAPTERS = [
    ("A LONG-EXPECTED PARTY",
     "When Mr. Bilbo Baggins of Bag End announced"),
    ("THE SHADOW OF THE PAST",
     "The talk did not die down in nine or even ninety"),
    ("THREE IS COMPANY",
     "You ought to go quietly, and you ought to go soon"),
    ("A SHORT CUT TO MUSHROOMS",
     "In the morning Frodo woke refreshed."),
    ("A CONSPIRACY UNMASKED",
     "Now we had better get home ourselves"),
    ("THE OLD FOREST",
     "Frodo woke suddenly. It was still dark in the room."),
    ("IN THE HOUSE OF TOM BOMBADIL",
     "The four hobbits stepped over the wide stone threshold"),
    ("FOG ON THE BARROW-DOWNS",
     "That night they heard no noises."),
    ("AT THE SIGN OF THE PRANCING PONY",
     "Bree was the chief village of the Bree-land"),
    ("STRIDER",
     "Frodo, Pippin, and Sam made their way back to the parlour."),
    ("A KNIFE IN THE DARK",
     "As they prepared for sleep in the inn at Bree, darkness lay"),
    ("FLIGHT TO THE FORD",  # Chapter 12
     "When Frodo came to himself he was still clutching the Ring"),
    ("MANY MEETINGS",
     "Frodo woke and found himself lying in bed."),
    ("THE COUNCIL OF ELROND",
     "Next day Frodo woke early, feeling refreshed and well."),
    ("THE RING GOES SOUTH",
     "Later that day the hobbits held a meeting of their own"),
    ("A JOURNEY IN THE DARK",
     "It was evening, and the grey light was again waning fast"),
    ("THE BRIDGE OF KHAZAD-DUˆ M",
     "The Company of the Ring stood silent beside the tomb of"),
    ("LOTHLO´ RIEN",
     "Alas! I fear we cannot stay here longer"),
    ("THE MIRROR OF GALADRIEL",
     "The sun was sinking behind the mountains, and the shadows"),
    ("FAREWELL TO LO´ RIEN",
     "and there the Lord and Lady greeted"),
    ("THE GREAT RIVER",
     "Frodo was roused by Sam. He found that he was lying"),
    ("THE BREAKING OF THE FELLOWSHIP",
     "Aragorn led them to the right arm of the River. Here upon"),
]

# Chapter titles in book order.
chapters_titles_in_pfd = [title for title, _ in _CHAPTERS]

# Chapter title -> opening words of that chapter.
chapters_beginning = dict(_CHAPTERS)
# Find where every chapter starts and ends in the book text; the rest of
# the script works on these ranges.
#
# The start of a chapter is the position of its opening words. The offsets
# are computed on every run rather than hard-coded, so they cannot drift
# from the text actually loaded. For reference, the author's copy of the
# PDF gave:
# [81878, 136751, 196408, 248094, 279204, 306929, 342151, 372128, 408266,
#  442252, 473158, 524145, 571240, 619854, 705847, 762924, 824744, 853924,
#  904185, 940415, 973280, 1011852]
indexes = []
for title in chapters_titles_in_pfd:
    beginning = chapters_beginning[title]
    idx = text.find(beginning)
    if idx == -1:
        # A -1 here would silently produce nonsense chapter ranges below.
        raise ValueError(
            "Opening words of chapter {!r} were not found in the text".format(title)
        )
    indexes.append(idx)

# Chapters are listed in book order, so their starts must grow; otherwise
# some opening phrase matched in the wrong place.
if indexes != sorted(indexes):
    raise ValueError("Chapter start offsets are not in book order: {}".format(indexes))

# A chapter ends where the next one starts; the last one runs to the end
# of the text.
chapter_ends = indexes[1:] + [len(text)]

# Chapter title -> (start offset, end offset) in `text`, end exclusive.
chapters_idx_in_text = {
    title: (start, end)
    for title, start, end in zip(chapters_titles_in_pfd, indexes, chapter_ends)
}
# Ask the API for the names of all "The Lord of the Rings" characters.
url = 'https://the-one-api.dev/v2/character'
from requests.structures import CaseInsensitiveDict
headers = CaseInsensitiveDict()
headers["Accept"] = "application/json"
# NOTE(review): an access token is committed in the source. It is kept as
# the fallback so existing runs behave the same, but it should be revoked
# and supplied through the THE_ONE_API_TOKEN environment variable instead.
headers["Authorization"] = "Bearer " + os.environ.get(
    "THE_ONE_API_TOKEN", "7mbNq6GVWo0LiEQS7GCc"
)
# The timeout keeps the script from hanging forever on a stalled connection.
get_characters = requests.get(url, headers=headers, timeout=30)
if get_characters.status_code != requests.codes.ok:
    # Without a successful response there is no character list to work
    # with; stop here instead of failing later on an undefined name.
    raise RuntimeError(
        "Character request failed with HTTP status {}".format(
            get_characters.status_code
        )
    )
api_characters = get_characters.json()

# Keep the raw response on disk for inspection.
with open("characters.json", 'w', encoding='utf-8') as fl:
    json.dump(api_characters, fl, ensure_ascii=False, indent=4)

# Character names, cleaned of "strange letters" the same way as the text.
# Only the name is cleaned; the original passed the whole record to the
# undefined name `recycle`.
all_characters = [recycle_text(one["name"]) for one in api_characters["docs"]]
# debug
# Position in the API list -> character name.
all_heroes = dict(enumerate(all_characters))

# Names that occur anywhere in the book text (plain substring match).
heroes_in_text = [hero for hero in all_characters if hero in text]

# Position in the filtered list -> character name.
in_txt = dict(enumerate(heroes_in_text))

with open("heroes_text.json", 'w', encoding='utf-8') as fh:
    json.dump(in_txt, fh, ensure_ascii=False, indent=4)
with open("all_heroes.json", 'w', encoding='utf-8') as fh:
    json.dump(all_heroes, fh, ensure_ascii=False, indent=4)
# debug finish
# Count how many DIFFERENT character names occur in each chapter.
# The names are de-duplicated first, so a name repeated in the API list is
# counted once. Empty names are dropped because an empty string is "found"
# at every position.
unique_names = {name for name in all_characters if name}

# Chapter title -> number of different names found in it. Chapters in
# which no name occurs get no entry.
chapters_num_of_heroes = {}
for chapter, (from_, to) in chapters_idx_in_text.items():
    # str.find with bounds searches the chapter without copying its text.
    found = sum(1 for name in unique_names if text.find(name, from_, to) != -1)
    if found:
        chapters_num_of_heroes[chapter] = found
if not chapters_num_of_heroes:
    # Nothing to rank; fail with a clear message rather than an IndexError.
    raise ValueError("No character name was found in any chapter")

# List of (number of characters, chapter title), sorted ascending; ties on
# the number are ordered by title.
rating_by_num_of_heroes = sorted(
    (count, chapter) for chapter, count in chapters_num_of_heroes.items()
)
# Answer to question 2: the last entry has the most characters.
answer_chapter = rating_by_num_of_heroes[-1]

# Filled in later by the per-name occurrence count. `answer` keeps a
# reference to this same dict, so those updates show up in `answer`.
heroes_mentioned = {}
answer = {
    "chapter title": answer_chapter[1],
    "number of different characters mentioned at the chapter": answer_chapter[0],
    "the number of occurrences of the name of the heroes in the text of the chapter": heroes_mentioned,
}

# (from_index, to_index) of the winning chapter inside the text string.
chapter_idx = chapters_idx_in_text[answer_chapter[1]]
from_, to = chapter_idx
# debug
debug_from, debug_to = chapter_idx
debug = {
    "chapter": text[debug_from:debug_to],
    # First 500 and last 1000 characters of the chapter. Both slices are
    # clamped to the chapter, so a short chapter cannot pull in text from
    # its neighbours or wrap around through a negative index.
    "start": text[debug_from:min(debug_to, debug_from + 500)],
    "end": text[max(debug_from, debug_to - 1000):debug_to],
}
print(type(text[debug_from:debug_to]))
with open("chapter.json", 'w', encoding='utf-8') as fh:
    json.dump(debug, fh, ensure_ascii=False, indent=1)
# debug finish
# Count how many times each character's name occurs in the chapter text --
# the answer to question 3.
for hero in all_characters:
    if not hero:
        # str.count("") reports len + 1 "occurrences"; skip empty names.
        continue
    cnt = text.count(hero, from_, to)
    if cnt > 0:
        # `answer` holds this same dict, so it sees the counts.
        heroes_mentioned[hero] = cnt

# Write the answer.
with open("N2-3.json", 'w', encoding='utf-8') as fh:
    json.dump(answer, fh, ensure_ascii=False, indent=4)
print("FINISHED")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement