Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import nltk
- from nltk.tokenize import sent_tokenize, word_tokenize
# Fetch the NLTK resources first: the tokenizers used below load the Punkt
# models and fail with a LookupError when those have not been downloaded yet.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Small demo: split a sample text into sentences, then print the word tokens
# of every sentence.
prim = "She’s swept the floor. She’s very kind to me."
tok = sent_tokenize(prim)
print(tok)
for elem in tok:
    temp = word_tokenize(elem)
    print(temp)
# DICTIONARIES
# Contractions that have exactly one expansion.  A key is compared against a
# whole space-separated word of the input text, so it must not contain
# leading or trailing whitespace.
# Most keys use the typographic apostrophe (’).  The few entries originally
# written with a straight apostrophe (') are kept, and a typographic variant
# is listed next to each of them so both spellings are recognised.
DictWith1Meaning = {
    "arencha": "are not you",
    "cuz": "because",
    "cuppa": "cup of",
    "dunno": "do not know",
    "finna": "fixing to",
    "gimme": "give me",
    "gonna": "going to",
    "gotta": "got to",
    "helluva": "hell of a",
    "howdy": "how do you do",
    "hafta": "have to",
    "Imma": "I am going to",
    "innit": "is not it",
    "Ion": "I do not",
    "kinda": "kind of",
    "lemme": "let me",
    "methinks": "I think",
    "tryna": "trying to",
    "wanna": "want to",
    "whatcha": "what are you",
    "wonnot": "will not",
    "yessir": "yes sir",
    "a’ight": "alright",
    "amn’t": "am not",
    "’n’": "and",
    "‘n’": "and",
    "aren’t": "are not",
    "’bout": "about",
    "cap’n": "captain",
    "can’t": "cannot",
    "’cause": "because",
    "’cept": "except",
    "c’mon": "come on",
    "could’ve": "could have",
    "couldn’t": "could not",
    "couldn’t’ve": "could not have",
    "daresn’t": "dare not",
    "dasn’t": "dare not",
    "didn’t": "did not",
    "doesn't": "does not",
    "doesn’t": "does not",
    "e’en": "even",
    "e’er": "ever",
    "’em": "them",
    "fo’c’sle": "forecastle",
    "’gainst": "against",
    "g’day": "good day",
    "giv’n": "given",
    "gi’z": "give us",
    "gon’t": "go not",
    "hadn’t": "had not",
    "had’ve": "had have",
    "hasn’t": "has not",
    "haven’t": "have not",
    "here’s": "here is",
    "how’re": "how are",
    "if’n": "If and when",
    "I'd've": "I would have",
    "I’d’ve": "I would have",
    "I’m": "I am",
    "I’m'onna": "I am going to",
    "I’m’onna": "I am going to",
    "I’m’o": "I am going to",
    "I’m'na": "I am going to",
    "I’m’na": "I am going to",
    "I’ve": "I have",
    "isn’t": "is not",
    "it’d": "it would",
    "let’s": "let us",
    "loven’t": "love not",
    "ma’am": "madam",
    "mayn’t": "may not",
    "may’ve": "may have",
    "mightn’t": "might not",
    "might’ve": "might have",
    "mine’s": "mine is",
    "mustn’t": "must not",
    "mustn’t’ve": "must not have",
    "must’ve": "must have",
    "’neath": "beneath",
    "needn’t": "need not",
    "nal": "and all",
    "ne’er": "never",
    "o’": "of",
    "o’clock": "of the clock",
    "o’er": "over",
    "ol’": "old",
    "ought’ve": "ought have",
    "oughtn’t": "ought not",
    "oughtn’t’ve": "ought not have",
    "’round": "around",
    "shalln’t": "shall not",
    "shan’": "shall not",
    "shan’t": "shall not",
    "should’ve": "should have",
    "shouldn’t": "should not",
    "shouldn’t’ve": "should not have",
    "so’re": "so are",
    "so’ve": "so have",
    "that’re": "that are",
    "there’re": "there are",
    "these’re": "these are",
    "these’ve": "these have",
    "they’ve": "they have",
    # The original key had a trailing space and therefore never matched.
    "those’re": "those are",
    "those’ve": "those have",
    "’thout": "without",
    "’til": "until",
    "’tis": "it is",
    "’tisn’t": "it is not",
    "to’ve": "to have",
    "’twas": "it was",
    "’tween": "between",
    "’twere": "it were",
    "w’all": "we all",
    "w’at": "we at",
    "wasn’t": "was not",
    "we’d’ve": "we would have",
    "we’re": "we are",
    "we’ve": "we have",
    "weren’t": "were not",
    "what’d": "what did",
    "what’ve": "what have",
    "when’d": "when did",
    "where’d": "where did",
    "where’re": "where are",
    "where’ve": "where have",
    "which’re": "which are",
    "which’ve": "which have",
    "who’re": "who are",
    "who’ve": "who have",
    "why’d": "why did",
    "why’re": "why are",
    "willn’t": "will not",
    "won’t": "will not",
    "would’ve": "would have",
    "wouldn’t": "would not",
    "wouldn’t’ve": "would not have",
    "y’at": "you at",
    "y'ever": "have you ever",
    "y’ever": "have you ever",
    "y’know": "you know",
    "you’re": "you are",
    "you’ve": "you have",
    "y’all": "you all",
    "y’all’d’ve": "you all would have",
    "y’all’dn't’ve": "you all would not have",
    "y’all’dn’t’ve": "you all would not have",
    "y’all’re": "you all are",
    "y’all’ren’t": "you all are not",
    "yes’m": "yes madam",
    "who’d’ve": "who would have",
    "yesn’t": "yes not",
}
# Suffix contractions with a single expansion.  Each key is a typographic
# apostrophe followed by the letters that come after it in the contraction.
DictOfFormal = {
    "’re": "are",        # you’re  -> you are
    "’ll": "will",       # she’ll  -> she will
    "’ight": "alright",  # a’ight  -> alright
    "’t": "not",
    "’ve": "have",       # we’ve   -> we have
}
# Read the text to process from the input file.
filename = "in_text_contractions.txt"
arr = []  # TODO: add processing that preserves paragraph indentation
# Open the path through `filename` so the location is defined in one place.
with open(filename, "r", encoding='utf-8') as file:
    content = file.read()
# First pass: expand the informal contractions listed in DictWith1Meaning.
# The text is split on single spaces, so a word only matches when it appears
# without attached punctuation.
def _expand_informal_word(word, mapping):
    """Return *word* with its contraction expanded according to *mapping*.

    The word is looked up exactly as written first, so keys that are
    capitalised themselves (e.g. "I’m") still match.  If that fails and the
    word starts with an upper-case letter, the lookup is retried with the
    first letter lower-cased and the expansion is capitalised again.
    Words without an entry, including empty strings, are returned unchanged.
    """
    if not word:
        # split(" ") yields empty strings for consecutive spaces.
        return word
    if word in mapping:
        return mapping[word]
    if word[0].isupper():
        # str is immutable: build the lower-cased variant explicitly.
        lowered = word[0].lower() + word[1:]
        if lowered in mapping:
            expansion = mapping[lowered]
            return expansion[:1].upper() + expansion[1:]
    return word


text = [_expand_informal_word(word, DictWith1Meaning)
        for word in content.split(" ")]
z = ' '.join(text)
# Second pass: expand the formal contractions, sentence by sentence.

# Tokens glued to the preceding token, with no space added after them.
# NOTE(review): "«" and "„" look like opening quotes, and ":" / ";" get no
# following space inside a sentence — kept as in the original, please confirm.
_GLUE_NO_SPACE = frozenset(".’?!:;)]}«„'")
# Tokens glued to the preceding token but followed by a space.
_GLUE_THEN_SPACE = frozenset({',', '”', '»'})


def _expand_formal_tokens(tokens, mapping):
    """Return a new token list with apostrophe suffixes expanded.

    Whenever a "’" token followed by another token forms a key of *mapping*
    (e.g. "’" + "re"), the pair is replaced by the single mapped word.
    A new list is built instead of deleting from *tokens* during iteration,
    so exactly the pair at the current position is consumed.
    """
    expanded = []
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        if token == "’" and pos + 1 < len(tokens):
            key = token + tokens[pos + 1]
            if key in mapping:
                expanded.append(mapping[key])
                pos += 2  # skip the suffix that was merged into the word
                continue
        expanded.append(token)
        pos += 1
    return expanded


def _join_tokens(tokens):
    """Rebuild a sentence string from *tokens*.

    Ordinary tokens are followed by a space.  Punctuation tokens drop the
    space before them; only a separating space is ever removed, never a
    character of the previous token.
    """
    parts = []
    for token in tokens:
        glued = token in _GLUE_NO_SPACE or token in _GLUE_THEN_SPACE
        if glued and parts and parts[-1] == " ":
            parts.pop()
        parts.append(token)
        if token not in _GLUE_NO_SPACE:
            parts.append(" ")
    return "".join(parts)


sents = sent_tokenize(z)
for i, sentence in enumerate(sents):
    s = _expand_formal_tokens(word_tokenize(sentence), DictOfFormal)
    # Replace the sentence with the string rebuilt from the processed tokens.
    sents[i] = _join_tokens(s)
# Stitch the processed sentences back into one text; every sentence,
# including the last one, is followed by a single space.
FullText = "".join(sentence + ' ' for sentence in sents)
print(FullText)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement