# Coursework

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the NLTK resources before the first tokenizer call below:
# sent_tokenize/word_tokenize need the 'punkt' data, so downloading it only
# after the demo would make a fresh environment fail on the demo itself.
nltk.download('punkt')
# Tagger model; nltk.pos_tag is currently only referenced in commented-out
# code further down.
nltk.download('averaged_perceptron_tagger')
# nltk.download()

# Demo: split a sample into sentences, then print each sentence's tokens.
prim = "She’s swept the floor. She’s very kind to me."
tok = sent_tokenize(prim)
print(tok)
for elem in tok:
    temp = word_tokenize(elem)
    print(temp)
# DICTIONARIES

# Contractions that have exactly one expansion, keyed by the contracted
# spelling. Keys are matched against whole space-separated words, so a key
# must not contain leading or trailing whitespace.
DictWith1Meaning = {
    "arencha": "are not you",
    "cuz": "because",
    "cuppa": "cup of",
    "dunno": "do not know",
    "finna": "fixing to",
    "gimme": "give me",
    "gonna": "going to",
    "gotta": "got to",
    "helluva": "hell of a",
    "howdy": "how do you do",
    "hafta": "have to",
    "Imma": "I am going to",
    "innit": "is not it",
    "Ion": "I do not",
    "kinda": "kind of",
    "lemme": "let me",
    "methinks": "I think",
    "tryna": "trying to",
    "wanna": "want to",
    "whatcha": "what are you",
    "wonnot": "will not",
    "yessir": "yes sir",
    "a’ight": "alright",
    "amn’t": "am not",
    "’n’": "and",
    "‘n’": "and",
    "aren’t": "are not",
    "’bout": "about",
    "cap’n": "captain",
    "can’t": "cannot",
    "’cause": "because",
    "’cept": "except",
    "c’mon": "come on",
    "could’ve": "could have",
    "couldn’t": "could not",
    "couldn’t’ve": "could not have",
    "daresn’t": "dare not",
    "dasn’t": "dare not",
    "didn’t": "did not",
    "doesn't": "does not",
    "e’en": "even",
    "e’er": "ever",
    "’em": "them",
    "fo’c’sle": "forecastle",
    "’gainst": "against",
    "g’day": "good day",
    "giv’n": "given",
    "gi’z": "give us",
    "gon’t": "go not",
    "hadn’t": "had not",
    "had’ve": "had have",
    "hasn’t": "has not",
    "haven’t": "have not",
    "here’s": "here is",
    "how’re": "how are",
    "if’n": "If and when",
    "I'd've": "I would have",
    "I’m": "I am",
    "I’m'onna": "I am going to",
    "I’m’o": "I am going to",
    "I’m'na": "I am going to",
    "I’ve": "I have",
    "isn’t": "is not",
    "it’d": "it would",
    "let’s": "let us",
    "loven’t": "love not",
    "ma’am": "madam",
    "mayn’t": "may not",
    "may’ve": "may have",
    "mightn’t": "might not",
    "might’ve": "might have",
    "mine’s": "mine is",
    "mustn’t": "must not",
    "mustn’t’ve": "must not have",
    "must’ve": "must have",
    "’neath": "beneath",
    "needn’t": "need not",
    "nal": "and all",
    "ne’er": "never",
    "o’": "of",
    "o’clock": "of the clock",
    "o’er": "over",
    "ol’": "old",
    "ought’ve": "ought have",
    "oughtn’t": "ought not",
    "oughtn’t’ve": "ought not have",
    "’round": "around",
    "shalln’t": "shall not",
    "shan’": "shall not",
    "shan’t": "shall not",
    "should’ve": "should have",
    "shouldn’t": "should not",
    "shouldn’t’ve": "should not have",
    "so’re": "so are",
    "so’ve": "so have",
    "that’re": "that are",
    "there’re": "there are",
    "these’re": "these are",
    "these’ve": "these have",
    "they’ve": "they have",
    # Fixed: this key used to end with a space and therefore never matched.
    "those’re": "those are",
    "those’ve": "those have",
    "’thout": "without",
    "’til": "until",
    "’tis": "it is",
    "’tisn’t": "it is not",
    "to’ve": "to have",
    "’twas": "it was",
    "’tween": "between",
    "’twere": "it were",
    "w’all": "we all",
    "w’at": "we at",
    "wasn’t": "was not",
    "we’d’ve": "we would have",
    "we’re": "we are",
    "we’ve": "we have",
    "weren’t": "were not",
    "what’d": "what did",
    "what’ve": "what have",
    "when’d": "when did",
    "where’d": "where did",
    "where’re": "where are",
    "where’ve": "where have",
    "which’re": "which are",
    "which’ve": "which have",
    "who’re": "who are",
    "who’ve": "who have",
    "why’d": "why did",
    "why’re": "why are",
    "willn’t": "will not",
    "won’t": "will not",
    "would’ve": "would have",
    "wouldn’t": "would not",
    "wouldn’t’ve": "would not have",
    "y’at": "you at",
    "y'ever": "have you ever",
    "y’know": "you know",
    "you’re": "you are",
    "you’ve": "you have",
    "y’all": "you all",
    "y’all’d’ve": "you all would have",
    "y’all’dn't’ve": "you all would not have",
    "y’all’re": "you all are",
    "y’all’ren’t": "you all are not",
    "yes’m": "yes madam",
    "who’d’ve": "who would have",
    "yesn’t": "yes not",
}

# A few keys above are spelled with the ASCII apostrophe (') while the rest
# use the typographic one (’). Register the typographic spelling of those keys
# as well, keeping the original entries, so e.g. both "doesn't" and "doesn’t"
# are expanded. The comprehension is fully built before update() runs, so the
# dictionary is not modified while it is being iterated.
DictWith1Meaning.update({
    key.replace("'", "’"): expansion
    for key, expansion in DictWith1Meaning.items()
    if "'" in key
})

# Contraction suffixes (typographic apostrophe followed by letters), each
# mapped to the single full word it stands for.
DictOfFormal = dict([
    ("’re", "are"),
    ("’ll", "will"),
    ("’ight", "alright"),
    ("’t", "not"),
    ("’ve", "have"),
])

# Read the whole input text that is to be processed.
filename = "in_text_contractions.txt"
# TODO: add processing that preserves paragraph indentation.
arr = []
# Open through `filename` so the input path is defined in exactly one place.
with open(filename, "r", encoding='utf-8') as file:
    content = file.read()
    # arr.append(content)


#prim = "You’re very tall."
#prim1 = "She’s swept the floor."
#prim2 = "She’s very kind to me."
#tok = word_tokenize(prim1 +" "+ prim2)
#print(tok)

# tagged = nltk.pos_tag(tok)

#print(tagged)

#print()


# First pass: expand the single-meaning contractions from DictWith1Meaning.
def _expand_informal(word, table):
    """Return the expansion of *word* found in *table*, or *word* unchanged.

    The word is looked up exactly as written first, so keys that themselves
    start with a capital letter (e.g. "I’m") keep matching. If that fails and
    the word starts with an uppercase letter, the lookup is retried with the
    first letter lowered and the expansion gets its first letter capitalised,
    so "Gonna" becomes "Going to".

    Empty strings (produced by consecutive spaces or empty input) are returned
    as-is instead of being indexed.
    """
    if not word:
        return word
    if word in table:
        return table[word]
    if word[0].isupper():
        # str methods return new strings; the result has to be used, not
        # discarded, for the case-insensitive retry to have any effect.
        lowered = word[0].lower() + word[1:]
        if lowered in table:
            expansion = table[lowered]
            return expansion[:1].upper() + expansion[1:]
    return word


# Splitting on a single space (rather than any whitespace) leaves newlines
# inside the pieces, so joining with ' ' reproduces the original layout.
text = [_expand_informal(word, DictWith1Meaning) for word in content.split(" ")]

z = ' '.join(text)

# Second pass: expand formal contractions, one sentence at a time.
sents = sent_tokenize(z)
# Tokens that are glued to the preceding text; nothing is appended after them.
# NOTE(review): this means ':' ';' ')' etc. in mid-sentence run straight into
# the next word - confirm whether that is intended.
c = ".’?!:;)]}«„„'"
for i, sentence in enumerate(sents):
    s = word_tokenize(sentence)

    # Merge a standalone "’" token with the token after it when the pair is a
    # known suffix. A while loop re-reads len(s) each step, so shrinking the
    # list cannot push the index out of range, and `del` removes the token at
    # that exact position (list.remove would drop the first equal token
    # anywhere in the sentence).
    j = 0
    while j < len(s) - 1:
        suffix = s[j] + s[j + 1]
        if s[j] == "’" and suffix in DictOfFormal:
            s[j] = DictOfFormal[suffix]
            del s[j + 1]
        j += 1

    # Rebuild the sentence string from the processed tokens.
    temp = ""
    for token in s:
        if token in c:
            # Drop the separator space only if there is one; the previous
            # token may itself have been glued on without a trailing space.
            if temp.endswith(" "):
                temp = temp[:-1]
            temp += token
        elif token == ',' or token == '”' or token == '»':
            if temp.endswith(" "):
                temp = temp[:-1]
            temp += token + " "
        else:
            temp += token + " "
    sents[i] = temp

# Reassemble the text: every sentence is followed by a single space.
FullText = "".join(sentence + ' ' for sentence in sents)

print(FullText)