# pr11 (RNN)

import numpy as np
import os
import datetime as dt
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK resources needed by the tokenizer, the stop-word list and the
# WordNet lemmatizer used in preprocess_text.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# load the Punkt tokenizer data from that package; without it word_tokenize
# can fail with a LookupError on those releases.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)

def preprocess_text(text):
    """Normalise a raw text into a space-joined string of lemmatized tokens.

    Steps, in order: lower-case the text, strip URL-like fragments, strip
    digit runs, strip every character that is neither a word character nor
    whitespace, tokenize, drop English stop words and lemmatize what is left.

    Args:
        text: The raw input string.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    cleaned = text.lower()
    # The substitutions are applied sequentially; each one deletes its matches.
    substitutions = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),
        (r'\d+', 0),
        (r'[^\w\s]', 0),
    )
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    tokens = word_tokenize(cleaned)
    english_stop_words = set(stopwords.words('english'))
    content_tokens = [token for token in tokens if token not in english_stop_words]

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in content_tokens)

def create_inputs(text, vocab):
    """Encode a whitespace-separated text as a sequence of one-hot column vectors.

    Args:
        text: String whose whitespace-separated tokens are encoded in order.
        vocab: Mapping from token to its row index in the one-hot vector.

    Returns:
        A float array of shape ``(n_tokens, len(vocab), 1)``. The vector of a
        token that is missing from ``vocab`` is left all-zero. The array is
        allocated up front so the result keeps this three-dimensional shape
        even when ``text`` contains no tokens.
    """
    tokens = text.split()
    inputs = np.zeros((len(tokens), len(vocab), 1))
    for position, token in enumerate(tokens):
        if token in vocab:
            inputs[position, vocab[token], 0] = 1
    return inputs


class RNN:
    """Many-to-one vanilla RNN classifier trained with per-sample gradient descent.

    The hidden state is updated with tanh at every time step and only the
    final hidden state is fed to a softmax output layer. ``train`` and
    ``predict`` rely on the module-level ``create_inputs`` helper to turn a
    text into one-hot vectors, and ``train`` uses ``accuracy_score``.

    Attributes set by ``__init__``:
        vocab: Token-to-index mapping handed to ``create_inputs``.
        Wxh, Whh, Why: Input-to-hidden, hidden-to-hidden and hidden-to-output
            weight matrices.
        bh, by: Hidden and output bias column vectors.
        last_hs: Hidden states cached by the most recent ``forward`` call
            (index 0 is the initial zero state).
        train_loss, test_loss, train_accuracy, test_accuracy: Per-epoch
            history lists filled by ``train``.
    """

    def __init__(self, vocab, input_size, hidden_size, output_size=2):
        """Store the sizes and randomly initialise the parameters.

        Args:
            vocab: Token-to-index mapping used to encode texts.
            input_size: Length of one input vector.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
        """
        self.vocab = vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Normal samples scaled by sqrt(1 / number_of_columns) of each matrix.
        self.Wxh = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(1 / self.input_size)
        self.Whh = np.random.randn(self.hidden_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)
        self.Why = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)
        self.bh = np.zeros((self.hidden_size, 1))
        self.by = np.zeros((self.output_size, 1))
        self.last_hs = None
        self.train_loss = []
        self.test_loss = []
        self.train_accuracy = []
        self.test_accuracy = []

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``."""
        return np.tanh(x)

    def softmax(self, x):
        """Return the softmax of ``x`` taken along axis 0.

        The per-column maximum is subtracted before exponentiating. This does
        not change the result mathematically but keeps ``np.exp`` from
        overflowing on large inputs.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def forward(self, inputs):
        """Run the network over one sequence.

        Args:
            inputs: Array whose first axis indexes time steps; each step is
                reshaped to a column vector before use.

        Returns:
            ``(y, h)``: the softmax output column vector and the final hidden
            state. All hidden states are cached in ``self.last_hs`` for
            ``backward``.
        """
        h = np.zeros((self.hidden_size, 1))
        self.last_hs = [h]
        for i in range(inputs.shape[0]):
            x = inputs[i].reshape(-1, 1)
            h = self.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            self.last_hs.append(h)

        y = np.dot(self.Why, h) + self.by
        y = self.softmax(y)
        return y, h

    def backward(self, inputs, y_hat, y):
        """Back-propagate the cross-entropy loss through time.

        Must be called right after ``forward`` on the same ``inputs`` because
        it reads the hidden states cached in ``self.last_hs``.

        Args:
            inputs: The sequence that was passed to ``forward``.
            y_hat: Softmax output returned by ``forward``; it is not modified.
            y: Index of the true class.

        Returns:
            ``(dWxh, dWhh, dWhy, dbh, dby)``, each clipped to ``[-1, 1]``.
        """
        n = len(inputs)

        # Gradient of softmax + cross-entropy w.r.t. the logits. Work on a copy
        # so the caller's prediction array is left untouched.
        dLdy = np.array(y_hat, dtype=float)
        dLdy[y] -= 1

        dWhy = np.dot(dLdy, self.last_hs[n].T)
        dby = dLdy.copy()

        dWhh = np.zeros(self.Whh.shape)
        dWxh = np.zeros(self.Wxh.shape)
        dbh = np.zeros(self.bh.shape)

        dh = np.dot(self.Why.T, dLdy)

        for t in reversed(range(n)):
            # Derivative of tanh expressed through its output: 1 - tanh(z)^2.
            dhraw = (1 - self.last_hs[t + 1] ** 2) * dh

            dbh += dhraw
            dWhh += np.dot(dhraw, self.last_hs[t].T)
            dWxh += np.dot(dhraw, inputs[t].reshape(-1, 1).T)
            # h_t depends on h_{t-1} through Whh @ h_{t-1}, so the gradient
            # flows back through the transpose of Whh.
            dh = np.dot(self.Whh.T, dhraw)

        for d in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(d, -1, 1, out=d)

        return dWxh, dWhh, dWhy, dbh, dby

    def update_parameters(self, dWxh, dWhh, dWhy, dbh, dby, learning_rate=0.001):
        """Apply one gradient-descent step to every parameter in place."""
        self.Wxh -= learning_rate * dWxh
        self.Whh -= learning_rate * dWhh
        self.Why -= learning_rate * dWhy
        self.bh -= learning_rate * dbh
        self.by -= learning_rate * dby

    def predict(self, x_test):
        """Return the list of predicted class indices for an iterable of texts."""
        predictions = []
        for inp in x_test:
            inputs = create_inputs(inp, self.vocab)
            y_hat, _ = self.forward(inputs)
            predictions.append(np.argmax(y_hat))
        return predictions

    def _sample_loss(self, y_hat, target):
        """Return the cross-entropy of one prediction, guarded against log(0)."""
        return -np.sum(np.log(np.maximum(y_hat[target], 1e-12)))

    def train(self, x_train, y_train, epochs=1000, learning_rate=0.001, x_test=None, y_test=None, early_stopping_rounds=np.inf):
        """Train with one parameter update per sample and record the history.

        Args:
            x_train: Sequence of training texts.
            y_train: Sequence of class indices aligned with ``x_train``.
            epochs: Maximum number of passes over the training data.
            learning_rate: Step size passed to ``update_parameters``.
            x_test: Optional evaluation texts. When ``x_test`` or ``y_test``
                is ``None`` the evaluation pass and early stopping are skipped.
            y_test: Optional class indices aligned with ``x_test``.
            early_stopping_rounds: Stop once the lowest evaluation loss of this
                call lies more than this many epochs behind the current one.
        """
        has_eval = x_test is not None and y_test is not None
        # Early stopping only looks at the losses recorded by this call, so a
        # second call to train() is not confused by older history entries.
        history_start = len(self.test_loss)
        start_time = dt.datetime.now()
        for epoch in range(epochs):

            total_loss = 0
            for text, target in zip(x_train, y_train):
                inputs = create_inputs(text, self.vocab)
                y_hat, _ = self.forward(inputs)
                total_loss += self._sample_loss(y_hat, target)

                grads = self.backward(inputs, y_hat, target)
                self.update_parameters(*grads, learning_rate)

            self.train_loss.append(total_loss / len(y_train))
            self.train_accuracy.append(accuracy_score(y_train, self.predict(x_train)) * 100)

            if has_eval:
                total_loss = 0
                for text, target in zip(x_test, y_test):
                    y_hat, _ = self.forward(create_inputs(text, self.vocab))
                    total_loss += self._sample_loss(y_hat, target)

                self.test_loss.append(total_loss / len(y_test))
                self.test_accuracy.append(accuracy_score(y_test, self.predict(x_test)) * 100)

            # Progress line with a linear extrapolation of the remaining time.
            prcnt = (epoch + 1) / epochs * 100
            elapsed = dt.datetime.now() - start_time
            remaining = elapsed / prcnt * (100 - prcnt)
            print(f'№{epoch+1}/{epochs} - {round(prcnt, 2)}% | total time: {elapsed} | time remaining: {remaining} | end time: {dt.datetime.now() + remaining}', end='\r')
            os.system('cls' if os.name == 'nt' else 'clear')

            if has_eval and epoch + 1 > early_stopping_rounds:
                best_epoch = np.argmin(self.test_loss[history_start:])
                if best_epoch < epoch - early_stopping_rounds:
                    print(f'Зупинка на епохі {epoch+1}')
                    break

    def show_loss(self):
        """Plot the recorded loss history and accuracy history, one figure each."""
        # Imported here because the module's import section does not bring
        # pyplot into scope, which made the bare ``plt`` name undefined.
        import matplotlib.pyplot as plt

        curves = (
            ('Втрати', self.train_loss, self.test_loss, 'loss'),
            ('Точність', self.train_accuracy, self.test_accuracy, 'accuracy'),
        )
        for title, train_values, test_values, ylabel in curves:
            plt.figure(figsize=(7, 5))
            plt.title(title)
            plt.plot(np.arange(1, len(train_values) + 1), train_values, c='blue', label='Тренувальні дані')
            plt.plot(np.arange(1, len(test_values) + 1), test_values, c='red', label='Тестові дані')
            plt.legend()
            plt.xlabel('epoche')
            plt.ylabel(ylabel)
            plt.grid()
            plt.show()


# Inline sentiment dataset: each key is a short English phrase and each value
# is its label. True is later converted to class 1 (reported as a positive
# phrase) and False to class 0 (reported as a negative phrase).
data = {
    'this is very sad': False,
    'this is very happy': True,
    'i am good not bad': True,
    'this is good not bad': True,
    'i am bad not good': False,
    'everything is great': True,
    'life is good': True,
    'good': True,
    'bad': False,
    'happy': True,
    'sad': False,
    'nothing is bad': True,
    'everything is bad': False,
    'i am extremely happy': True,
    'this is extremely bad': False,
    'this is moderately good': True,
    'i am not entirely sad': True,
    'this is incredibly good': True,
    'i feel so bad': False,
    'this is the worst': False,
    'this is awesome': True,
    'i feel wonderful': True,
    'this is absolutely terrible': False,
    'it is not okay': False,
    'this is quite good': True,
    'this is not so bad': True,
    'this is somewhat sad': False,
    'i am neither happy nor sad': False,
    'i feel neutral': False,
    'not good': False,
    'not bad': True,
    'not sad': True,
    'very good': True,
    'very bad': False,
    'very happy': True,
    'very sad': False,
    'this is good': True,
    'i am good': True,
    'this is bad': False,
    'i am sad': False,
    'this is sad': False,
    'i am not happy': False,
    'this is not good': False,
    'i am not bad': True,
    'this is not sad': True,
    'i am very happy': True,
    'this is very good': True,
    'i am very bad': False,
    'this makes me smile': True,
    'this breaks my heart': False,
    'i am overwhelmed with joy': True,
    'this situation is unbearable': False,
    'nothing could be better': True,
    'i am pleasantly surprised': True,
    'this is totally unacceptable': False,
    'i am proud of this': True,
    'this is shameful': False,
    'i am deeply disappointed': False,
    'this brings me hope': True,
    'i feel completely lost': False,
    'this is the happiest moment': True,
    'this ruins everything': False,
    'this makes everything better': True,
    'life feels meaningless': False,
    'i feel perfectly content': True,
    'this is utterly fantastic': True,
    'this is highly frustrating': False,
    'i feel like crying': False,
    'this moment is precious': True,
    'this is worse than expected': False,
    'this is a dream come true': True,
    'i have no words for this sadness': False,
    'this is better than perfect': True,
    'i feel so relaxed': True,
    'this is not worth it': False,
    'this inspires me': True,
    'this destroys my trust': False,
    'i can’t stop smiling': True,
    'i regret this decision': False,
    'i am extremely sad': False,
    'this makes me laugh': True,
    'this is painfully disappointing': False,
    'everything feels amazing': True,
    'i am incredibly thankful': True,
    'this is heartwarming': True,
    'this is completely unacceptable': False,
    'i feel devastated': False,
    'this makes my day': True,
    'this is totally frustrating': False,
    'i feel unbelievably great': True,
    'nothing feels worse': False,
    'this exceeds all expectations': True,
    'this is just awful': False,
    'i am full of hope': True,
    'this is emotionally draining': False,
    'i am cautiously optimistic': True,
    'this is beyond disappointing': False,
    'i feel amazingly calm': True,
    'this is such a relief': True,
    'i feel deeply sad': False,
    'this fills me with energy': True,
    'this completely destroys my mood': False,
    'i feel really proud': True,
    'this situation is infuriating': False,
    'i am fully satisfied': True,
    'this moment feels magical': True,
    'this is beyond terrible': False,
    'i feel on top of the world': True,
    'this breaks my confidence': False,
    'i am highly encouraged': True,
    'this is so disheartening': False,
    'i feel truly blessed': True,
    'this makes me cry tears of joy': True,
    'i feel utterly hopeless': False,
    'this is better than ever': True,
    'this ruins my plans': False,
    'this is pure happiness': True,
    'this leaves me speechless': True,
    'this crushes my soul': False,
    'this uplifts my spirit': True,
    'this is such a disappointment': False,
    'this fills me with pride': True,
    'this is happy': True,
    'i am good': True,  # NOTE: repeats a key defined earlier in this literal; the dict keeps one entry
    'this is not happy': False,
    'i am not good': False,
    'this is not bad': True,
    'i am not sad': True,
    'i am very good': True,
    'this is very bad': False,
    'i am very sad': False,
    'this is bad not good': False,
    'this is good and happy': True,
    'i am not good and not happy': False,
    'i am not at all sad': True,
    'this is not at all good': False,
    'this is not at all bad': True,
    'this is good right now': True,
    'this is very bad right now': False,
    'this was good earlier': True,
    'i was not happy and not good earlier': False,
    'everything seems fine': True,
    'this feels amazing': True,
    'life is not bad at all': True,
    'this is a bit sad': False,
    'i feel absolutely great': True,
    'nothing feels wrong': True,
    'everything is falling apart': False,
    'this is so wonderful': True,
    'i am completely satisfied': True,
    'this is the best ever': True,
    'i am somewhat disappointed': False,
    'this is mildly frustrating': False,
    'this is rather enjoyable': True,
    'nothing seems right': False,
    'this is just okay': False,
    'i feel happy but tired': True,
    'this is better than expected': True,
    'this is not as bad as it looks': True,
    'this fills me with hope': True,
    'this is disappointing beyond words': False,
    'i am overjoyed with this result': True,
    'this is an unacceptable situation': False,
    'this is unbelievably bad': False,
    'i feel like jumping for joy': True,
    'this situation is hopeless': False,
    'this makes me incredibly happy': True,
    'this is the worst feeling': False,
    'this moment is unforgettable': True,
    'this is unbearably sad': False,
    'this could not have gone better': True,
    'this makes me feel alive': True,
    'this is worse than i thought': False,
    'i am truly grateful for this': True,
    'this is a complete disaster': False,
    'this exceeded my expectations': True,
    'this is heartbreaking': False,
    'this gives me strength': True,
    'this destroys my confidence': False,
    'this is absolutely worth it': True,
    'this ruins my day': False,
}


# Build a two-column frame (phrase, label) from the inline dataset.
df = pd.DataFrame(list(data.items()), columns=['review', 'target'])
# Shuffle with a fixed seed: the under-sampler and the train/test split below
# are already seeded with 42, so an unseeded shuffle was the step that made
# the resulting split differ from run to run.
df = df.sample(frac=1, random_state=42)
df['target'] = df['target'].apply(lambda x: 1 if x else 0)
# Balance the two classes by randomly dropping rows of the larger class.
x, y = RandomUnderSampler(random_state=42).fit_resample(df[['review']], df['target'])
df = pd.DataFrame(x)
df['target'] = y
df['review'] = df['review'].apply(preprocess_text)

X, y = df['review'], df['target']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The vocabulary is built from the training phrases only. Sorting the words
# makes the word-to-index assignment independent of set iteration order.
vocab = sorted({word for review in x_train for word in review.split()})
vocab = {word: idx for idx, word in enumerate(vocab)}

vocab_size = len(vocab)
print(f"Розмір словника: {vocab_size}")
print(f"Розмір x_train: {x_train.shape}")
print(f"Розмір x_test: {x_test.shape}")

# NOTE: the weight initialisation inside RNN is not seeded here, so training
# results can still vary between runs.
model = RNN(vocab=vocab, input_size=len(vocab), hidden_size=128)
model.train(x_train.values, y_train.values, epochs=1000, learning_rate=0.0001, x_test=x_test.values, y_test=y_test.values, early_stopping_rounds=10)

# Report on the training split.
y_pred = model.predict(x_train.values)
accuracy = accuracy_score(y_train, y_pred)
print(f"Train accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_train, y_pred))

# Report on the held-out split; y_pred keeps the test predictions afterwards.
y_pred = model.predict(x_test.values)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

model.show_loss()


def show_predictions(x, y_true, y_pred):
    """Print a text table comparing the real and the predicted label of each phrase.

    Args:
        x: Indexable collection of phrases; ``len(x)`` rows are printed.
        y_true: Real labels, indexed in parallel with ``x``.
        y_pred: Predicted labels, indexed in parallel with ``x``.

    A label equal to 1 is shown as 'positive phrase', anything else as
    'negative phrase'. Phrases are padded with spaces up to the width of the
    text column; longer phrases are printed unpadded.
    """
    header = '|                    Text                    |       Real      |     Predicted   |'
    text_width = len('                    Text                    ')
    thick_rule = '=' * len(header)
    thin_rule = '-' * len(header)

    def describe(label):
        return 'positive phrase' if label == 1 else 'negative phrase'

    print(thick_rule)
    print(header)
    print(thick_rule)
    for row in range(len(x)):
        phrase_text = x[row]
        # One column is already used by the space printed after the leading '|'.
        padding = ' ' * (text_width - len(phrase_text) - 1)
        print(f'| {phrase_text}{padding}| {describe(y_true[row])} | {describe(y_pred[row])} |')
        print(thin_rule)
    print(thick_rule)

# Show only the test phrases whose predicted label differs from the real one.
# y_pred is expected to hold the predictions for x_test at this point.
predicted_labels = np.array(y_pred)
mistakes = y_test.values != predicted_labels
show_predictions(
    x_test.values[mistakes],
    y_test.values[mistakes],
    predicted_labels[mistakes],
)