Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import os
- import datetime as dt
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from imblearn.under_sampling import RandomUnderSampler
- from sklearn.preprocessing import MinMaxScaler
- from sklearn.model_selection import train_test_split
- from sklearn.feature_extraction.text import CountVectorizer
- import re
- import nltk
- from nltk.corpus import stopwords
- from nltk.tokenize import word_tokenize
- from nltk.stem import WordNetLemmatizer
- from sklearn.metrics import accuracy_score, classification_report
# Fetch the NLTK resources used by preprocess_text: tokenizer models, the
# stop-word corpus and WordNet. 'punkt_tab' is the tokenizer data that recent
# NLTK releases look up for word_tokenize; on older releases that do not know
# the identifier the download call simply reports an error and continues.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource)
def preprocess_text(text, keep_words=('no', 'nor', 'not')):
    """Normalise a raw review into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase, strip URLs, strip digit runs, strip every
    character that is neither a word character nor whitespace, tokenise,
    drop English stop words, lemmatise.

    Args:
        text: Raw input string.
        keep_words: Stop words that must survive filtering. The default keeps
            the negations, which are part of NLTK's English stop-word list:
            removing them would turn "not good" into "good" and give
            oppositely labelled phrases the same representation. Pass an
            empty tuple to apply the full stop-word list.

    Returns:
        The remaining lemmatised tokens joined by single spaces.
    """
    text = text.lower()
    # `http\S+` already matches https URLs, so no separate alternative is needed.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    stop_words = set(stopwords.words('english')) - set(keep_words)
    lemmatizer = WordNetLemmatizer()
    tokens = (token for token in word_tokenize(text) if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)
def create_inputs(text, vocab):
    """One-hot encode a whitespace-separated text as a sequence of column vectors.

    Args:
        text: Space-separated tokens.
        vocab: Mapping from token to its row index in the one-hot vector.

    Returns:
        Float array of shape (num_tokens, len(vocab), 1). A token missing from
        ``vocab`` is encoded as an all-zero vector, so it still occupies a time
        step. Empty text yields shape (0, len(vocab), 1).
    """
    words = text.split()
    # Preallocating keeps the trailing dimensions even when there are no tokens.
    inputs = np.zeros((len(words), len(vocab), 1))
    for step, word in enumerate(words):
        index = vocab.get(word)
        if index is not None:
            inputs[step, index, 0] = 1
    return inputs
class RNN:
    """Many-to-one vanilla RNN text classifier implemented with NumPy.

    Each text is fed token by token as one-hot column vectors; the final
    hidden state is projected to ``output_size`` logits and passed through a
    softmax. Training is per-sample gradient descent with gradients clipped
    element-wise to [-1, 1].
    """

    def __init__(self, vocab, input_size, hidden_size, output_size=2):
        """Create the network with randomly initialised weights.

        Args:
            vocab: Mapping from token to one-hot index, used to encode texts.
            input_size: Length of each one-hot input vector.
            hidden_size: Number of hidden units.
            output_size: Number of classes.
        """
        self.vocab = vocab
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        # Standard-normal weights scaled by sqrt(1 / fan_in).
        self.Wxh = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(1 / self.input_size)
        self.Whh = np.random.randn(self.hidden_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)
        self.Why = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)
        self.bh = np.zeros((self.hidden_size, 1))
        self.by = np.zeros((self.output_size, 1))

        # Hidden states of the most recent forward pass (index 0 is the initial state).
        self.last_hs = None

        # Per-epoch history filled in by train().
        self.train_loss = []
        self.test_loss = []
        self.train_accuracy = []
        self.test_accuracy = []

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``."""
        return np.tanh(x)

    def softmax(self, x):
        """Return the softmax of ``x`` along axis 0.

        The per-column maximum is subtracted before exponentiating so that
        large logits do not overflow; the result is mathematically unchanged.
        """
        shifted = x - np.max(x, axis=0, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=0, keepdims=True)

    def forward(self, inputs):
        """Run the network over one encoded sequence.

        Args:
            inputs: Sequence of one-hot vectors, one per time step.

        Returns:
            Tuple ``(y, h)``: class probabilities of shape (output_size, 1)
            and the final hidden state of shape (hidden_size, 1). The hidden
            states of every step are stored in ``self.last_hs`` for backward().
        """
        h = np.zeros((self.hidden_size, 1))
        self.last_hs = [h]
        for step in inputs:
            x = step.reshape(-1, 1)
            h = self.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)
            self.last_hs.append(h)
        y = self.softmax(np.dot(self.Why, h) + self.by)
        return y, h

    def backward(self, inputs, y_hat, y):
        """Back-propagate the cross-entropy loss through time.

        Must be called after forward() on the same ``inputs``, because it
        reads the hidden states stored in ``self.last_hs``.

        Args:
            inputs: The sequence that was passed to forward().
            y_hat: Probabilities returned by forward(); left unmodified.
            y: Index of the true class.

        Returns:
            Tuple ``(dWxh, dWhh, dWhy, dbh, dby)``, each clipped to [-1, 1].
        """
        n = len(inputs)

        # Gradient of softmax + cross-entropy w.r.t. the logits. Work on a copy
        # so the caller's probabilities are not altered.
        dLdy = np.array(y_hat, dtype=float)
        dLdy[y] -= 1

        dWhy = np.dot(dLdy, self.last_hs[n].T)
        dby = dLdy
        dWhh = np.zeros(self.Whh.shape)
        dWxh = np.zeros(self.Wxh.shape)
        dbh = np.zeros(self.bh.shape)

        dh = np.dot(self.Why.T, dLdy)
        for t in reversed(range(n)):
            # Derivative of tanh is 1 - tanh^2, evaluated at this step's output.
            dhraw = (1 - self.last_hs[t + 1] ** 2) * dh
            dbh += dhraw
            dWhh += np.dot(dhraw, self.last_hs[t].T)
            dWxh += np.dot(dhraw, inputs[t].reshape(1, -1))
            # The previous hidden state enters through Whh, so its gradient
            # flows back through the transpose.
            dh = np.dot(self.Whh.T, dhraw)

        for grad in (dWxh, dWhh, dWhy, dbh, dby):
            np.clip(grad, -1, 1, out=grad)
        return dWxh, dWhh, dWhy, dbh, dby

    def update_parameters(self, dWxh, dWhh, dWhy, dbh, dby, learning_rate=0.001):
        """Apply one gradient-descent step to every weight and bias in place."""
        self.Wxh -= learning_rate * dWxh
        self.Whh -= learning_rate * dWhh
        self.Why -= learning_rate * dWhy
        self.bh -= learning_rate * dbh
        self.by -= learning_rate * dby

    def predict(self, x_test):
        """Return the predicted class index for every text in ``x_test``."""
        predictions = []
        for text in x_test:
            y_hat, _ = self.forward(create_inputs(text, self.vocab))
            predictions.append(np.argmax(y_hat))
        return predictions

    def _mean_loss(self, texts, targets):
        """Return the mean cross-entropy loss over the given texts, without training."""
        total = 0
        for text, target in zip(texts, targets):
            y_hat, _ = self.forward(create_inputs(text, self.vocab))
            total += -np.sum(np.log(y_hat[target]))
        return total / len(targets)

    def train(self, x_train, y_train, epochs=1000, learning_rate=0.001, x_test=None, y_test=None, early_stopping_rounds=np.inf):
        """Train with per-sample gradient descent and record per-epoch metrics.

        Args:
            x_train: Sequence of preprocessed training texts.
            y_train: Class index for each training text.
            epochs: Maximum number of passes over the training data.
            learning_rate: Step size passed to update_parameters().
            x_test: Optional held-out texts. When given together with
                ``y_test``, test loss and accuracy are recorded each epoch.
            y_test: Class index for each held-out text.
            early_stopping_rounds: Stop once the best recorded test loss is
                more than this many epochs old. Ignored without a test set.
        """
        has_test = x_test is not None and y_test is not None
        start_time = dt.datetime.now()

        for epoch in range(epochs):
            total_loss = 0
            for text, target in zip(x_train, y_train):
                inputs = create_inputs(text, self.vocab)
                y_hat, _ = self.forward(inputs)
                total_loss += -np.sum(np.log(y_hat[target]))
                grads = self.backward(inputs, y_hat, target)
                self.update_parameters(*grads, learning_rate)
            self.train_loss.append(total_loss / len(y_train))
            self.train_accuracy.append(accuracy_score(y_train, self.predict(x_train)) * 100)

            if has_test:
                self.test_loss.append(self._mean_loss(x_test, y_test))
                self.test_accuracy.append(accuracy_score(y_test, self.predict(x_test)) * 100)

            prcnt = (epoch + 1) / epochs * 100
            elapsed = dt.datetime.now() - start_time
            remaining = elapsed / prcnt * (100 - prcnt)
            print(f'№{epoch+1}/{epochs} - {round(prcnt, 2)}% | total time: {elapsed} | time remaining: {remaining} | end time: {dt.datetime.now() + remaining}', end='\r')
            os.system('cls' if os.name == 'nt' else 'clear')

            if has_test:
                # Index into the recorded history rather than this call's epoch
                # counter, so the check stays valid if train() is called again.
                newest = len(self.test_loss) - 1
                if newest - np.argmin(self.test_loss) > early_stopping_rounds:
                    print(f'Зупинка на епохі {epoch+1}')
                    break

    def show_loss(self):
        """Plot the recorded loss and accuracy histories, one figure each."""
        # Imported locally: the module's import section does not bring in
        # matplotlib, so a bare `plt` would raise NameError here.
        import matplotlib.pyplot as plt

        curves = (
            ('Втрати', self.train_loss, self.test_loss, 'loss'),
            ('Точність', self.train_accuracy, self.test_accuracy, 'accuracy'),
        )
        for title, train_values, test_values, ylabel in curves:
            plt.figure(figsize=(7, 5))
            plt.title(title)
            plt.plot(np.arange(1, len(train_values) + 1), train_values, c='blue', label='Тренувальні дані')
            plt.plot(np.arange(1, len(test_values) + 1), test_values, c='red', label='Тестові дані')
            plt.legend()
            plt.xlabel('epoche')
            plt.ylabel(ylabel)
            plt.grid()
            plt.show()
# Labelled phrases: key is the text, value is True for positive sentiment and
# False for negative. Keys must be unique: a repeated key in a dict literal
# silently overwrites the earlier entry.
data = {
    'this is very sad': False,
    'this is very happy': True,
    'i am good not bad': True,
    'this is good not bad': True,
    'i am bad not good': False,
    'everything is great': True,
    'life is good': True,
    'good': True,
    'bad': False,
    'happy': True,
    'sad': False,
    'nothing is bad': True,
    'everything is bad': False,
    'i am extremely happy': True,
    'this is extremely bad': False,
    'this is moderately good': True,
    'i am not entirely sad': True,
    'this is incredibly good': True,
    'i feel so bad': False,
    'this is the worst': False,
    'this is awesome': True,
    'i feel wonderful': True,
    'this is absolutely terrible': False,
    'it is not okay': False,
    'this is quite good': True,
    'this is not so bad': True,
    'this is somewhat sad': False,
    'i am neither happy nor sad': False,
    'i feel neutral': False,
    'not good': False,
    'not bad': True,
    'not sad': True,
    'very good': True,
    'very bad': False,
    'very happy': True,
    'very sad': False,
    'this is good': True,
    'i am good': True,
    'this is bad': False,
    'i am sad': False,
    'this is sad': False,
    'i am not happy': False,
    'this is not good': False,
    'i am not bad': True,
    'this is not sad': True,
    'i am very happy': True,
    'this is very good': True,
    'i am very bad': False,
    'this makes me smile': True,
    'this breaks my heart': False,
    'i am overwhelmed with joy': True,
    'this situation is unbearable': False,
    'nothing could be better': True,
    'i am pleasantly surprised': True,
    'this is totally unacceptable': False,
    'i am proud of this': True,
    'this is shameful': False,
    'i am deeply disappointed': False,
    'this brings me hope': True,
    'i feel completely lost': False,
    'this is the happiest moment': True,
    'this ruins everything': False,
    'this makes everything better': True,
    'life feels meaningless': False,
    'i feel perfectly content': True,
    'this is utterly fantastic': True,
    'this is highly frustrating': False,
    'i feel like crying': False,
    'this moment is precious': True,
    'this is worse than expected': False,
    'this is a dream come true': True,
    'i have no words for this sadness': False,
    'this is better than perfect': True,
    'i feel so relaxed': True,
    'this is not worth it': False,
    'this inspires me': True,
    'this destroys my trust': False,
    'i can’t stop smiling': True,
    'i regret this decision': False,
    'i am extremely sad': False,
    'this makes me laugh': True,
    'this is painfully disappointing': False,
    'everything feels amazing': True,
    'i am incredibly thankful': True,
    'this is heartwarming': True,
    'this is completely unacceptable': False,
    'i feel devastated': False,
    'this makes my day': True,
    'this is totally frustrating': False,
    'i feel unbelievably great': True,
    'nothing feels worse': False,
    'this exceeds all expectations': True,
    'this is just awful': False,
    'i am full of hope': True,
    'this is emotionally draining': False,
    'i am cautiously optimistic': True,
    'this is beyond disappointing': False,
    'i feel amazingly calm': True,
    'this is such a relief': True,
    'i feel deeply sad': False,
    'this fills me with energy': True,
    'this completely destroys my mood': False,
    'i feel really proud': True,
    'this situation is infuriating': False,
    'i am fully satisfied': True,
    'this moment feels magical': True,
    'this is beyond terrible': False,
    'i feel on top of the world': True,
    'this breaks my confidence': False,
    'i am highly encouraged': True,
    'this is so disheartening': False,
    'i feel truly blessed': True,
    'this makes me cry tears of joy': True,
    'i feel utterly hopeless': False,
    'this is better than ever': True,
    'this ruins my plans': False,
    'this is pure happiness': True,
    'this leaves me speechless': True,
    'this crushes my soul': False,
    'this uplifts my spirit': True,
    'this is such a disappointment': False,
    'this fills me with pride': True,
    'this is happy': True,
    'this is not happy': False,
    'i am not good': False,
    'this is not bad': True,
    'i am not sad': True,
    'i am very good': True,
    'this is very bad': False,
    'i am very sad': False,
    'this is bad not good': False,
    'this is good and happy': True,
    'i am not good and not happy': False,
    'i am not at all sad': True,
    'this is not at all good': False,
    'this is not at all bad': True,
    'this is good right now': True,
    'this is very bad right now': False,
    'this was good earlier': True,
    'i was not happy and not good earlier': False,
    'everything seems fine': True,
    'this feels amazing': True,
    'life is not bad at all': True,
    'this is a bit sad': False,
    'i feel absolutely great': True,
    'nothing feels wrong': True,
    'everything is falling apart': False,
    'this is so wonderful': True,
    'i am completely satisfied': True,
    'this is the best ever': True,
    'i am somewhat disappointed': False,
    'this is mildly frustrating': False,
    'this is rather enjoyable': True,
    'nothing seems right': False,
    'this is just okay': False,
    'i feel happy but tired': True,
    'this is better than expected': True,
    'this is not as bad as it looks': True,
    'this fills me with hope': True,
    'this is disappointing beyond words': False,
    'i am overjoyed with this result': True,
    'this is an unacceptable situation': False,
    'this is unbelievably bad': False,
    'i feel like jumping for joy': True,
    'this situation is hopeless': False,
    'this makes me incredibly happy': True,
    'this is the worst feeling': False,
    'this moment is unforgettable': True,
    'this is unbearably sad': False,
    'this could not have gone better': True,
    'this makes me feel alive': True,
    'this is worse than i thought': False,
    'i am truly grateful for this': True,
    'this is a complete disaster': False,
    'this exceeded my expectations': True,
    'this is heartbreaking': False,
    'this gives me strength': True,
    'this destroys my confidence': False,
    'this is absolutely worth it': True,
    'this ruins my day': False,
}
# Build a shuffled (review, target) table from the labelled phrases. The
# shuffle is seeded like every other random step below so runs are repeatable.
df = pd.DataFrame(list(data.items()), columns=['review', 'target'])
df = df.sample(frac=1, random_state=42)
df['target'] = df['target'].astype(int)

# Balance the two classes by undersampling before preprocessing.
x, y = RandomUnderSampler(random_state=42).fit_resample(df[['review']], df['target'])
df = pd.DataFrame(x)
df['target'] = y
df['review'] = df['review'].apply(preprocess_text)

X, y = df['review'], df['target']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The vocabulary comes from the training split only. Sorting fixes the
# word -> index assignment, which would otherwise follow set iteration order.
vocab = sorted({word for review in x_train for word in review.split()})
vocab = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)
print(f"Розмір словника: {vocab_size}")
print(f"Розмір x_train: {x_train.shape}")
print(f"Розмір x_test: {x_test.shape}")

# RNN draws its initial weights from NumPy's global generator.
np.random.seed(42)
model = RNN(vocab=vocab, input_size=len(vocab), hidden_size=128)
model.train(x_train.values, y_train.values, epochs=1000, learning_rate=0.0001, x_test=x_test.values, y_test=y_test.values, early_stopping_rounds=10)

y_pred = model.predict(x_train.values)
accuracy = accuracy_score(y_train, y_pred)
print(f"Train accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_train, y_pred))

# y_pred is left holding the test-set predictions for the code that follows.
y_pred = model.predict(x_test.values)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test accuracy: {accuracy * 100:.2f}%")
print(classification_report(y_test, y_pred))

model.show_loss()
def show_predictions(x, y_true, y_pred):
    """Print an aligned table of texts with their real and predicted sentiment.

    Args:
        x: Sequence of texts.
        y_true: True label for each text; 1 means positive, anything else negative.
        y_pred: Predicted label for each text, same encoding.

    Column widths are derived from the longest cell, so every printed line has
    the same length. With no rows, only the header and closing rule are printed.
    """
    def phrase(label):
        return 'positive phrase' if label == 1 else 'negative phrase'

    headers = ('Text', 'Real', 'Predicted')
    rows = [
        (str(text), phrase(real), phrase(pred))
        for text, real, pred in zip(x, y_true, y_pred)
    ]
    widths = [
        max([len(header)] + [len(row[col]) for row in rows])
        for col, header in enumerate(headers)
    ]

    def render(cells):
        padded = (cell.ljust(width) for cell, width in zip(cells, widths))
        return '| ' + ' | '.join(padded) + ' |'

    header_line = render(headers)
    heavy_rule = '=' * len(header_line)
    light_rule = '-' * len(header_line)

    print(heavy_rule)
    print(header_line)
    print(heavy_rule)
    for row in rows:
        print(render(row))
        print(light_rule)
    print(heavy_rule)
# Boolean mask over the test set marking rows where the prediction differs
# from the true label; only those rows are printed.
predicted = np.array(y_pred)
mistakes = y_test.values != predicted
show_predictions(
    x_test.values[mistakes],
    y_test.values[mistakes],
    predicted[mistakes],
)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement