Advertisement
mirosh111000

pr11(RNN)

Nov 28th, 2024
37
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 15.57 KB | None | 0 0
  1. import numpy as np
  2. import os
  3. import datetime as dt
  4. import pandas as pd
  5. from sklearn.model_selection import train_test_split
  6. from imblearn.under_sampling import RandomUnderSampler
  7. from sklearn.preprocessing import MinMaxScaler
  8. from sklearn.model_selection import train_test_split
  9. from sklearn.feature_extraction.text import CountVectorizer
  10. import re
  11. import nltk
  12. from nltk.corpus import stopwords
  13. from nltk.tokenize import word_tokenize
  14. from nltk.stem import WordNetLemmatizer
  15. from sklearn.metrics import accuracy_score, classification_report
  16.  
  17. nltk.download('punkt')
  18. nltk.download('stopwords')
  19. nltk.download('wordnet')
  20.  
  21. def preprocess_text(text):
  22.     text = text.lower()  
  23.     text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
  24.     text = re.sub(r'\d+', '', text)  
  25.     text = re.sub(r'[^\w\s]', '', text)
  26.     words = word_tokenize(text)
  27.     stop_words = set(stopwords.words('english'))  
  28.     words = [word for word in words if word not in stop_words]
  29.     lemmatizer = WordNetLemmatizer()
  30.     words = [lemmatizer.lemmatize(word) for word in words]
  31.     return ' '.join(words)
  32.  
  33. def create_inputs(text, vocab):
  34.     inputs = []
  35.     for w in text.split():
  36.         v = np.zeros((len(vocab), 1))
  37.         if w in vocab:
  38.             v[vocab[w]] = 1
  39.         inputs.append(v)
  40.     return np.array(inputs)
  41.  
  42.  
  43. class RNN:
  44.     def __init__(self, vocab, input_size, hidden_size, output_size=2):
  45.         self.vocab = vocab
  46.         self.input_size = input_size
  47.         self.hidden_size = hidden_size
  48.         self.output_size = output_size  
  49.  
  50.         self.Wxh = np.random.randn(self.hidden_size, self.input_size) * np.sqrt(1 / self.input_size)
  51.         self.Whh = np.random.randn(self.hidden_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)
  52.         self.Why = np.random.randn(self.output_size, self.hidden_size) * np.sqrt(1 / self.hidden_size)  
  53.         self.bh = np.zeros((self.hidden_size, 1))  
  54.         self.by = np.zeros((self.output_size, 1))
  55.         self.last_hs = None
  56.         self.train_loss = []
  57.         self.test_loss = []
  58.         self.train_accuracy = []
  59.         self.test_accuracy = []
  60.  
  61.     def tanh(self, x):
  62.         return np.tanh(x)
  63.  
  64.     def softmax(self, x):
  65.         return np.exp(x) / np.sum(np.exp(x), axis=0)
  66.  
  67.     def forward(self, inputs):
  68.         h = np.zeros((self.hidden_size, 1))
  69.         self.last_hs = [h]
  70.         for i in range(inputs.shape[0]):  
  71.             x = inputs[i].reshape(-1, 1)  
  72.             h = self.tanh(np.dot(self.Wxh, x) + np.dot(self.Whh, h) + self.bh)  
  73.             self.last_hs.append(h)
  74.            
  75.         y = np.dot(self.Why, h) + self.by
  76.         y = self.softmax(y)
  77.         return y, h
  78.  
  79.     def backward(self, inputs, y_hat, y):
  80.  
  81.         n = len(inputs)
  82.        
  83.         dLdy = y_hat
  84.         dLdy[y] -= 1
  85.        
  86.         dWhy = np.dot(dLdy, self.last_hs[n].T)
  87.         dby = dLdy
  88.  
  89.         dWhh = np.zeros(self.Whh.shape)
  90.         dWxh = np.zeros(self.Wxh.shape)
  91.         dbh = np.zeros(self.bh.shape)
  92.  
  93.         dh = np.dot(self.Why.T, dLdy)
  94.  
  95.         for t in reversed(range(n)):
  96.  
  97.             dhraw = (1 - self.last_hs[t+1]**2) * dh
  98.  
  99.             dbh += dhraw
  100.             dWhh += np.dot(dhraw, self.last_hs[t].T)
  101.             dWxh += np.dot(dhraw, inputs[t].T)
  102.             dh = np.dot(self.Whh, dhraw)
  103.  
  104.         for d in [dWxh, dWhh, dWhy, dbh, dby]:
  105.             np.clip(d, -1, 1, out=d)
  106.  
  107.         return dWxh, dWhh, dWhy, dbh, dby
  108.  
  109.     def update_parameters(self, dWxh, dWhh, dWhy, dbh, dby, learning_rate=0.001):
  110.         self.Wxh -= learning_rate * dWxh
  111.         self.Whh -= learning_rate * dWhh
  112.         self.Why -= learning_rate * dWhy
  113.         self.bh -= learning_rate * dbh
  114.         self.by -= learning_rate * dby
  115.  
  116.     def predict(self, x_test):
  117.         predictions = []
  118.         for inp in x_test:
  119.             inputs = create_inputs(inp, self.vocab)
  120.             y_hat, _ = self.forward(inputs)
  121.             predictions.append(np.argmax(y_hat))
  122.         return predictions
  123.  
  124.     def train(self, x_train, y_train, epochs=1000, learning_rate=0.001, x_test=None, y_test=None, early_stopping_rounds=np.inf):
  125.         start_time = dt.datetime.now()
  126.         for epoch in range(epochs):
  127.            
  128.             total_loss = 0    
  129.             for i in range(len(x_train)):
  130.                 inputs = create_inputs(x_train[i], self.vocab)
  131.                 target = y_train[i]
  132.                 y_hat, h = self.forward(inputs)
  133.                 loss = -np.sum(np.log(y_hat[target]))
  134.                 total_loss += loss
  135.    
  136.                 dWxh, dWhh, dWhy, dbh, dby = self.backward(inputs, y_hat, target)
  137.    
  138.                 self.update_parameters(dWxh, dWhh, dWhy, dbh, dby, learning_rate)
  139.  
  140.             self.train_loss.append(total_loss/len(y_train))
  141.             self.train_accuracy.append(accuracy_score(y_train, self.predict(x_train))*100)
  142.            
  143.             total_loss = 0
  144.             for i in range(len(x_test)):
  145.                 inputs = create_inputs(x_test[i], self.vocab)
  146.                 target = y_test[i]
  147.                 y_hat, _ = self.forward(inputs)
  148.                 loss = -np.sum(np.log(y_hat[target]))
  149.                 total_loss += loss
  150.  
  151.             self.test_loss.append(total_loss/len(y_test))
  152.             self.test_accuracy.append(accuracy_score(y_test, self.predict(x_test))*100)
  153.  
  154.             prcnt = (epoch+1)/epochs * 100
  155.             print(f'№{epoch+1}/{epochs} - {round(prcnt, 2)}% | total time: {dt.datetime.now() - start_time} | time remaining: {(dt.datetime.now() - start_time) / prcnt * (100 - prcnt)} | end time: {dt.datetime.now() + (dt.datetime.now() - start_time) / prcnt * (100 - prcnt)}', end='\r')
  156.             os.system('cls' if os.name == 'nt' else 'clear')
  157.  
  158.  
  159.             if epoch + 1 > early_stopping_rounds:
  160.                 if np.argmin(self.test_loss) < epoch - early_stopping_rounds:
  161.                     print(f'Зупинка на епохі {epoch+1}')
  162.                     break
  163.  
  164.  
  165.     def show_loss(self):
  166.         plt.figure(figsize=(7, 5))
  167.         plt.title('Втрати')
  168.         plt.plot(np.arange(1, len(self.train_loss)+1), self.train_loss, c='blue', label='Тренувальні дані')
  169.         plt.plot(np.arange(1, len(self.test_loss)+1), self.test_loss, c='red', label='Тестові дані')
  170.         plt.legend()
  171.         plt.xlabel('epoche')
  172.         plt.ylabel('loss')
  173.         plt.grid()
  174.         plt.show()
  175.  
  176.         plt.figure(figsize=(7, 5))
  177.         plt.title('Точність')
  178.         plt.plot(np.arange(1, len(self.train_accuracy)+1), self.train_accuracy, c='blue', label='Тренувальні дані')
  179.         plt.plot(np.arange(1, len(self.test_accuracy)+1), self.test_accuracy, c='red', label='Тестові дані')
  180.         plt.legend()
  181.         plt.xlabel('epoche')
  182.         plt.ylabel('accuracy')
  183.         plt.grid()
  184.         plt.show()
  185.  
  186.  
  187.  
  188. data = {
  189.     'this is very sad': False,
  190.     'this is very happy': True,
  191.     'i am good not bad': True,
  192.     'this is good not bad': True,
  193.     'i am bad not good': False,
  194.     'everything is great': True,
  195.     'life is good': True,
  196.     'good': True,
  197.     'bad': False,
  198.     'happy': True,
  199.     'sad': False,
  200.     'nothing is bad': True,
  201.     'everything is bad': False,
  202.     'i am extremely happy': True,
  203.     'this is extremely bad': False,
  204.     'this is moderately good': True,
  205.     'i am not entirely sad': True,
  206.     'this is incredibly good': True,
  207.     'i feel so bad': False,
  208.     'this is the worst': False,
  209.     'this is awesome': True,
  210.     'i feel wonderful': True,
  211.     'this is absolutely terrible': False,
  212.     'it is not okay': False,
  213.     'this is quite good': True,
  214.     'this is not so bad': True,
  215.     'this is somewhat sad': False,
  216.     'i am neither happy nor sad': False,
  217.     'i feel neutral': False,
  218.     'not good': False,
  219.     'not bad': True,
  220.     'not sad': True,
  221.     'very good': True,
  222.     'very bad': False,
  223.     'very happy': True,
  224.     'very sad': False,
  225.     'this is good': True,
  226.     'i am good': True,
  227.     'this is bad': False,
  228.     'i am sad': False,
  229.     'this is sad': False,
  230.     'i am not happy': False,
  231.     'this is not good': False,
  232.     'i am not bad': True,
  233.     'this is not sad': True,
  234.     'i am very happy': True,
  235.     'this is very good': True,
  236.     'i am very bad': False,
  237.     'this makes me smile': True,
  238.     'this breaks my heart': False,
  239.     'i am overwhelmed with joy': True,
  240.     'this situation is unbearable': False,
  241.     'nothing could be better': True,
  242.     'i am pleasantly surprised': True,
  243.     'this is totally unacceptable': False,
  244.     'i am proud of this': True,
  245.     'this is shameful': False,
  246.     'i am deeply disappointed': False,
  247.     'this brings me hope': True,
  248.     'i feel completely lost': False,
  249.     'this is the happiest moment': True,
  250.     'this ruins everything': False,
  251.     'this makes everything better': True,
  252.     'life feels meaningless': False,
  253.     'i feel perfectly content': True,
  254.     'this is utterly fantastic': True,
  255.     'this is highly frustrating': False,
  256.     'i feel like crying': False,
  257.     'this moment is precious': True,
  258.     'this is worse than expected': False,
  259.     'this is a dream come true': True,
  260.     'i have no words for this sadness': False,
  261.     'this is better than perfect': True,
  262.     'i feel so relaxed': True,
  263.     'this is not worth it': False,
  264.     'this inspires me': True,
  265.     'this destroys my trust': False,
  266.     'i can’t stop smiling': True,
  267.     'i regret this decision': False,
  268.     'i am extremely sad': False,
  269.     'this makes me laugh': True,
  270.     'this is painfully disappointing': False,
  271.     'everything feels amazing': True,
  272.     'i am incredibly thankful': True,
  273.     'this is heartwarming': True,
  274.     'this is completely unacceptable': False,
  275.     'i feel devastated': False,
  276.     'this makes my day': True,
  277.     'this is totally frustrating': False,
  278.     'i feel unbelievably great': True,
  279.     'nothing feels worse': False,
  280.     'this exceeds all expectations': True,
  281.     'this is just awful': False,
  282.     'i am full of hope': True,
  283.     'this is emotionally draining': False,
  284.     'i am cautiously optimistic': True,
  285.     'this is beyond disappointing': False,
  286.     'i feel amazingly calm': True,
  287.     'this is such a relief': True,
  288.     'i feel deeply sad': False,
  289.     'this fills me with energy': True,
  290.     'this completely destroys my mood': False,
  291.     'i feel really proud': True,
  292.     'this situation is infuriating': False,
  293.     'i am fully satisfied': True,
  294.     'this moment feels magical': True,
  295.     'this is beyond terrible': False,
  296.     'i feel on top of the world': True,
  297.     'this breaks my confidence': False,
  298.     'i am highly encouraged': True,
  299.     'this is so disheartening': False,
  300.     'i feel truly blessed': True,
  301.     'this makes me cry tears of joy': True,
  302.     'i feel utterly hopeless': False,
  303.     'this is better than ever': True,
  304.     'this ruins my plans': False,
  305.     'this is pure happiness': True,
  306.     'this leaves me speechless': True,
  307.     'this crushes my soul': False,
  308.     'this uplifts my spirit': True,
  309.     'this is such a disappointment': False,
  310.     'this fills me with pride': True,
  311.     'this is happy': True,
  312.     'i am good': True,
  313.     'this is not happy': False,
  314.     'i am not good': False,
  315.     'this is not bad': True,
  316.     'i am not sad': True,
  317.     'i am very good': True,
  318.     'this is very bad': False,
  319.     'i am very sad': False,
  320.     'this is bad not good': False,
  321.     'this is good and happy': True,
  322.     'i am not good and not happy': False,
  323.     'i am not at all sad': True,
  324.     'this is not at all good': False,
  325.     'this is not at all bad': True,
  326.     'this is good right now': True,
  327.     'this is very bad right now': False,
  328.     'this was good earlier': True,
  329.     'i was not happy and not good earlier': False,
  330.     'everything seems fine': True,
  331.     'this feels amazing': True,
  332.     'life is not bad at all': True,
  333.     'this is a bit sad': False,
  334.     'i feel absolutely great': True,
  335.     'nothing feels wrong': True,
  336.     'everything is falling apart': False,
  337.     'this is so wonderful': True,
  338.     'i am completely satisfied': True,
  339.     'this is the best ever': True,
  340.     'i am somewhat disappointed': False,
  341.     'this is mildly frustrating': False,
  342.     'this is rather enjoyable': True,
  343.     'nothing seems right': False,
  344.     'this is just okay': False,
  345.     'i feel happy but tired': True,
  346.     'this is better than expected': True,
  347.     'this is not as bad as it looks': True,
  348.     'this fills me with hope': True,
  349.     'this is disappointing beyond words': False,
  350.     'i am overjoyed with this result': True,
  351.     'this is an unacceptable situation': False,
  352.     'this is unbelievably bad': False,
  353.     'i feel like jumping for joy': True,
  354.     'this situation is hopeless': False,
  355.     'this makes me incredibly happy': True,
  356.     'this is the worst feeling': False,
  357.     'this moment is unforgettable': True,
  358.     'this is unbearably sad': False,
  359.     'this could not have gone better': True,
  360.     'this makes me feel alive': True,
  361.     'this is worse than i thought': False,
  362.     'i am truly grateful for this': True,
  363.     'this is a complete disaster': False,
  364.     'this exceeded my expectations': True,
  365.     'this is heartbreaking': False,
  366.     'this gives me strength': True,
  367.     'this destroys my confidence': False,
  368.     'this is absolutely worth it': True,
  369.     'this ruins my day': False,
  370. }
  371.  
  372.  
  373. df = pd.DataFrame(list(data.items()), columns=['review', 'target'])
  374. df =  df.sample(frac=1)
  375. df['target'] = df['target'].apply(lambda x: 1 if x else 0)
  376. x, y = RandomUnderSampler(random_state=42).fit_resample(df[['review']], df['target'])
  377. df = pd.DataFrame(x)
  378. df['target'] = y
  379. df['review'] = df['review'].apply(preprocess_text)
  380.  
  381. X, y = df['review'], df['target']
  382. x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  383.  
  384. vocab = set(word for review in x_train for word in review.split())
  385. vocab = {word: idx for idx, word in enumerate(vocab)}
  386.  
  387. vocab_size = len(vocab)
  388. print(f"Розмір словника: {vocab_size}")
  389. print(f"Розмір x_train: {x_train.shape}")
  390. print(f"Розмір x_test: {x_test.shape}")
  391.  
  392. model = RNN(vocab=vocab, input_size=len(vocab), hidden_size=128)
  393. model.train(x_train.values, y_train.values, epochs=1000, learning_rate=0.0001, x_test=x_test.values, y_test=y_test.values, early_stopping_rounds=10)
  394.  
  395. y_pred = model.predict(x_train.values)
  396. accuracy = accuracy_score(y_train, y_pred)
  397. print(f"Train accuracy: {accuracy * 100:.2f}%")
  398. print(classification_report(y_train, y_pred))
  399.  
  400. y_pred = model.predict(x_test.values)
  401. accuracy = accuracy_score(y_test, y_pred)
  402. print(f"Test accuracy: {accuracy * 100:.2f}%")
  403. print(classification_report(y_test, y_pred))
  404.  
  405. model.show_loss()
  406.  
  407.  
  408. def show_predictions(x, y_true, y_pred):
  409.     def phrase(y):
  410.         if y == 1: return 'positive phrase'
  411.         else: return 'negative phrase'
  412.  
  413.     text_len = len('                    Text                    ')
  414.     print(f'='*len(f'|                    Text                    |       Real      |     Predicted   |'))
  415.     print(f'|                    Text                    |       Real      |     Predicted   |')
  416.     print(f'='*len(f'|                    Text                    |       Real      |     Predicted   |'))
  417.     for i in range(len(x)):
  418.         len_plus = text_len - len(x[i]) - 1
  419.         plus = ' '*len_plus
  420.         print(f'| {x[i]}{plus}| {phrase(y_true[i])} | {phrase(y_pred[i])} |')
  421.         print(f'-'*len(f'|                    Text                    |       Real      |     Predicted   |'))
  422.     print(f'='*len(f'|                    Text                    |       Real      |     Predicted   |'))
  423.  
  424.  
  425. mistakes = y_test.values != y_pred
  426. show_predictions(x_test.values[mistakes], y_test.values[mistakes], np.array(y_pred)[mistakes])
  427.  
  428.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement