import re
import pickle

import pandas as pd

data = pd.read_csv('dataset.csv')

Xy = data[['review_text', 'review_score']]

def tokenize(review):
    # Lowercase, strip everything except letters, digits, underscores and spaces,
    # then split on spaces and drop empty tokens.
    review = review.lower()
    review = re.sub(r'[^a-zA-Z0-9_ ]', '', review)
    review = review.split(' ')
    review = list(filter(lambda x: x != '', review))
    return review

# Count word frequencies over the whole corpus.
voc = {}
special_tokens = ['<eos>', '<pad>', '<unk>']
for review in Xy['review_text']:
    review = tokenize(str(review))
    for word in review:
        voc[word] = 1 + voc.get(word, 0)

print(f"Vocabulary length: {len(voc)}")

# Filter out less frequent words: sort by count (descending), then truncate,
# reserving slots for the special tokens.
voc = {k: v for k, v in sorted(voc.items(), reverse=True, key=lambda item: item[1])}

N_voc = 3000
voc = dict(list(voc.items())[:N_voc - len(special_tokens)])
for special_token in special_tokens:
    voc[special_token] = 1

print(f"Final vocabulary length: {len(voc)}")

# Tokenizer: index <-> token lookup tables, with the index list persisted to disk.
i2t = list(voc.keys())
with open('voc', 'wb') as fp:
    pickle.dump(i2t, fp)
t2i = {i2t[i]: i for i in range(len(i2t))}
eos_token_id = t2i['<eos>']
pad_token_id = t2i['<pad>']
unk_token_id = t2i['<unk>']
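
# --- Usage sketch (not part of the original paste) ---
# A minimal example of how the t2i mapping built above could be used to turn a
# review into a fixed-length list of token ids. The encode_review helper and the
# max_len value are hypothetical, added only for illustration.
def encode_review(review, max_len=128):
    # Map each token to its id, falling back to <unk> for out-of-vocabulary words.
    ids = [t2i.get(token, unk_token_id) for token in tokenize(str(review))]
    # Truncate, append <eos>, then right-pad with <pad> to max_len.
    ids = ids[:max_len - 1] + [eos_token_id]
    ids += [pad_token_id] * (max_len - len(ids))
    return ids

X_ids = [encode_review(r) for r in Xy['review_text']]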