Advertisement
iSach

Untitled

Jan 2nd, 2024
38
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.39 KB | None | 0 0
  1. def tokenize(review):
  2. review = review.lower()
  3. review = re.sub(r'[^a-zA-Z0-9_ ]', '', review)
  4. review = review.split(' ')
  5. review = list(filter(lambda x: x != '', review))
  6. return review
  7.  
  8. data = pd.read_csv('dataset.csv')
  9.  
  10. pos_data = data[data['review_score'] == 1]
  11. neg_data = data[data['review_score'] == -1]
  12. neg_data.loc[:, 'review_score'] = 0
  13.  
  14. total_len = len(data)
  15. num_pos = len(pos_data)
  16. num_neg = len(neg_data)
  17.  
  18. print(f"Positive Reviews: {num_pos}/{total_len} ({(num_pos/total_len * 100):.2f}%)")
  19. print(f"Positive Reviews: {num_neg}/{total_len} ({(num_neg/total_len * 100):.2f}%)")
  20.  
  21. pos_data.to_csv('data_positive.csv',encoding='utf-8', index=False)
  22. neg_data.to_csv('data_negative.csv',encoding='utf-8', index=False)
  23.  
  24. MIN_SEQ_LENGTH = 50
  25. MAX_SEQ_LENGTH = 500
  26.  
  27. # Remove data in pos_data where length of (tokenized) review_text > 500
  28. pos_data['review_text'] = pos_data['review_text'].apply(str)
  29. p = pos_data['review_text'].apply(lambda x: len(tokenize(x)))
  30. pos_data = pos_data[(p >= MIN_SEQ_LENGTH) & (p <= MAX_SEQ_LENGTH)]
  31.  
  32. neg_data['review_text'] = neg_data['review_text'].apply(str)
  33. n = neg_data['review_text'].apply(lambda x: len(tokenize(x)))
  34. neg_data = neg_data[(n >= MIN_SEQ_LENGTH) & (n <= MAX_SEQ_LENGTH)]
  35.  
  36. pos_data.to_csv('data_positive_50to500.csv',encoding='utf-8', index=False)
  37. neg_data.to_csv('data_negative_50to500.csv',encoding='utf-8', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement