Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
def tokenize(review):
    """Split a review into lowercase word tokens.

    The text is lowercased, every character that is not an ASCII letter,
    digit, underscore or whitespace is removed, and the remainder is split
    on runs of whitespace.

    Args:
        review: The review text as a ``str``.

    Returns:
        A list of non-empty token strings (possibly empty).
    """
    # Keep *all* whitespace (not just ' ') so that words separated by
    # newlines or tabs stay separate tokens instead of being glued together.
    cleaned = re.sub(r'[^a-z0-9_\s]', '', review.lower())
    # str.split() with no argument splits on any whitespace run and never
    # yields empty strings, so no extra filtering is needed.
    return cleaned.split()
# Load the reviews and split them by the value of 'review_score'.
data = pd.read_csv('dataset.csv')
# .copy() gives independent frames, so the column assignments below are
# guaranteed to take effect and do not raise SettingWithCopyWarning.
pos_data = data[data['review_score'] == 1].copy()
neg_data = data[data['review_score'] == -1].copy()
# Relabel negative reviews from -1 to 0.
neg_data['review_score'] = 0

total_len = len(data)
num_pos = len(pos_data)
num_neg = len(neg_data)
# Guard against an empty dataset so the summary never divides by zero.
pos_pct = num_pos / total_len * 100 if total_len else 0.0
neg_pct = num_neg / total_len * 100 if total_len else 0.0
print(f"Positive Reviews: {num_pos}/{total_len} ({pos_pct:.2f}%)")
print(f"Negative Reviews: {num_neg}/{total_len} ({neg_pct:.2f}%)")

# Unfiltered per-class dumps.
pos_data.to_csv('data_positive.csv', encoding='utf-8', index=False)
neg_data.to_csv('data_negative.csv', encoding='utf-8', index=False)

MIN_SEQ_LENGTH = 50
MAX_SEQ_LENGTH = 500

# Keep only rows whose tokenized review_text has between MIN_SEQ_LENGTH and
# MAX_SEQ_LENGTH tokens (inclusive). Values are converted with str() first so
# that non-string cells (e.g. missing values) can be tokenized.
pos_data['review_text'] = pos_data['review_text'].apply(str)
p = pos_data['review_text'].apply(lambda text: len(tokenize(text)))
pos_data = pos_data[(p >= MIN_SEQ_LENGTH) & (p <= MAX_SEQ_LENGTH)]

neg_data['review_text'] = neg_data['review_text'].apply(str)
n = neg_data['review_text'].apply(lambda text: len(tokenize(text)))
neg_data = neg_data[(n >= MIN_SEQ_LENGTH) & (n <= MAX_SEQ_LENGTH)]

# Length-filtered per-class dumps.
pos_data.to_csv('data_positive_50to500.csv', encoding='utf-8', index=False)
neg_data.to_csv('data_negative_50to500.csv', encoding='utf-8', index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement