import re
import pickle

import pandas as pd

# Load the dataset and keep the review text and score columns.
data = pd.read_csv('dataset.csv')
Xy = data[['review_text', 'review_score']]

def tokenize(review):
    """Lowercase a review, strip punctuation, and split it into word tokens."""
    review = review.lower()
    review = re.sub(r'[^a-zA-Z0-9_ ]', '', review)
    review = review.split(' ')
    review = list(filter(lambda x: x != '', review))
    return review
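
# Quick sanity check of tokenize() on a made-up review string
# (illustrative input only, not taken from dataset.csv):
# print(tokenize("Great value, would buy again!"))
# -> ['great', 'value', 'would', 'buy', 'again']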

# Count word frequencies over the whole corpus.
voc = {}
special_tokens = ['<eos>', '<pad>', '<unk>']
for review in Xy['review_text']:
    review = tokenize(str(review))
    for word in review:
        voc[word] = 1 + voc.get(word, 0)
print(f"Vocabulary length: {len(voc)}")

# Keep only the most frequent words, reserving slots for the special tokens.
voc = {k: v for k, v in sorted(voc.items(), reverse=True, key=lambda item: item[1])}
N_voc = 3000
voc = dict(list(voc.items())[:N_voc - len(special_tokens)])
for special_token in special_tokens:
    voc[special_token] = 1
print(f"Final vocabulary length: {len(voc)}")

# Tokenizer: index-to-token list and token-to-index mapping.
i2t = list(voc.keys())
with open('voc', 'wb') as fp:
    pickle.dump(i2t, fp)
t2i = {t: i for i, t in enumerate(i2t)}
eos_token_id = t2i['<eos>']
pad_token_id = t2i['<pad>']
unk_token_id = t2i['<unk>']
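
# Minimal sketch (not part of the original paste) of one way to map a review to a
# fixed-length sequence of token ids with this vocabulary: out-of-vocabulary words
# become <unk>, the sequence is terminated with <eos>, and <pad> fills the rest.
# The function name `encode` and the `max_len` parameter are illustrative assumptions.
def encode(review, max_len=128):
    ids = [t2i.get(word, unk_token_id) for word in tokenize(str(review))]
    ids = ids[:max_len - 1] + [eos_token_id]      # truncate, then terminate with <eos>
    ids += [pad_token_id] * (max_len - len(ids))  # right-pad up to max_len
    return ids

# Example usage: encode every review in the dataset.
# encoded = [encode(r) for r in Xy['review_text']]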