Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Build a TF-IDF matrix from the pre-lemmatized texts stored in the
# `lemm_text` column of tweets_lemm.csv and print the matrix shape.
import nltk
import pandas as pd
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv("/datasets/tweets_lemm.csv")

# Cast to a NumPy unicode array so every entry is a str; missing values
# become the literal string 'nan' instead of a float the vectorizer
# would fail to tokenize.
corpus = data['lemm_text'].values.astype('U')

# The stop-word corpus has to be present locally before it can be read.
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('russian'))

# scikit-learn documents `stop_words` as 'english', a list, or None;
# releases that validate constructor parameters reject a set, so a list
# is handed over. Sorting keeps the vectorizer's parameters deterministic.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))
tf_idf = count_tf_idf.fit_transform(corpus)

print("Размер матрицы:", tf_idf.shape)
Add Comment
Please, Sign In to add comment