# Install required packages (if not already installed)
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("tm")) install.packages("tm")
if (!require("wordcloud2")) install.packages("wordcloud2")
if (!require("syuzhet")) install.packages("syuzhet")
if (!require("reshape2")) install.packages("reshape2", dependencies = TRUE)
library(tidyverse)
library(tm)
library(wordcloud2)
library(syuzhet)
library(reshape2)
# Load the full data (select the "twcs" file)
org_data <- read.csv(file.choose(), header = TRUE)
# Select the "sample" file
data <- read.csv(file.choose(), header = TRUE)
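# Optional sanity check before going further (a sketch; assumes the Kaggle
# "Customer Support on Twitter" layout with columns such as author_id,
# created_at, text and in_response_to_tweet_id -- adjust if your files differ):
# str(org_data)
# head(org_data$text, 3)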
################################################
# Calculate the frequency of each unique author_id
author_id_freq <- table(org_data$author_id)
# Convert author_id_freq to a data frame for filtering
author_id_df <- as.data.frame(author_id_freq)
# Keep non-empty, non-numeric author_ids (i.e. company handles) with at least 30,000 tweets
filtered_author_ids <- subset(author_id_df, Var1 != "" & Var1 != "NA" & Var1 != "NULL" & !grepl("^\\d+$", Var1) & Freq >= 30000)
# Create a bar plot
barplot(filtered_author_ids$Freq,
        names.arg = filtered_author_ids$Var1,
        las = 2,
        col = rainbow(nrow(filtered_author_ids)),
        main = "Companies by tweet frequency")
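# The same counts can be produced with dplyr (a sketch; equivalent to the
# table() + subset() steps above, not an additional analysis step):
# org_data %>%
#   count(author_id, sort = TRUE) %>%
#   filter(author_id != "", !grepl("^\\d+$", author_id), n >= 30000)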
# Keep only tweets authored by the "AdobeCares" support handle
abc_data <- subset(org_data, author_id == "AdobeCares")
# Optional: derive time, year and month columns from created_at
#abc_data$created_at <- as.POSIXct(abc_data$created_at, format = "%a %b %d %H:%M:%S %z %Y")
#abc_data$time <- format(abc_data$created_at, format = "%H:%M:%S")
#abc_data$year <- format(abc_data$created_at, "%Y")
#abc_data$month <- format(abc_data$created_at, "%m")
#abc_data$month <- as.integer(abc_data$month)
# Convert numeric month values to month names
#abc_data$month <- month.name[abc_data$month]
# Convert the year column to a factor
#abc_data$year <- as.factor(abc_data$year)
# Bar plot showing tweet counts for each month, grouped by year
#ggplot(abc_data, aes(x = factor(month), fill = "red")) +
#  geom_bar() +
#  labs(x = "Month", y = "Number of Tweets", title = "Tweet Counts by Month and Year") +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_wrap(~year)
# Build corpus (tm is already loaded above)
corpus <- iconv(abc_data$text)
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])
# Clean text
# The first cleaning step converts all text to lowercase via tm_map().
corpus <- tm_map(corpus, tolower)
inspect(corpus[1:5])
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])
# Remove English stopwords
clean_data <- tm_map(corpus, removeWords, stopwords('english'))
inspect(clean_data[1:5])
# Remove URLs
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
clean_data <- tm_map(clean_data, content_transformer(removeURL))
inspect(clean_data[1:5])
# Collapse extra whitespace
clean_data <- tm_map(clean_data, stripWhitespace)
inspect(clean_data[1:5])
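# Note: removePunctuation above only deletes the "@" symbol, so user handles
# remain in the corpus as plain words. To drop @mentions entirely, a pass like
# this would need to run before removePunctuation (a sketch, not part of the
# original pipeline):
# removeMention <- function(x) gsub('@\\w+', '', x)
# corpus <- tm_map(corpus, content_transformer(removeMention))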
# Build a term-document matrix from the cleaned corpus
tdm <- TermDocumentMatrix(clean_data)
tdm
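# High-frequency terms can also be listed directly on the sparse matrix,
# without converting it to a dense one (a sketch; 5000 mirrors the threshold
# used for the bar plot below):
# findFreqTerms(tdm, lowfreq = 5000)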
# Convert to a dense matrix for row sums
tdm <- as.matrix(tdm)
tdm[1:10, 1:20]
# Bar plot of words that occur at least 5000 times
w <- rowSums(tdm)
w <- subset(w, w >= 5000)
barplot(w,
        las = 2,
        col = rainbow(50),
        main = "Bar Plot of Word Frequencies")
# Install and load the required package to create a word cloud
if (!require("wordcloud")) install.packages("wordcloud", dependencies = TRUE)
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222)
wordcloud(words = names(w),
          freq = w,
          max.words = 150,
          random.order = FALSE,
          min.freq = 5,
          colors = brewer.pal(8, 'Dark2'),
          scale = c(5, 0.3),
          rot.per = 0.7)
# Word cloud using the wordcloud2 package (installed and loaded above)
w <- data.frame(word = names(w), freq = w)
wordcloud2(w,
           size = 0.7,
           shape = 'triangle',
           rotateRatio = 0.5,
           minSize = 1)
# Sentiment analysis with the syuzhet package (NRC emotion lexicon)
tweets <- iconv(abc_data$text)
s <- get_nrc_sentiment(tweets)
head(s)
# Convert colSums(s) to a data frame, under a fresh name so the "sample"
# data frame loaded above is not overwritten
nrc_counts <- data.frame(word = names(colSums(s)), count = colSums(s))
# Reorder word based on count
nrc_counts$word <- factor(nrc_counts$word, levels = nrc_counts$word[order(-nrc_counts$count)])
# Create the bar plot using ggplot2
ggplot(nrc_counts, aes(x = word, y = count)) +
  geom_bar(stat = "identity", fill = rainbow(10)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Words", y = "Count", title = "Sentiment Scores of Tweets")
# Analyzing sentiments using the syuzhet package based on the NRC sentiment dictionary
text_df <- tibble(text = str_to_lower(abc_data$text))
emotions <- get_nrc_sentiment(text_df$text)
emo_bar <- colSums(emotions)
emo_sum <- data.frame(count = emo_bar, emotion = names(emo_bar))
# Create a bar plot showing counts for each of the different emotions and the positive/negative ratings
ggplot(emo_sum, aes(x = reorder(emotion, -count), y = count, fill = emotion)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  labs(title = "Different emotions with frequency", x = "Emotions")
# Sentiment analysis with the tidytext package using the "bing" lexicon
library(tidytext)
bing_word_counts <- text_df %>%
  unnest_tokens(output = word, input = text) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
# Select the top 10 words by sentiment
bing_top_10_words_by_sentiment <- bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
bing_top_10_words_by_sentiment
# Create a bar plot showing the contribution of words to sentiment
bing_top_10_words_by_sentiment %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  theme_light() +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
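# reshape2 is loaded at the top but otherwise unused; its usual companion to a
# bing word count is a comparison cloud (a sketch built on bing_word_counts
# above, using comparison.cloud() from the wordcloud package):
# bing_word_counts %>%
#   acast(word ~ sentiment, value.var = "n", fill = 0) %>%
#   comparison.cloud(colors = c("red", "darkgreen"), max.words = 100)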
# Sentiment analysis with the tidytext package using the "loughran" lexicon
loughran_word_counts <- text_df %>%
  unnest_tokens(output = word, input = text) %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE)
# Select the top 10 words by sentiment
loughran_top_10_words_by_sentiment <- loughran_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
loughran_top_10_words_by_sentiment
# Create a bar plot showing the contribution of words to sentiment
loughran_top_10_words_by_sentiment %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  theme_light() +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()
#############################################################################
# Convert created_at to datetime
data$created_at <- as.POSIXct(data$created_at, format = "%a %b %d %H:%M:%S %z %Y")
# Arrange data by created_at
data <- data %>% arrange(created_at)
# Time difference between consecutive tweets in minutes, rounded
# (diff() on POSIXct picks its own unit, so convert explicitly)
data$time_difference_minutes <- c(NA, round(as.numeric(diff(data$created_at), units = "mins")))
# Show the result
print(data)
# Create a histogram of time differences between consecutive tweets
ggplot(data, aes(x = time_difference_minutes)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Time Differences Between Consecutive Tweets",
       x = "Time Difference (Minutes)",
       y = "Frequency") +
  theme_minimal()
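# Reply gaps are usually heavily right-skewed, so most bars collapse near zero.
# A log10 x-axis can make the bulk of the distribution visible (a sketch, not
# part of the original analysis):
# ggplot(data, aes(x = time_difference_minutes)) +
#   geom_histogram(bins = 50, fill = "skyblue", color = "black") +
#   scale_x_log10()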
# Perform sentiment analysis on the text data (AFINN lexicon via syuzhet)
sentiment_scores <- get_sentiment(data$text, method = "afinn")
# Merge sentiment scores with the original data
data_with_sentiments <- cbind(data, sentiment_scores)
# Classify sentiment into categories: positive, negative, neutral
data_with_sentiments$sentiment_category <- ifelse(data_with_sentiments$sentiment_scores > 0, "Positive",
                                           ifelse(data_with_sentiments$sentiment_scores < 0, "Negative", "Neutral"))
# Calculate time difference between tweet and reply
data_with_sentiments <- data_with_sentiments %>%
  mutate(reply_time = as.numeric(difftime(created_at, lag(created_at), units = "mins")))
# Analyze the average reply time based on sentiment category
reply_time_summary <- data_with_sentiments %>%
  filter(!is.na(in_response_to_tweet_id)) %>%
  group_by(sentiment_category) %>%
  summarise(avg_reply_time = mean(reply_time, na.rm = TRUE))
# Print the summary
print(reply_time_summary)
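# Means are sensitive to a few very long gaps; a robust alternative (a sketch,
# not part of the original analysis) summarises the median instead:
# data_with_sentiments %>%
#   filter(!is.na(in_response_to_tweet_id)) %>%
#   group_by(sentiment_category) %>%
#   summarise(med_reply_time = median(reply_time, na.rm = TRUE))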
##########################################
# Plot the average reply time based on sentiment category
# (the hard-coded fill assumes all three categories are present)
ggplot(reply_time_summary, aes(x = sentiment_category, y = avg_reply_time)) +
  geom_bar(stat = "identity", fill = c("red", "blue", "green")) +
  labs(title = "Average Reply Time by Sentiment Category",
       x = "Sentiment Category",
       y = "Average Reply Time (minutes)") +
  theme_minimal()
####################### THE END ##############################