DMA2 project (May 7th, 2024)
# Install required packages (if not already installed)
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("tm")) install.packages("tm")
if (!require("wordcloud2")) install.packages("wordcloud2")
if (!require("syuzhet")) install.packages("syuzhet")
if (!require("reshape2")) install.packages("reshape2", dependencies = TRUE)

library(tidyverse)
library(tm)
library(wordcloud2)
library(syuzhet)
library(reshape2)

# Load the full dataset (select the "twcs" file when prompted)
org_data <- read.csv(file.choose(), header = TRUE)

# Load the smaller sample dataset (select the "sample" file when prompted)
data <- read.csv(file.choose(), header = TRUE)
################################################
# Calculate the frequency of each unique author_id
author_id_freq <- table(org_data$author_id)

# Convert author_id_freq to a data frame for filtering
author_id_df <- as.data.frame(author_id_freq)

# Keep non-numeric author_ids (company handles) that posted at least 30,000 tweets
filtered_author_ids <- subset(author_id_df, Var1 != "" & Var1 != "NA" & Var1 != "NULL" & !grepl("^\\d+$", Var1) & Freq >= 30000)

# Create a bar plot of tweet counts per company
barplot(filtered_author_ids$Freq,
        names.arg = filtered_author_ids$Var1,
        las = 2,
        col = rainbow(nrow(filtered_author_ids)),
        main = "Companies by tweet frequency")
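
# Optional: the same chart drawn with ggplot2 (already attached via tidyverse).
# A minimal sketch, not part of the original script, assuming filtered_author_ids
# still has the Var1 (author_id) and Freq columns created above.
ggplot(filtered_author_ids, aes(x = reorder(Var1, -Freq), y = Freq, fill = Var1)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Company", y = "Number of tweets", title = "Companies by tweet frequency") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))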


# Filter the tweets posted by a single company account (here "AdobeCares")
abc_data <- subset(org_data, author_id == "AdobeCares")


# Convert created_at to a POSIXct object (optional; this block is commented out)
#abc_data$created_at <- as.POSIXct(abc_data$created_at, format = "%a %b %d %H:%M:%S %z %Y")
# Create a new column holding only the time of day
#abc_data$time <- format(abc_data$created_at, format = "%H:%M:%S")

# Add year and month columns to the data frame, derived from created_at
#abc_data$year <- format(abc_data$created_at, "%Y")
#abc_data$month <- format(abc_data$created_at, "%m")
#abc_data$month <- as.integer(abc_data$month)
# Convert numeric month values to month names
#abc_data$month <- month.name[abc_data$month]

# Convert the year column to a factor
#abc_data$year <- as.factor(abc_data$year)

# Bar plot showing tweet counts for each month, grouped by year
#ggplot(abc_data, aes(x = factor(month), fill = "red")) +
#  geom_bar() +
#  labs(x = "Month", y = "Number of Tweets", title = "Tweet Counts by Month and Year") +
#  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
#  facet_wrap(~year)

# Build the text corpus from the AdobeCares tweets (tm is already loaded above)
corpus <- iconv(abc_data$text)
corpus <- Corpus(VectorSource(corpus))
inspect(corpus[1:5])

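# Optional extra cleaning step (not in the original script): strip @mentions now,
# before removePunctuation deletes the "@" sign, so user/company handles do not
# dominate the term counts. A minimal sketch using the same tm_map pattern as below.
removeMentions <- function(x) gsub('@\\w+', '', x)
corpus <- tm_map(corpus, content_transformer(removeMentions))
inspect(corpus[1:5])
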
# Clean the text
# The first cleaning step is to convert all text to lowercase,
# which can be done with the tm_map function.
corpus <- tm_map(corpus, content_transformer(tolower))
inspect(corpus[1:5])

# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
inspect(corpus[1:5])

# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
inspect(corpus[1:5])

# Remove English stopwords
clean_data <- tm_map(corpus, removeWords, stopwords('english'))
inspect(clean_data[1:5])


# Remove URLs from the text
removeURL <- function(x) gsub('http[[:alnum:]]*', '', x)
clean_data <- tm_map(clean_data, content_transformer(removeURL))
inspect(clean_data[1:5])


# Collapse extra whitespace
clean_data <- tm_map(clean_data, stripWhitespace)
inspect(clean_data[1:5])

# Build a term-document matrix from the cleaned corpus
tdm <- TermDocumentMatrix(clean_data)
tdm

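# Optional check (not in the original script): list terms that appear at least
# 5,000 times directly from the TermDocumentMatrix, before it is converted to a
# plain matrix below. A minimal sketch using tm's findFreqTerms().
findFreqTerms(tdm, lowfreq = 5000)
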
# Convert the term-document matrix to a plain matrix and peek at one corner of it
tdm <- as.matrix(tdm)
tdm[1:10, 1:20]

# Bar plot of words that occur at least 5,000 times
w <- rowSums(tdm)
w <- subset(w, w >= 5000)
barplot(w,
        las = 2,
        col = rainbow(50),
        main = "Bar Plot of Word Frequencies")


# Install and load the wordcloud package
if (!require("wordcloud")) install.packages("wordcloud", dependencies = TRUE)
library(wordcloud)
w <- sort(rowSums(tdm), decreasing = TRUE)
set.seed(222)
wordcloud(words = names(w),
          freq = w,
          max.words = 150,
          random.order = FALSE,
          min.freq = 5,
          colors = brewer.pal(8, 'Dark2'),
          scale = c(5, 0.3),
          rot.per = 0.7)



# Word cloud using the wordcloud2 package (already installed and loaded above)
w <- data.frame(names(w), w)
colnames(w) <- c('word', 'freq')
wordcloud2(w,
           size = 0.7,
           shape = 'triangle',
           rotateRatio = 0.5,
           minSize = 1)

# Sentiment analysis with get_nrc_sentiment() from the syuzhet package (NRC lexicon)
tweets <- iconv(abc_data$text)
s <- get_nrc_sentiment(tweets)
head(s)
# Convert colSums(s) to a data frame
# (use a new name so the sample data frame "data" loaded earlier is not overwritten)
nrc_totals <- data.frame(emotion = names(colSums(s)), count = colSums(s))
# Reorder emotions by count
nrc_totals$emotion <- factor(nrc_totals$emotion, levels = nrc_totals$emotion[order(-nrc_totals$count)])
# Create the bar plot using ggplot2
ggplot(nrc_totals, aes(x = emotion, y = count)) +
  geom_bar(stat = "identity", fill = rainbow(10)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Emotion / sentiment", y = "Count", title = "NRC sentiment scores for tweets")

# Analyze sentiments with the syuzhet package based on the NRC sentiment dictionary
text_df <- tibble(text = str_to_lower(abc_data$text))
emotions <- get_nrc_sentiment(text_df$text)
emo_bar <- colSums(emotions)
emo_sum <- data.frame(count = emo_bar, emotion = names(emo_bar))


# Create a bar plot showing counts for each emotion and the positive/negative ratings
ggplot(emo_sum, aes(x = reorder(emotion, -count), y = count, fill = rainbow(10))) +
  geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
  labs(title = "Different emotions with frequency", x = "Emotions")

# Sentiment analysis with the tidytext package using the "bing" lexicon
library(tidytext)
bing_word_counts <- text_df %>%
  unnest_tokens(output = word, input = text) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)

# Select the top 10 words by sentiment
bing_top_10_words_by_sentiment <- bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
bing_top_10_words_by_sentiment

# Create a bar plot showing the contribution of words to sentiment
bing_top_10_words_by_sentiment %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  theme_light() +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

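# Optional summary (not in the original script): total positive vs. negative word
# counts under the bing lexicon, summing the per-word counts computed above.
bing_word_counts %>% count(sentiment, wt = n)
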
# Sentiment analysis with the tidytext package using the "loughran" lexicon
# (get_sentiments("loughran") may prompt to download the lexicon via the textdata package)
loughran_word_counts <- text_df %>%
  unnest_tokens(output = word, input = text) %>%
  inner_join(get_sentiments("loughran")) %>%
  count(word, sentiment, sort = TRUE)

# Select the top 10 words by sentiment
loughran_top_10_words_by_sentiment <- loughran_word_counts %>%
  group_by(sentiment) %>%
  slice_max(order_by = n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
loughran_top_10_words_by_sentiment


# Create a bar plot showing the contribution of words to sentiment
loughran_top_10_words_by_sentiment %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  theme_light() +
  labs(y = "Contribution to sentiment", x = NULL) +
  coord_flip()

#############################################################################
# Convert created_at in the sample data to a POSIXct datetime
data$created_at <- as.POSIXct(data$created_at, format = "%a %b %d %H:%M:%S %z %Y")

# Arrange data by created_at
data <- data %>% arrange(created_at)

# Calculate the time difference between consecutive tweets in minutes and round it off
# (convert the difftime explicitly to minutes rather than assuming seconds)
data$time_difference_minutes <- c(NA, round(as.numeric(diff(data$created_at), units = "mins")))

# Show the result
print(data)

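# Optional check (not in the original script): a quick numeric summary of the gaps,
# which helps pick a sensible binwidth for the histogram below.
summary(data$time_difference_minutes)
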
# Load required libraries (ggplot2 is already attached via tidyverse)
library(ggplot2)

# Create a histogram of time differences between consecutive tweets
ggplot(data, aes(x = time_difference_minutes)) +
  geom_histogram(binwidth = 50, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Time Differences Between Consecutive Tweets",
       x = "Time Difference (Minutes)",
       y = "Frequency") +
  theme_minimal()



# Load required libraries (both are already attached above)
library(dplyr)
library(syuzhet)

# Convert created_at to datetime (redundant if the block above was already run)
data$created_at <- as.POSIXct(data$created_at, format = "%a %b %d %H:%M:%S %z %Y")

# Perform sentiment analysis on the text data using the AFINN method
sentiment_scores <- get_sentiment(data$text, method = "afinn")

# Merge sentiment scores with the original data
data_with_sentiments <- cbind(data, sentiment_scores)

# Classify sentiment into categories: positive, negative, neutral
data_with_sentiments$sentiment_category <- ifelse(data_with_sentiments$sentiment_scores > 0, "Positive",
                                                  ifelse(data_with_sentiments$sentiment_scores < 0, "Negative", "Neutral"))

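# Optional check (not in the original script): how many tweets fall into each
# sentiment category before averaging reply times.
table(data_with_sentiments$sentiment_category)
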
# Approximate the reply time as the gap between each tweet and the previous one
# (the data are ordered by created_at above)
data_with_sentiments <- data_with_sentiments %>%
  mutate(reply_time = as.numeric(difftime(created_at, lag(created_at), units = "mins")))

# Analyze the average reply time by sentiment category, for tweets that are replies
reply_time_summary <- data_with_sentiments %>%
  filter(!is.na(in_response_to_tweet_id)) %>%
  group_by(sentiment_category) %>%
  summarise(avg_reply_time = mean(reply_time, na.rm = TRUE))

# Print the summary
print(reply_time_summary)

##########################################

# Load required library (already attached above)
library(ggplot2)

# Plot the average reply time by sentiment category
# (the fixed fill vector assumes exactly three categories are present)
ggplot(reply_time_summary, aes(x = sentiment_category, y = avg_reply_time)) +
  geom_bar(stat = "identity", fill = c("red", "blue", "green")) +
  labs(title = "Average Reply Time by Sentiment Category",
       x = "Sentiment Category",
       y = "Average Reply Time (minutes)") +
  theme_minimal()

####################### THE END ##############################