Advertisement
techno-

DMA2 user-company interaction

Jun 2nd, 2024 (edited)
76
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 6.10 KB | None | 0 0
  1. # Load required libraries
  2. library(dplyr)
  3. library(syuzhet)
  4. library(igraph)
  5. library(tidyr)
  6. library(stringr)
  7.  
  # Load the dataset. file.choose() opens an interactive file picker, so this
  # script has to be run from an interactive R session.
  data <- read.csv(file.choose(), header = TRUE)

  # Fraction of rows kept for the analysis: 0.001 keeps 0.1% of the rows.
  # Raise this value to analyse a larger share of the data.
  sample_fraction <- 0.001

  # Draw the random sample; the fixed seed makes the draw reproducible.
  set.seed(123)
  sampled_data <- data %>%
    sample_frac(sample_fraction)

  # Convert created_at from text to POSIXct. The format string expects values
  # shaped like "<weekday> <month> <day> <H:M:S> <utc offset> <year>".
  # NOTE(review): %a and %b are matched against the current locale's day and
  # month names -- confirm the session locale matches the data.
  sampled_data$created_at <- as.POSIXct(
    sampled_data$created_at,
    format = "%a %b %d %H:%M:%S %z %Y"
  )
  18.  
  # Score every sampled tweet with the AFINN lexicon (one numeric score per
  # row of sampled_data).
  sentiment_scores <- get_sentiment(sampled_data$text, method = "afinn")

  # Attach the scores to the sampled rows; cbind() names the new column after
  # the variable, i.e. `sentiment_scores`.
  data_with_sentiments <- cbind(sampled_data, sentiment_scores)

  # Classify each score as Positive (> 0), Negative (< 0) or Neutral (== 0).
  # The column is addressed by its full name `sentiment_scores`: the shorter
  # `$sentiment` only resolved through `$` partial matching on data frames,
  # which stops working as soon as a second column starts with "sentiment".
  data_with_sentiments$sentiment_category <- ifelse(
    data_with_sentiments$sentiment_scores > 0,
    "Positive",
    ifelse(data_with_sentiments$sentiment_scores < 0, "Negative", "Neutral")
  )
  28.  
  # Normalise the `inbound` flag to logical before filtering. read.csv()
  # converts a column made up only of True/False values to logical, in which
  # case comparing it with the string "True" never matches (TRUE is coerced to
  # "TRUE"). as.logical() accepts the logical form as well as the strings
  # "True"/"False", and as.character() first covers the factor case.
  inbound_flag <- as.logical(as.character(data_with_sentiments$inbound))

  # Inbound tweets (from users to companies); rows with a missing flag are
  # dropped from both subsets.
  inbound_tweets <- data_with_sentiments %>% filter(inbound_flag %in% TRUE)

  # Outbound tweets (written by companies)
  outbound_tweets <- data_with_sentiments %>% filter(inbound_flag %in% FALSE)

  # Extract the first @mention of each inbound tweet as the addressed company,
  # then keep only rows that have a mention which is not purely numeric.
  inbound_tweets <- inbound_tweets %>%
    mutate(mentioned_company = str_extract(text, "(?<=@)\\w+")) %>%
    filter(
      !is.na(mentioned_company),
      !str_detect(mentioned_company, "^\\d+$")
    )

  # User-company interaction table: one row per inbound tweet with a mention
  user_company_interaction <- inbound_tweets %>%
    select(author_id, mentioned_company, sentiment_category)
  48.  
  # Build the undirected user-company graph in one vectorised pass instead of
  # growing it row by row (which copied the graph and the node list on every
  # iteration and broke on an empty table because 1:0 counts down).
  tweet_users <- as.character(inbound_tweets$author_id)
  tweet_companies <- as.character(inbound_tweets$mentioned_company)

  # Node names in order of first appearance (user, then company, per tweet).
  # rbind() + as.vector() interleaves the two columns row by row.
  node_sequence <- as.vector(rbind(tweet_users, tweet_companies))
  all_nodes <- unique(node_sequence[!is.na(node_sequence)])

  # An edge is only created when both endpoints of a tweet are known.
  has_both_ends <- !is.na(tweet_users) & !is.na(tweet_companies)
  edge_endpoints <- as.vector(
    rbind(tweet_users[has_both_ends], tweet_companies[has_both_ends])
  )

  g_combined <- make_empty_graph(n = 0, directed = FALSE)

  if (length(all_nodes) > 0) {
    g_combined <- add_vertices(g_combined, length(all_nodes), name = all_nodes)
  }

  # edge_endpoints is a flat vector of vertex names: (from1, to1, from2, ...).
  # Repeated user-company pairs are kept as parallel edges.
  if (length(edge_endpoints) > 0) {
    g_combined <- add_edges(g_combined, edge_endpoints)
  }

  # One summary line instead of two console lines per tweet
  cat(
    "Added", length(edge_endpoints) / 2, "edges between",
    length(all_nodes), "nodes\n"
  )
  79.  
  # Assign a sentiment category to every vertex: the category of the first
  # inbound tweet written by that vertex. match() yields NA for vertices that
  # never authored an inbound tweet, so those vertices get NA.
  vertex_names <- V(g_combined)$name
  first_tweet_index <- match(vertex_names, inbound_tweets$author_id)
  vertex_sentiment <- inbound_tweets$sentiment_category[first_tweet_index]
  V(g_combined)$sentiment_category <- vertex_sentiment

  # A vertex is a company if it was mentioned in an inbound tweet or if it
  # authored an outbound tweet. Checking only the outbound authors misses every
  # mentioned company that has no outbound tweet in the sample; those vertices
  # have no sentiment either and would end up with an NA colour.
  is_company <- vertex_names %in% inbound_tweets$mentioned_company |
    vertex_names %in% outbound_tweets$author_id

  # Colours: companies yellow; users green (Positive), red (Negative) or blue
  # (anything else). %in% never returns NA, so every vertex gets a colour.
  V(g_combined)$color <- ifelse(
    is_company,
    "yellow",
    ifelse(
      vertex_sentiment %in% "Positive",
      "green",
      ifelse(vertex_sentiment %in% "Negative", "red", "blue")
    )
  )
  89.  
  # Number of positive inbound tweets per mentioned company
  positive_interactions <- inbound_tweets %>%
    filter(sentiment_category == "Positive") %>%
    group_by(mentioned_company) %>%
    summarise(count = n())

  # Size used for users and for companies without positive interactions
  default_vertex_size <- 5

  # Look up every vertex's positive count in one vectorised match(); vertices
  # that are not in the table count as zero.
  vertex_names <- V(g_combined)$name
  positive_row <- match(vertex_names, positive_interactions$mentioned_company)
  positive_count <- positive_interactions$count[positive_row]
  positive_count[is.na(positive_count)] <- 0

  # Size grows with log(count + 1) * 5 but never drops below the default:
  # without the floor a company with exactly one positive interaction
  # (log(2) * 5, about 3.47) would be drawn smaller than one with none.
  vertex_sizes <- pmax(default_vertex_size, log(positive_count + 1) * 5)
  names(vertex_sizes) <- vertex_names

  V(g_combined)$size <- vertex_sizes
  105.  
  # Use the Kamada-Kawai layout for better separation. The iteration cap is
  # passed as `maxiter`; `niter` is a legacy argument that current igraph
  # versions no longer honour, so the requested cap was not being applied.
  layout_combined <- layout_with_kk(g_combined, maxiter = 100000)

  # Plot the combined graph with the colours and sizes stored on the vertices;
  # labels are suppressed to keep the drawing readable.
  plot(
    g_combined,
    vertex.size = V(g_combined)$size,
    vertex.label = NA,
    vertex.color = V(g_combined)$color,
    layout = layout_combined,
    main = "User-Company Interaction Network Based on Sentiment"
  )
  113.  
  # Total number of inbound interactions per company. Computed once and reused
  # by both ratio calculations below.
  company_interactions <- user_company_interaction %>%
    group_by(mentioned_company) %>%
    summarise(total_interactions = n())

  # Top 10 companies with the most positive connections
  top_positive_companies <- positive_interactions %>%
    arrange(desc(count)) %>%
    head(10)

  print("Top 10 Companies with the Most Positive Connections:")
  print(top_positive_companies)

  # Ratio of positive to total connections for the top 10 companies. The join
  # keeps the top-10 row order and adds each company's total as
  # `total_connections`.
  positive_to_total_ratio <- top_positive_companies %>%
    left_join(company_interactions, by = "mentioned_company") %>%
    rename(total_connections = total_interactions) %>%
    mutate(positive_to_total_ratio = count / total_connections)

  print("Ratio of Positive Connections to Total Connections for the Top 10 Companies:")
  print(positive_to_total_ratio)

  # Mean ratio over all companies. Companies without any positive interaction
  # are absent from positive_interactions, so their count is NA after the join
  # and is treated as zero.
  positive_to_total_ratio_all <- company_interactions %>%
    left_join(positive_interactions, by = "mentioned_company") %>%
    mutate(positive_to_total_ratio = coalesce(count, 0L) / total_interactions) %>%
    summarise(mean_positive_to_total_ratio = mean(positive_to_total_ratio))

  # Print the mean positive_to_total_ratio for all companies
  print("Mean Positive to Total Ratio for All Companies:")
  print(positive_to_total_ratio_all$mean_positive_to_total_ratio)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement