# DMA2 user-company interaction

# Load required libraries
library(dplyr)
library(syuzhet)
library(igraph)
library(tidyr)
library(stringr)

# Ask for the CSV file interactively and read it, treating row one as a header
data <- read.csv(file = file.choose(), header = TRUE)

# Draw a reproducible random subset of the rows.
# NOTE: the fraction is 0.001, i.e. 0.1% of the rows (not a quarter).
set.seed(123)
sampled_data <- sample_frac(data, size = 0.001)

# Parse created_at into POSIXct. The format is: abbreviated weekday,
# abbreviated month, day, HH:MM:SS, numeric UTC offset, four-digit year.
sampled_data[["created_at"]] <- as.POSIXct(
  sampled_data[["created_at"]],
  format = "%a %b %d %H:%M:%S %z %Y"
)

# Score every sampled tweet's text with the AFINN method (one score per row)
sentiment_scores <- get_sentiment(sampled_data$text, method = "afinn")

# Append the scores to the sampled rows; cbind() names the new column
# `sentiment_scores` after the variable.
data_with_sentiments <- cbind(sampled_data, sentiment_scores)

# Label each row by the sign of its score: > 0 Positive, < 0 Negative,
# 0 Neutral; a missing score stays NA.
# The column is referenced by its full name: `$sentiment` only worked through
# `$` partial matching and silently returns NULL as soon as a second column
# starting with "sentiment" (such as sentiment_category) exists.
data_with_sentiments$sentiment_category <- case_when(
  is.na(data_with_sentiments$sentiment_scores) ~ NA_character_,
  data_with_sentiments$sentiment_scores > 0 ~ "Positive",
  data_with_sentiments$sentiment_scores < 0 ~ "Negative",
  TRUE ~ "Neutral"
)

# Split the tweets by direction using the `inbound` flag.
# read.csv() converts a column holding only "True"/"False" into a logical
# vector, and a logical TRUE compared with the string "True" is coerced to
# "TRUE" and never matches. Normalising the flag to upper-case text makes the
# filters work whether the column arrived as logical, character or factor.

# Inbound tweets (from users to companies)
inbound_tweets <- data_with_sentiments %>%
  filter(toupper(as.character(inbound)) == "TRUE")

# Outbound tweets (from companies)
outbound_tweets <- data_with_sentiments %>%
  filter(toupper(as.character(inbound)) == "FALSE")

# Pull the first @-handle out of each inbound tweet (the word characters that
# directly follow an "@"), then keep only rows that have a handle and whose
# handle is not made up entirely of digits.
inbound_tweets <- inbound_tweets %>%
  mutate(mentioned_company = str_extract(text, "(?<=@)\\w+")) %>%
  filter(!is.na(mentioned_company)) %>%
  filter(!str_detect(mentioned_company, "^\\d+$"))

# One row per remaining inbound tweet: who wrote it, which handle it mentions,
# and the tweet's sentiment label.
user_company_interaction <- select(
  inbound_tweets,
  author_id, mentioned_company, sentiment_category
)

# Build the undirected user-company graph from the inbound tweets.
#
# The graph is assembled in bulk instead of row by row: the old
# `for (i in 1:nrow(...))` loop iterated over c(1, 0) when there were no
# inbound tweets (and then failed inside `if`), and it re-copied the graph and
# the node list on every iteration. make_empty_graph() replaces the deprecated
# graph.empty().

# Work on character ids so vertex names, lookups and edges all share one type
user_ids <- as.character(inbound_tweets$author_id)
company_ids <- as.character(inbound_tweets$mentioned_company)

# An edge is only added when both endpoints are known
has_edge <- !is.na(user_ids) & !is.na(company_ids)

# Interleave (user, company) per row so that unique() yields the vertices in
# order of first appearance, exactly as the row-by-row construction did.
all_nodes <- unique(as.vector(rbind(user_ids, company_ids)))
all_nodes <- all_nodes[!is.na(all_nodes)]

g_combined <- make_empty_graph(n = 0, directed = FALSE)

if (length(all_nodes) > 0) {
  g_combined <- add_vertices(g_combined, length(all_nodes), name = all_nodes)
}

if (any(has_edge)) {
  # Flattened (from, to) pairs; character entries are resolved by vertex name
  g_combined <- add_edges(
    g_combined,
    as.vector(rbind(user_ids[has_edge], company_ids[has_edge]))
  )
}

# Emit the same per-tweet progress lines as before, in the same order:
# the id pair for every row, followed by the edge confirmation when an edge
# was added for that row.
if (length(user_ids) > 0) {
  pair_log <- paste("User ID:", user_ids, "Company ID:", company_ids, "\n")
  edge_log <- ifelse(
    has_edge,
    paste(
      "Added edge between User ID:", user_ids,
      "and Company ID:", company_ids, "\n"
    ),
    ""
  )
  cat(as.vector(rbind(pair_log, edge_log)), sep = "")
}

# Attach a sentiment label to every vertex: the label of the first inbound
# tweet whose author_id equals the vertex name. match() yields NA for vertices
# that never authored an inbound tweet, so those vertices get NA.
vertex_names <- V(g_combined)$name
vertex_sentiment <- inbound_tweets$sentiment_category[
  match(vertex_names, inbound_tweets$author_id)
]
V(g_combined)$sentiment_category <- vertex_sentiment

# Colour the vertices:
#   yellow - company (authored an outbound tweet in the sample)
#   green / red / blue - user whose tweet was Positive / Negative / Neutral
# Vertices with no sentiment label are the ones that only appear as a
# mentioned handle, i.e. companies without an outbound tweet in the sample.
# The nested ifelse() used to give them an NA colour; they are now drawn
# yellow like every other company.
V(g_combined)$color <- case_when(
  vertex_names %in% outbound_tweets$author_id ~ "yellow",
  vertex_sentiment == "Positive" ~ "green",
  vertex_sentiment == "Negative" ~ "red",
  is.na(vertex_sentiment) ~ "yellow",
  TRUE ~ "blue"
)

# Count positive inbound tweets per mentioned company
positive_interactions <- inbound_tweets %>%
  filter(sentiment_category == "Positive") %>%
  group_by(mentioned_company) %>%
  summarise(count = n())

# Size each vertex: vertices whose name is a company with positive tweets get
# log(count + 1) * 5, everything else keeps the default size 5.
# A single match() lookup replaces the per-vertex sapply(), which rescanned
# the table for every vertex and returned a list instead of a numeric vector
# when the graph had no vertices.
positive_idx <- match(
  V(g_combined)$name,
  positive_interactions$mentioned_company
)
has_positive <- !is.na(positive_idx)

vertex_sizes <- rep(5, length(positive_idx))
vertex_sizes[has_positive] <-
  log(positive_interactions$count[positive_idx[has_positive]] + 1) * 5

V(g_combined)$size <- vertex_sizes

# Use the Kamada-Kawai layout for better separation.
# The iteration cap of layout_with_kk() is `maxiter`; `niter` is a legacy
# argument that igraph documents as ignored, so the requested 100000
# iterations never took effect. The cap is also kept at or above igraph's
# default (50 * vcount) so large graphs do not get fewer iterations than
# they would by default.
layout_combined <- layout_with_kk(
  g_combined,
  maxiter = max(100000, 50 * vcount(g_combined))
)

# Plot the combined graph with the per-vertex colours and sizes, no labels
plot(g_combined, vertex.size = V(g_combined)$size, vertex.label = NA,
     vertex.color = V(g_combined)$color, layout = layout_combined,
     main = "User-Company Interaction Network Based on Sentiment")

# Number of inbound tweets per mentioned company (all sentiments)
company_interactions <- summarise(
  group_by(user_company_interaction, mentioned_company),
  total_interactions = n()
)

# The ten companies with the highest positive-tweet counts, largest first
top_positive_companies <- head(
  arrange(positive_interactions, desc(count)),
  10
)

print("Top 10 Companies with the Most Positive Connections:")
print(top_positive_companies)

# For those ten companies, look up the overall tweet count and express the
# positive count as a share of it.
positive_to_total_ratio <- mutate(
  top_positive_companies,
  total_connections = company_interactions$total_interactions[
    match(mentioned_company, company_interactions$mentioned_company)
  ],
  positive_to_total_ratio = count / total_connections
)

print("Ratio of Positive Connections to Total Connections for the Top 10 Companies:")
print(positive_to_total_ratio)

# Mean share of positive tweets across all mentioned companies.
# `company_interactions` (inbound tweets per mentioned company) is already
# built earlier in this script from the same, unchanged
# `user_company_interaction`, so it is reused here instead of being
# recomputed a second time.
positive_to_total_ratio_all <- company_interactions %>%
  left_join(positive_interactions, by = "mentioned_company") %>%
  # Companies with no positive tweets have no row in positive_interactions,
  # so their count is NA after the join; treat that as zero positives.
  mutate(positive_to_total_ratio = coalesce(count, 0L) / total_interactions) %>%
  summarise(mean_positive_to_total_ratio = mean(positive_to_total_ratio))

# Print the mean positive_to_total_ratio for all companies
print("Mean Positive to Total Ratio for All Companies:")
print(positive_to_total_ratio_all$mean_positive_to_total_ratio)