# Networks, part 2

# ---- Without loop ----
# Load required libraries
library(dplyr)
library(syuzhet)
library(igraph)
library(tidyr)
library(stringr)

# Load the dataset (interactive file picker; the CSV must have a header row)
data <- read.csv(file.choose(), header = TRUE)

# Sample 0.1% of the rows (sample_frac(0.001)); the fixed seed makes the
# draw reproducible
set.seed(123)
sampled_data <- data %>%
  sample_frac(0.001)

# Convert created_at to datetime.
# %a and %b are matched against the day/month names of the current locale,
# so parse under the "C" locale (English names) and restore the previous
# LC_TIME setting afterwards; otherwise a non-English session yields NA.
previous_lc_time <- Sys.getlocale("LC_TIME")
invisible(Sys.setlocale("LC_TIME", "C"))
sampled_data$created_at <- as.POSIXct(
  sampled_data$created_at,
  format = "%a %b %d %H:%M:%S %z %Y"
)
invisible(Sys.setlocale("LC_TIME", previous_lc_time))

# Score the text of every sampled tweet with the AFINN lexicon
sentiment_scores <- get_sentiment(sampled_data$text, method = "afinn")

# Attach the scores to the sampled data as column `sentiment_scores`
data_with_sentiments <- cbind(sampled_data, sentiment_scores = sentiment_scores)

# Classify sentiment into categories: positive, negative, neutral.
# The column is referenced by its full name; `$sentiment` only worked through
# partial matching and silently breaks if another column starts the same way.
data_with_sentiments$sentiment_category <- ifelse(
  data_with_sentiments$sentiment_scores > 0, "Positive",
  ifelse(data_with_sentiments$sentiment_scores < 0, "Negative", "Neutral")
)

# The `inbound` flag may arrive either as text ("True"/"False") or already
# parsed as logical by read.csv(); comparing a logical against "True" never
# matches, so normalise to upper-case text before comparing.

# Filter inbound tweets (from users to companies)
inbound_tweets <- data_with_sentiments %>%
  filter(toupper(as.character(inbound)) == "TRUE")

# Filter outbound tweets (companies)
outbound_tweets <- data_with_sentiments %>%
  filter(toupper(as.character(inbound)) == "FALSE")

# Pull the first @-mention out of each inbound tweet, then keep only rows
# that have a mention and whose mention is not made up purely of digits
inbound_tweets <- inbound_tweets %>%
  mutate(mentioned_company = str_extract(text, "(?<=@)\\w+")) %>%
  filter(
    !is.na(mentioned_company),
    !str_detect(mentioned_company, "^\\d+$")
  )

# User-company interaction table: who wrote, whom they mentioned, and the
# sentiment category of that tweet
user_company_interaction <- inbound_tweets %>%
  select(author_id, mentioned_company, sentiment_category)

# Create a bipartite graph: one undirected edge per (author, mentioned company)
# row. Both columns are converted with as.character() instead of as.matrix()
# on the data frame, because as.matrix() runs numeric columns through format()
# (padding them to a common width) when other columns are character, which
# would produce vertex names that no longer match author_id below.
interaction_authors <- as.character(user_company_interaction$author_id)
interaction_companies <- as.character(user_company_interaction$mentioned_company)
bipartite_edges <- cbind(
  author_id = interaction_authors,
  mentioned_company = interaction_companies
)
g_bipartite <- graph_from_edgelist(bipartite_edges, directed = FALSE)

# Set bipartite type (TRUE for companies, FALSE for users).
# A vertex counts as a company when its name appears among the mentions.
V(g_bipartite)$type <- V(g_bipartite)$name %in% unique(interaction_companies)

# Assign sentiment category to vertices: companies get NA, users get the
# category of their first row in the interaction table (match() returns the
# first hit only, so later tweets by the same user are ignored here)
first_row_of_author <- match(V(g_bipartite)$name, interaction_authors)
V(g_bipartite)$sentiment_category <- ifelse(
  V(g_bipartite)$type,
  NA,
  user_company_interaction$sentiment_category[first_row_of_author]
)

# Assign colors to vertices based on type and sentiment:
# companies yellow; users green (Positive), red (Negative), blue (otherwise)
V(g_bipartite)$color <- ifelse(
  V(g_bipartite)$type, "yellow",
  ifelse(
    V(g_bipartite)$sentiment_category == "Positive", "green",
    ifelse(V(g_bipartite)$sentiment_category == "Negative", "red", "blue")
  )
)

# Count positive inbound tweets per mentioned company
positive_interactions <- inbound_tweets %>%
  filter(sentiment_category == "Positive") %>%
  group_by(mentioned_company) %>%
  summarise(count = n())

# Increase the size of company nodes based on the number of positive
# interactions. Vertices without positive interactions (users, and companies
# with none) keep the default size.
default_vertex_size <- 5

# Row of positive_interactions for each vertex name; NA when there is none
positive_row <- match(
  V(g_bipartite)$name,
  positive_interactions$mentioned_company
)

# log(count + 1) * 5 is below the default for count == 1 (about 3.47), which
# would draw a company with one positive tweet smaller than one with none,
# so the scaled size is floored at the default.
scaled_size <- pmax(
  default_vertex_size,
  log(positive_interactions$count[positive_row] + 1) * 5
)
vertex_sizes <- ifelse(is.na(positive_row), default_vertex_size, scaled_size)

V(g_bipartite)$size <- vertex_sizes

# Compute vertex coordinates with the Kamada-Kawai algorithm
# (up to 1000 iterations)
layout_combined <- layout_with_kk(g_bipartite, maxiter = 1000)

# Draw the network: unlabeled vertices, sized and colored from the vertex
# attributes set on the graph
plot(
  g_bipartite,
  layout = layout_combined,
  vertex.label = NA,
  vertex.color = V(g_bipartite)$color,
  vertex.size = V(g_bipartite)$size,
  main = "User-Company Interaction Network Based on Sentiment"
)

# Number of interaction rows per mentioned company
company_interactions <- user_company_interaction %>%
  group_by(mentioned_company) %>%
  summarise(total_interactions = n())

# The ten companies with the highest positive-interaction counts
top_positive_companies <- positive_interactions %>%
  arrange(desc(count)) %>%
  slice_head(n = 10)

print("Top 10 Companies with the Most Positive Connections:")
print(top_positive_companies)

# For those ten companies, look up the total interaction count and compute
# the share of interactions that were positive
positive_to_total_ratio <- top_positive_companies %>%
  left_join(company_interactions, by = "mentioned_company") %>%
  rename(total_connections = total_interactions) %>%
  mutate(positive_to_total_ratio = count / total_connections)

print("Ratio of Positive Connections to Total Connections for the Top 10 Companies:")
print(positive_to_total_ratio)

# Average positive share across every company; companies with no positive
# interactions have no row in positive_interactions and count as 0
positive_to_total_ratio_all <- company_interactions %>%
  left_join(positive_interactions, by = "mentioned_company") %>%
  mutate(positive_to_total_ratio = coalesce(count, 0L) / total_interactions) %>%
  summarise(mean_positive_to_total_ratio = mean(positive_to_total_ratio))

# Report the overall mean
print("Mean Positive to Total Ratio for All Companies:")
print(positive_to_total_ratio_all[["mean_positive_to_total_ratio"]])