# Code for tables and plots for Extended Abstract submitted to Web Science 2013
# "Who contributes and Who is receiving the attention on Twitter during academic conferences?"  by Parra, Tratter and Wen
# @author Denis Parra http://www.sis.pitt.edu/~dparra/
# @date March 15 2013

setwd('C:\\your system path')
library(ggplot2)
library(gridExtra)
library(plyr)
library(scales) # to plot datetime data


# HT2012
# Dates considered: from Jun-23 00:01 UTC to Jun-27 16:03 UTC
# check fields
# Load the #ht2012 edge list. colClasses declares 24 columns: 15 factors,
# 6 character columns and 3 numeric columns.
ht12 <- read.csv(
  "ht12_iconf2012.csv",
  colClasses = c(rep("factor", 15), rep("character", 6), rep("numeric", 3))
)
# Parse both timestamp columns from "month/day/year hour:minute" text.
# NOTE(review): no tz is passed, so the text is interpreted in the session
# time zone even though the column names say UTC -- confirm this is intended.
ht12$Relationship.Date..UTC. <- as.POSIXct(ht12$Relationship.Date..UTC., format = "%m/%d/%Y %H:%M")
ht12$Tweet.Date..UTC. <- as.POSIXct(ht12$Tweet.Date..UTC., format = "%m/%d/%Y %H:%M")

# Plot every tweet as a point: tweet date/time (x) versus user (y), with the
# users ordered by the time of their first tweet.
ht12f <- ht12
# For each user keep the row(s) carrying that user's earliest tweet time.
# Several rows can share it because a tweet that mentions several users is
# stored once per mentioned user; duplicated() collapses them
# (March 12th 2013 fix).
tw.dfx <- ddply(ht12f, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
# Users sorted by first appearance, earliest first; tw.dfxa is reused by the
# early/late statistics further down.
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
ht12f$Vertex.1 <- factor(ht12f$Vertex.1, levels = tw.dfxa$Vertex.1)

p3 <- ggplot(ht12f) +
  geom_point(aes(x = Tweet.Date..UTC., y = Vertex.1, colour = Relationship2,
                 shape = Relationship2, size = 3)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-06-23", "2012-06-28")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  # Show users as 1..N (order of first appearance) instead of their handles.
  scale_y_discrete(labels = seq_along(levels(ht12f$Vertex.1))) +
  theme_bw()
p3 <- p3 +
  scale_shape_manual(name = "Tweet Type", values = c(1, 2, 3, 4)) +
  labs(title = "#ht2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("User ID") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p3 <- p3 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p3` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p3)
png("ht12-1.png", width = 900, height = 800, res = 120)
print(p3)
dev.off()

# Distribution of tweets per user, users sorted from most to least active.
ht12_user_tweet_dist <- table(ht12$Vertex.1)
# write.table(ht12_user_tweet_dist,file="ht12_user_tweet_dist.csv",sep=";",row.names=T)
ht12_users <- as.data.frame(table(ht12$Vertex.1))
colnames(ht12_users) <- c("user", "tweets")
ht12_users <- ht12_users[sort.list(ht12_users$tweets, decreasing = TRUE), ]

# One point per user: x is the rank after sorting, y the number of tweets.
p <- ggplot(ht12_users) +
  geom_point(aes(seq_along(tweets), tweets), color = "salmon") +
  theme_bw() +
  labs(title = "#ht2012 tweets per user") +
  xlab("User ID - sorted by amount of tweets") +
  ylab("#tweets")
# Reference lines. The intercepts are passed as numbers: the former
# stat = "hline" / yintercept = "mean" interface no longer exists in ggplot2.
# The annotation coordinates and the x = 30 line are hand-placed for this
# data set.
p <- p +
  geom_hline(yintercept = mean(ht12_users$tweets), linetype = "dashed") +
  annotate("text", x = 40, y = 5, hjust = 0,
           label = "Mean nbr of tweets/user", size = 3)
p <- p +
  geom_hline(yintercept = median(ht12_users$tweets), linetype = "dashed") +
  annotate("text", x = 40, y = 1, hjust = 0, vjust = -0.35,
           label = "Median nbr of tweets/user", size = 3)
p <- p +
  geom_vline(xintercept = 30, linetype = "dashed") +
  annotate("text", x = 29.5, y = 38, hjust = 1, vjust = 0, angle = 90,
           label = "50% of users", size = 3)
# print() explicitly so the file is also written when the script is source()d.
png("ht12-2.png", width = 900, height = 850, res = 120)
print(p)
dev.off()

# STATS of HT2012
# Each bare expression below shows its value when run at top level.
# 1) number of users
length(unique(ht12$Vertex.1))
# 2) total number of tweets (rows of the edge list)
length(ht12$Vertex.1)
# 3) counts per tweet type (values of Relationship2)
table(ht12$Relationship2)
# same as before, as fractions of all rows
table(ht12$Relationship2) / length(ht12$Vertex.1)
# users that were retweeted, relative to the number of tweeting users
length(unique(ht12$Vertex.2[ht12$Relationship2 == "Retweet"])) / length(unique(ht12$Vertex.1))
# users that were retweeted / mentioned / replied to, same denominator
length(unique(ht12$Vertex.2[ht12$Relationship2 != "Tweet"])) / length(unique(ht12$Vertex.1))

# Tweets per user, most active first (columns Var1 = user, Freq = tweets).
ht12_users <- as.data.frame(table(ht12$Vertex.1))
ht12_users <- arrange(ht12_users, desc(Freq))
# Size of a 20% group of users, rounded up.
ht12_n20 <- ceiling(nrow(ht12_users) / 5)

# share of tweets made by the top-20% contributors
ht12_top <- factor(head(ht12_users, ht12_n20)$Var1)
sum(head(ht12_users, ht12_n20)$Freq) / sum(ht12_users$Freq)
# share of tweets made by the bottom-20% contributors
sum(tail(ht12_users, ht12_n20)$Freq) / sum(ht12_users$Freq)
# =========
ht12_bot <- factor(tail(ht12_users, ht12_n20)$Var1)

# 20% earliest contributors: first rows of tw.dfxa, the users sorted by the
# time of their first tweet (built earlier in this script from ht12).
ht12_early <- factor(head(tw.dfxa, ht12_n20)$Vertex.1)
# share of the top-20% group that is also among the earliest 20%
# (counted directly instead of subsetting ht12_top with a mask built from
# ht12_early, which only worked because both vectors have the same length)
sum(as.character(ht12_early) %in% as.character(ht12_top)) / length(ht12_top)

# 20% latest contributors: last rows of tw.dfxa
ht12_late <- factor(tail(tw.dfxa, ht12_n20)$Vertex.1)
# number of late contributors with exactly one tweet, relative to the size of
# the bottom-20% group
ht12_single <- as.character(ht12_users$Var1[ht12_users$Freq == 1])
sum(as.character(ht12_late) %in% ht12_single) / length(ht12_bot)

# PLOT: tweet-type mix for the earliest and the latest 20% of users.
# Keep only the rows authored (Vertex.1) by users of each group.
df_ht12_early <- ht12[ht12$Vertex.1 %in% ht12_early, ]
df_ht12_late <- ht12[ht12$Vertex.1 %in% ht12_late, ]

# Early 20%: bar height is the type's count divided by the sum of all counts.
p_early <- ggplot(df_ht12_early, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#ht2012: Tweet Types for Early 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Late 20%: same construction.
# check https://github.com/hadley/ggplot2/wiki/-opts%28%29-List for opts() list
p_late <- ggplot(df_ht12_late, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#ht2012: Tweet Types for Late 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Both panels side by side in one PNG.
png("ht12-3.png", width = 900, height = 400, res = 100)
grid.arrange(p_early, p_late, nrow = 1)
dev.off()

# The plotted values as tables: per-type counts divided by the group's rows.
ht12_ttype_late <- table(df_ht12_late$Relationship2) / nrow(df_ht12_late)
ht12_ttype_early <- table(df_ht12_early$Relationship2) / nrow(df_ht12_early)

# ================== HT: which user category addresses which other category
# Account table: one numeric column followed by three factor columns; the
# code below relies on its `twitter`, `conference` and `category` columns.
df_twt_category_2 <- read.csv("dbdump_twitter_account.csv",
                              colClasses = c("numeric", rep("factor", 3)))
# NOTE(review): the key says iconf2013 while the tweet file is named
# ht12_iconf2012 -- confirm against the values stored in the CSV.
twt_cat2_ht12 <- df_twt_category_2[df_twt_category_2$conference == "ht12_iconf2013", ]
# Attach the author's account row (Vertex.1) to every row that is not a
# plain "Tweet" ...
ht12_u_ctg1 <- merge(ht12[ht12$Relationship2 != "Tweet", ], twt_cat2_ht12,
                     by.x = "Vertex.1", by.y = "twitter")
# ... then the addressed user's account row (Vertex.2). merge() suffixes the
# clashing columns, so category.x is the author's category and category.y the
# addressed user's. The same account table is used for both joins; the
# original referenced twt_cat_ht12 here, which is only created further down
# the script.
ht12_u_ctg2 <- merge(ht12_u_ctg1, twt_cat2_ht12,
                     by.x = "Vertex.2", by.y = "twitter")
# Author category (rows) by addressed category (columns), with margin totals.
inter_comm <- addmargins(table(ht12_u_ctg2$category.x, ht12_u_ctg2$category.y))
inter_comm
write.table(inter_comm, file = "inter_category_ht2012.csv", sep = ",", row.names = TRUE)
# ===========================================================================================

# ========= HyperText plot time series: cumulative tweet type over time
# Account table filtered to this conference.
df_twt_category <- read.csv(
  "dbdump_twitter_account.csv",
  colClasses = c("numeric", rep("factor", 3))
)
twt_cat_ht12 <- df_twt_category[df_twt_category$conference == "ht12_iconf2013", ]
# Second account dump, filtered the same way (df_twt_category is reused).
df_twt_category <- read.csv(
  "dbdump_twitter_account_2.csv",
  colClasses = c("numeric", rep("factor", 3))
)
twt_cat_ht12_2 <- df_twt_category[df_twt_category$conference == "ht12_iconf2013", ]

# Add the author's account columns to every tweet row (join on Vertex.1) ...
ht12_u_ctg <- merge(x = ht12, y = twt_cat_ht12,
                    by.x = "Vertex.1", by.y = "twitter")
# ... and, in a separate frame, the second dump's columns for Vertex.2.
ht12_u_ctg3 <- merge(x = ht12_u_ctg, y = twt_cat_ht12_2,
                     by.x = "Vertex.2", by.y = "twitter")

# NOTE(review): this expects the second dump to supply a `category2` column;
# confirm against the CSV header.
table(ht12_u_ctg3$category, ht12_u_ctg3$category2)


# Cumulative number of tweets of each type over time.
# Sort chronologically, then take a running count of ones within each
# Relationship2 group: that is the cumulative total per tweet type.
ht12_u_ctg <- arrange(ht12_u_ctg, Tweet.Date..UTC.)
ht12_u_ctg$cumttype_count <- ave(rep(1, nrow(ht12_u_ctg)),
                                 ht12_u_ctg$Relationship2, FUN = cumsum)

p4 <- ggplot(ht12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumttype_count,
                group = Relationship2, colour = Relationship2)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-06-23", "2012-06-28")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p4 <- p4 +
  labs(title = "#ht2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p4 <- p4 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p4` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p4)
png("ht12-4.png", width = 900, height = 800, res = 120)
print(p4)
dev.off()

# ========= HT plot time series: cumulative tweets per user category over time
#df_twt_category <- read.csv('dbdump_twitter_account.csv',  header = TRUE, sep = ",", colClasses=c("numeric",rep("factor",3) ) )
#twt_cat_ht12 <- df_twt_category[df_twt_category$conference == 'ht12_iconf2012',]
#ht12_u_ctg <- merge(ht12,twt_cat_ht12, by.x = "Vertex.1", by.y = "twitter")

# ht12_u_ctg already carries the author's category (merged in earlier).
# Sort chronologically and take a running count within each category.
ht12_u_ctg <- arrange(ht12_u_ctg, Tweet.Date..UTC.)
ht12_u_ctg$cumcategory_count <- ave(rep(1, nrow(ht12_u_ctg)),
                                    ht12_u_ctg$category, FUN = cumsum)

p5 <- ggplot(ht12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumcategory_count,
                group = category, colour = category), size = 1.2) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-06-23", "2012-06-28")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p5 <- p5 +
  labs(title = "#ht2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "User Category") +
  scale_size(guide = "none")
p5 <- p5 + theme(axis.text.x = element_text(size = 9),
                 axis.title.x = element_text(size = 11),
                 axis.text.y = element_text(size = 9),
                 axis.title.y = element_text(size = 11))
p5 <- p5 + theme(panel.border = element_rect(colour = "black", size = 1.2, linetype = "solid"))
# print() explicitly so the file is also written when the script is source()d.
print(p5)
png("ht12-5.png", width = 900, height = 800, res = 120)
print(p5)
dev.off()

# ============ summary tables written as CSV (for use in Excel)
# Author category (rows) by tweet type (columns), with margin totals.
ht12_ttype_ucat <- addmargins(
  table(ht12_u_ctg$category, ht12_u_ctg$Relationship2)
)
write.table(ht12_ttype_ucat, file = "ht12_ttype_ucat.csv",
            sep = ",", row.names = TRUE)
# Tweet-type shares of the early and the late 20% of users.
write.table(ht12_ttype_early, file = "ht12_ttype_early.csv",
            sep = ",", row.names = TRUE)
write.table(ht12_ttype_late, file = "ht12_ttype_late.csv",
            sep = ",", row.names = TRUE)
# Number of rows per user.
ht12_user_tweet_dist <- table(ht12$Vertex.1)
write.table(ht12_user_tweet_dist, file = "ht12_user_tweet_dist.csv",
            sep = ",", row.names = TRUE)

# == who (which category) is being mentioned, retweeted, replied to?
# Join the account table on Vertex.2, so `category` is that user's category.
ht12_u_ctg_2 <- merge(x = ht12, y = twt_cat_ht12,
                      by.x = "Vertex.2", by.y = "twitter")
ht12_ttype_ucat_rec <- addmargins(
  table(ht12_u_ctg_2$category, ht12_u_ctg_2$Relationship2)
)
write.table(ht12_ttype_ucat_rec, file = "ht12_ttype_ucat_rec.csv",
            sep = ",", row.names = TRUE)

# =======================================================================================================================

# UMAP2012
# Dates considered: from Jul-15 20:00 UTC to Jul-20 23:00 UTC
# check fields "Twitter Page for Tweet" and "imported ID"
# Load the #umap2012 edge list. colClasses declares 24 columns: 15 factors,
# 6 character columns and 3 numeric columns.
umap12 <- read.csv(
  "umap12_iconf2012.csv",
  colClasses = c(rep("factor", 15), rep("character", 6), rep("numeric", 3))
)
# Parse both timestamp columns from "month/day/year hour:minute" text.
# NOTE(review): no tz is passed, so the text is interpreted in the session
# time zone even though the column names say UTC -- confirm this is intended.
umap12$Relationship.Date..UTC. <- as.POSIXct(umap12$Relationship.Date..UTC., format = "%m/%d/%Y %H:%M")
umap12$Tweet.Date..UTC. <- as.POSIXct(umap12$Tweet.Date..UTC., format = "%m/%d/%Y %H:%M")

# Plot every tweet as a point: tweet date/time (x) versus user (y), with the
# users ordered by the time of their first tweet.
# (ht12f is the generic working copy; here it holds the UMAP data.)
ht12f <- umap12
# For each user keep the row(s) carrying that user's earliest tweet time,
# then drop repeated rows (March 12th 2013 fix).
tw.dfx <- ddply(ht12f, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
# Users sorted by first appearance, earliest first.
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
ht12f$Vertex.1 <- factor(ht12f$Vertex.1, levels = tw.dfxa$Vertex.1)

p3 <- ggplot(ht12f) +
  geom_point(aes(x = Tweet.Date..UTC., y = Vertex.1, colour = Relationship2,
                 shape = Relationship2, size = 3)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-07-15", "2012-07-21")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  # Show users as 1..N (order of first appearance) instead of their handles.
  scale_y_discrete(labels = seq_along(levels(ht12f$Vertex.1))) +
  theme_bw()
p3 <- p3 +
  scale_shape_manual(name = "Tweet Type", values = c(1, 2, 3, 4)) +
  labs(title = "#umap2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("User ID") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p3 <- p3 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p3` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p3)

png("umap12-1.png", width = 900, height = 800, res = 120)
print(p3)
dev.off()

# Distribution of tweets per user, users sorted from most to least active.
umap12_user_tweet_dist <- table(umap12$Vertex.1)
# NOTE: despite its name, ht12_users holds the UMAP counts from here on
# (ht12f is the UMAP working copy at this point of the script).
ht12_users <- as.data.frame(table(ht12f$Vertex.1))
colnames(ht12_users) <- c("user", "tweets")
ht12_users <- ht12_users[sort.list(ht12_users$tweets, decreasing = TRUE), ]

# One point per user: x is the rank after sorting, y the number of tweets.
p <- ggplot(ht12_users) +
  geom_point(aes(seq_along(tweets), tweets), color = "salmon") +
  theme_bw() +
  labs(title = "#umap2012 tweets per user") +
  xlab("User ID - sorted by amount of tweets") +
  ylab("#tweets")
# Reference lines are disabled for this conference:
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="mean", linetype = "dashed") + annotate("text", x=40, y=5, hjust=0, label="Mean nbr of tweets/user", size=3)
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="median", linetype = "dashed") + annotate("text", x=40, y=1, hjust=0, vjust= -0.35, label="Median nbr of tweets/user", size=3)
#p <- p + geom_vline(aes(1:nrow(ht12_users),tweets), stat="vline", xintercept=30, linetype = "dashed") +  annotate("text", x=29.5, y=38, hjust=1, vjust =0, angle = 90, label="50% of users",size=3) 
# print() explicitly so the file is also written when the script is source()d.
png("umap12-2.png", width = 900, height = 850, res = 120)
print(p)
dev.off()

# STATS of UMAP2012
# Each bare expression below shows its value when run at top level.
# 1) number of users
length(unique(umap12$Vertex.1))
# 2) total number of tweets (rows of the edge list)
length(umap12$Vertex.1)
# 3) counts per tweet type (values of Relationship2)
table(umap12$Relationship2)
# same as before, as fractions of all rows
table(umap12$Relationship2) / length(umap12$Vertex.1)
# users that were retweeted, relative to the number of tweeting users
length(unique(umap12$Vertex.2[umap12$Relationship2 == "Retweet"])) / length(unique(umap12$Vertex.1))
# users that were retweeted / mentioned / replied to, same denominator
length(unique(umap12$Vertex.2[umap12$Relationship2 != "Tweet"])) / length(unique(umap12$Vertex.1))

# Tweets per user, most active first (columns Var1 = user, Freq = tweets).
umap12_users <- as.data.frame(table(umap12$Vertex.1))
umap12_users <- arrange(umap12_users, desc(Freq))
# Size of a 20% group of users, rounded up.
umap12_n20 <- ceiling(nrow(umap12_users) / 5)

# share of tweets made by the top-20% contributors
umap12_top <- factor(head(umap12_users, umap12_n20)$Var1)
sum(head(umap12_users, umap12_n20)$Freq) / sum(umap12_users$Freq)
# share of tweets made by the bottom-20% contributors
sum(tail(umap12_users, umap12_n20)$Freq) / sum(umap12_users$Freq)
# =========
umap12_bot <- factor(tail(umap12_users, umap12_n20)$Var1)

# 20% earliest contributors.
# Rebuild the "first tweet per user" table from umap12: per user, the row(s)
# with the earliest tweet time, repeated rows removed (March 12th 2013 fix),
# sorted earliest first.
tw.dfx <- ddply(umap12, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
umap12_early <- factor(head(tw.dfxa, umap12_n20)$Vertex.1)
# share of the top-20% group that is also among the earliest 20%
# (counted directly instead of subsetting umap12_top with a mask built from
# umap12_early, which only worked because both vectors have the same length)
sum(as.character(umap12_early) %in% as.character(umap12_top)) / length(umap12_top)

# 20% latest contributors: last rows of tw.dfxa
umap12_late <- factor(tail(tw.dfxa, umap12_n20)$Vertex.1)
# number of late contributors with exactly one tweet, relative to the size of
# the bottom-20% group
umap12_single <- as.character(umap12_users$Var1[umap12_users$Freq == 1])
sum(as.character(umap12_late) %in% umap12_single) / length(umap12_bot)

# PLOT: tweet-type mix for the earliest and the latest 20% of users.
# Keep only the rows authored (Vertex.1) by users of each group.
df_umap12_early <- umap12[umap12$Vertex.1 %in% umap12_early, ]
df_umap12_late <- umap12[umap12$Vertex.1 %in% umap12_late, ]

# Early 20%: bar height is the type's count divided by the sum of all counts.
p_early <- ggplot(df_umap12_early, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#umap2012: Tweet Types for Early 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Late 20%: same construction.
# check https://github.com/hadley/ggplot2/wiki/-opts%28%29-List for opts() list
p_late <- ggplot(df_umap12_late, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#umap2012: Tweet Types for Late 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Both panels side by side: once on the current device, once into a PNG.
grid.arrange(p_early, p_late, nrow = 1)
png("umap12-3.png", width = 900, height = 400, res = 100)
grid.arrange(p_early, p_late, nrow = 1)
dev.off()

# The plotted values as tables: per-type counts divided by the group's rows.
umap12_ttype_late <- table(df_umap12_late$Relationship2) / nrow(df_umap12_late)
umap12_ttype_early <- table(df_umap12_early$Relationship2) / nrow(df_umap12_early)

# ================== UMAP: which user category addresses which other category
# Account table: one numeric column followed by three factor columns; the
# code below relies on its `twitter`, `conference` and `category` columns.
df_twt_category_2 <- read.csv("dbdump_twitter_account.csv",
                              colClasses = c("numeric", rep("factor", 3)))
twt_cat2_umap12 <- df_twt_category_2[df_twt_category_2$conference == "umap_iconf2012", ]
# Attach the author's account row (Vertex.1) to every row that is not a
# plain "Tweet" ...
umap12_u_ctg1 <- merge(umap12[umap12$Relationship2 != "Tweet", ], twt_cat2_umap12,
                       by.x = "Vertex.1", by.y = "twitter")
# ... then the addressed user's account row (Vertex.2). merge() suffixes the
# clashing columns, so category.x is the author's category and category.y the
# addressed user's. The same account table is used for both joins; the
# original referenced twt_cat_umap12 here, which is only created further down
# the script.
umap12_u_ctg2 <- merge(umap12_u_ctg1, twt_cat2_umap12,
                       by.x = "Vertex.2", by.y = "twitter")
# Author category (rows) by addressed category (columns), with margin totals.
inter_comm <- addmargins(table(umap12_u_ctg2$category.x, umap12_u_ctg2$category.y))
inter_comm
write.table(inter_comm, file = "inter_category_umap2012.csv", sep = ",", row.names = TRUE)
# ===========================================================================================


# ========= UMAP plot time series: cumulative tweet type over time
# Account table filtered to this conference, joined to the tweets on the
# author (Vertex.1) so every row carries the author's account columns.
df_twt_category <- read.csv("dbdump_twitter_account.csv",
                            colClasses = c("numeric", rep("factor", 3)))
twt_cat_umap12 <- df_twt_category[df_twt_category$conference == "umap_iconf2012", ]
umap12_u_ctg <- merge(umap12, twt_cat_umap12, by.x = "Vertex.1", by.y = "twitter")

# Sort chronologically, then take a running count of ones within each
# Relationship2 group: that is the cumulative total per tweet type.
umap12_u_ctg <- arrange(umap12_u_ctg, Tweet.Date..UTC.)
umap12_u_ctg$cumttype_count <- ave(rep(1, nrow(umap12_u_ctg)),
                                   umap12_u_ctg$Relationship2, FUN = cumsum)

p4 <- ggplot(umap12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumttype_count,
                group = Relationship2, colour = Relationship2)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-07-15", "2012-07-21")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p4 <- p4 +
  labs(title = "#umap2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p4 <- p4 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p4` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p4)
png("umap12-4.png", width = 900, height = 800, res = 120)
print(p4)
dev.off()

# ========= UMAP plot time series: cumulative tweets per user category over time
#df_twt_category <- read.csv('dbdump_twitter_account.csv',  header = TRUE, sep = ",", colClasses=c("numeric",rep("factor",3) ) )
#twt_cat_umap12 <- df_twt_category[df_twt_category$conference == 'umap12_iconf2012',]
#umap12_u_ctg <- merge(umap12,twt_cat_umap12, by.x = "Vertex.1", by.y = "twitter")

# umap12_u_ctg already carries the author's category (merged in earlier).
# Sort chronologically and take a running count within each category.
umap12_u_ctg <- arrange(umap12_u_ctg, Tweet.Date..UTC.)
umap12_u_ctg$cumcategory_count <- ave(rep(1, nrow(umap12_u_ctg)),
                                      umap12_u_ctg$category, FUN = cumsum)

p5 <- ggplot(umap12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumcategory_count,
                group = category, colour = category), size = 1.2) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-07-15", "2012-07-21")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p5 <- p5 +
  labs(title = "#umap2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "User Category") +
  scale_size(guide = "none")
p5 <- p5 + theme(axis.text.x = element_text(size = 9),
                 axis.title.x = element_text(size = 11),
                 axis.text.y = element_text(size = 9),
                 axis.title.y = element_text(size = 11))
p5 <- p5 + theme(panel.border = element_rect(colour = "black", size = 1.2, linetype = "solid"))
# print() explicitly so the file is also written when the script is source()d.
print(p5)
png("umap12-5.png", width = 900, height = 800, res = 120)
print(p5)
dev.off()

# ============ summary tables written as CSV (for use in Excel)
# Author category (rows) by tweet type (columns), with margin totals.
umap12_ttype_ucat <- addmargins(
  table(umap12_u_ctg$category, umap12_u_ctg$Relationship2)
)
write.table(umap12_ttype_ucat, file = "umap12_ttype_ucat.csv",
            sep = ",", row.names = TRUE)
# Tweet-type shares of the early and the late 20% of users.
write.table(umap12_ttype_early, file = "umap12_ttype_early.csv",
            sep = ",", row.names = TRUE)
write.table(umap12_ttype_late, file = "umap12_ttype_late.csv",
            sep = ",", row.names = TRUE)
# Number of rows per user.
umap12_user_tweet_dist <- table(umap12$Vertex.1)
write.table(umap12_user_tweet_dist, file = "umap12_user_tweet_dist.csv",
            sep = ",", row.names = TRUE)

# == who (which category) is being mentioned, retweeted, replied to?
# Join the account table on Vertex.2, so `category` is that user's category.
umap12_u_ctg_2 <- merge(x = umap12, y = twt_cat_umap12,
                        by.x = "Vertex.2", by.y = "twitter")
umap12_ttype_ucat_rec <- addmargins(
  table(umap12_u_ctg_2$category, umap12_u_ctg_2$Relationship2)
)
write.table(umap12_ttype_ucat_rec, file = "umap12_ttype_ucat_rec.csv",
            sep = ",", row.names = TRUE)

# 
# =======================================================================================================================


# RECSYS2012
# Dates considered: from Sept-9 00:01 UTC to Sept-13 11:59 UTC
# check fields "Twitter Page for Tweet" and "imported ID"
# Load the #recsys2012 edge list. colClasses declares 24 columns: 15 factors,
# 6 character columns and 3 numeric columns.
recsys12 <- read.csv("recsys12_iconf2012.csv",
                     colClasses = c(rep("factor", 15), rep("character", 6), rep("numeric", 3)))
# Parse both timestamp columns from "month/day/year hour:minute" text.
# Both conversions operate on recsys12 itself: the original read from a
# misspelled, undefined object (`recys12`), which stopped the script here and
# would also have thrown away the first conversion.
# NOTE(review): no tz is passed, so the text is interpreted in the session
# time zone even though the column names say UTC -- confirm this is intended.
recsys12 <- within(recsys12, Relationship.Date..UTC. <- as.POSIXct(Relationship.Date..UTC., format = "%m/%d/%Y %H:%M"))
recsys12 <- within(recsys12, Tweet.Date..UTC. <- as.POSIXct(Tweet.Date..UTC., format = "%m/%d/%Y %H:%M"))

# Plot every tweet as a point: tweet date/time (x) versus user (y), with the
# users ordered by the time of their first tweet.
# (ht12f is the generic working copy; here it holds the RecSys data.)
ht12f <- recsys12
# For each user keep the row(s) carrying that user's earliest tweet time,
# then drop repeated rows (March 12th 2013 fix).
tw.dfx <- ddply(ht12f, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
# Users sorted by first appearance, earliest first.
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
ht12f$Vertex.1 <- factor(ht12f$Vertex.1, levels = tw.dfxa$Vertex.1)

p3 <- ggplot(ht12f) +
  geom_point(aes(x = Tweet.Date..UTC., y = Vertex.1, colour = Relationship2,
                 shape = Relationship2, size = 3)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-9", "2012-09-14")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  # Show users as 1..N (order of first appearance) instead of their handles.
  scale_y_discrete(labels = seq_along(levels(ht12f$Vertex.1))) +
  theme_bw()
p3 <- p3 +
  scale_shape_manual(name = "Tweet Type", values = c(1, 2, 3, 4)) +
  labs(title = "#recsys2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("User ID") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p3 <- p3 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p3` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p3)

png("recsys12-1.png", width = 900, height = 800, res = 120)
print(p3)
dev.off()

# Distribution of tweets per user, users sorted from most to least active.
recsys12_user_tweet_dist <- table(recsys12$Vertex.1)
# NOTE: despite its name, ht12_users holds the RecSys counts from here on
# (ht12f is the RecSys working copy at this point of the script).
ht12_users <- as.data.frame(table(ht12f$Vertex.1))
colnames(ht12_users) <- c("user", "tweets")
ht12_users <- ht12_users[sort.list(ht12_users$tweets, decreasing = TRUE), ]

# One point per user: x is the rank after sorting, y the number of tweets.
p <- ggplot(ht12_users) +
  geom_point(aes(seq_along(tweets), tweets), color = "salmon") +
  theme_bw() +
  labs(title = "#recsys2012 tweets per user") +
  xlab("User ID - sorted by amount of tweets") +
  ylab("#tweets")
# Reference lines are disabled for this conference:
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="mean", linetype = "dashed") + annotate("text", x=40, y=5, hjust=0, label="Mean nbr of tweets/user", size=3)
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="median", linetype = "dashed") + annotate("text", x=40, y=1, hjust=0, vjust= -0.35, label="Median nbr of tweets/user", size=3)
#p <- p + geom_vline(aes(1:nrow(ht12_users),tweets), stat="vline", xintercept=30, linetype = "dashed") +  annotate("text", x=29.5, y=38, hjust=1, vjust =0, angle = 90, label="50% of users",size=3) 
# print() explicitly so the file is also written when the script is source()d.
png("recsys12-2.png", width = 900, height = 850, res = 120)
print(p)
dev.off()

# STATS of RECSYS 2012
# Each bare expression below shows its value when run at top level.
# 1) number of users
length(unique(recsys12$Vertex.1))
# 2) total number of tweets (rows of the edge list)
length(recsys12$Vertex.1)
# 3) counts per tweet type (values of Relationship2)
table(recsys12$Relationship2)
# same as before, as fractions of all rows
table(recsys12$Relationship2) / length(recsys12$Vertex.1)
# users that were retweeted, relative to the number of tweeting users
length(unique(recsys12$Vertex.2[recsys12$Relationship2 == "Retweet"])) / length(unique(recsys12$Vertex.1))
# users that were retweeted / mentioned / replied to, same denominator
length(unique(recsys12$Vertex.2[recsys12$Relationship2 != "Tweet"])) / length(unique(recsys12$Vertex.1))

# Tweets per user, most active first (columns Var1 = user, Freq = tweets).
recsys12_users <- as.data.frame(table(recsys12$Vertex.1))
recsys12_users <- arrange(recsys12_users, desc(Freq))
# Size of a 20% group of users, rounded up.
recsys12_n20 <- ceiling(nrow(recsys12_users) / 5)

# share of tweets made by the top-20% contributors
recsys12_top <- factor(head(recsys12_users, recsys12_n20)$Var1)
sum(head(recsys12_users, recsys12_n20)$Freq) / sum(recsys12_users$Freq)
# share of tweets made by the bottom-20% contributors
sum(tail(recsys12_users, recsys12_n20)$Freq) / sum(recsys12_users$Freq)
# =========
recsys12_bot <- factor(tail(recsys12_users, recsys12_n20)$Var1)

# 20% earliest contributors.
# Rebuild the "first tweet per user" table from recsys12: per user, the
# row(s) with the earliest tweet time, repeated rows removed
# (March 12th 2013 fix), sorted earliest first.
tw.dfx <- ddply(recsys12, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
recsys12_early <- factor(head(tw.dfxa, recsys12_n20)$Vertex.1)
# share of the top-20% group that is also among the earliest 20%
# (counted directly instead of subsetting recsys12_top with a mask built from
# recsys12_early, which only worked because both vectors have the same length)
sum(as.character(recsys12_early) %in% as.character(recsys12_top)) / length(recsys12_top)

# 20% latest contributors: last rows of tw.dfxa
recsys12_late <- factor(tail(tw.dfxa, recsys12_n20)$Vertex.1)
# number of late contributors with exactly one tweet, relative to the size of
# the bottom-20% group
recsys12_single <- as.character(recsys12_users$Var1[recsys12_users$Freq == 1])
sum(as.character(recsys12_late) %in% recsys12_single) / length(recsys12_bot)

# PLOT: tweet-type mix for the earliest and the latest 20% of users.
# Keep only the rows authored (Vertex.1) by users of each group.
df_recsys12_early <- recsys12[recsys12$Vertex.1 %in% recsys12_early, ]
df_recsys12_late <- recsys12[recsys12$Vertex.1 %in% recsys12_late, ]

# Early 20%: bar height is the type's count divided by the sum of all counts.
p_early <- ggplot(df_recsys12_early, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#recsys2012: Tweet Types for Early 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Late 20%: same construction.
# check https://github.com/hadley/ggplot2/wiki/-opts%28%29-List for opts() list
p_late <- ggplot(df_recsys12_late, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "#recsys2012: Tweet Types for Late 20%",
       x = "Tweet Type", y = "Percentage") +
  theme(axis.text.x = element_text(size = 8),
        axis.title.x = element_text(size = 10),
        axis.text.y = element_text(size = 8),
        axis.title.y = element_text(size = 10))

# Both panels side by side: once on the current device, once into a PNG.
grid.arrange(p_early, p_late, nrow = 1)
png("recsys12-3.png", width = 900, height = 400, res = 100)
grid.arrange(p_early, p_late, nrow = 1)
dev.off()

# The plotted values as tables: per-type counts divided by the group's rows.
recsys12_ttype_late <- table(df_recsys12_late$Relationship2) / nrow(df_recsys12_late)
recsys12_ttype_early <- table(df_recsys12_early$Relationship2) / nrow(df_recsys12_early)

# ================== RECSYS: which user category addresses which other category
# Account table: one numeric column followed by three factor columns; the
# code below relies on its `twitter`, `conference` and `category` columns.
df_twt_category_2 <- read.csv("dbdump_twitter_account.csv",
                              colClasses = c("numeric", rep("factor", 3)))
# NOTE(review): "recys12_iconf2012" looks like a misspelling of "recsys12";
# it is kept because it must match the values stored in the CSV -- confirm.
twt_cat2_recsys12 <- df_twt_category_2[df_twt_category_2$conference == "recys12_iconf2012", ]
# Attach the author's account row (Vertex.1) to every row that is not a
# plain "Tweet" ...
recsys12_u_ctg1 <- merge(recsys12[recsys12$Relationship2 != "Tweet", ], twt_cat2_recsys12,
                         by.x = "Vertex.1", by.y = "twitter")
# ... then the addressed user's account row (Vertex.2). merge() suffixes the
# clashing columns, so category.x is the author's category and category.y the
# addressed user's. The same account table is used for both joins; the
# original referenced twt_cat_recsys12 here, which is only created further
# down the script.
recsys12_u_ctg2 <- merge(recsys12_u_ctg1, twt_cat2_recsys12,
                         by.x = "Vertex.2", by.y = "twitter")
# Author category (rows) by addressed category (columns), with margin totals.
inter_comm <- addmargins(table(recsys12_u_ctg2$category.x, recsys12_u_ctg2$category.y))
inter_comm
write.table(inter_comm, file = "inter_category_recsys2012.csv", sep = ",", row.names = TRUE)
# ===========================================================================================

# ========= RECSYS plot time series: cumulative tweet type over time
# Account table filtered to this conference, joined to the tweets on the
# author (Vertex.1) so every row carries the author's account columns.
df_twt_category <- read.csv("dbdump_twitter_account.csv",
                            colClasses = c("numeric", rep("factor", 3)))
# NOTE(review): "recys12_iconf2012" looks like a misspelling of "recsys12";
# it is kept because it must match the values stored in the CSV -- confirm.
twt_cat_recsys12 <- df_twt_category[df_twt_category$conference == "recys12_iconf2012", ]
recsys12_u_ctg <- merge(recsys12, twt_cat_recsys12, by.x = "Vertex.1", by.y = "twitter")

# Sort chronologically, then take a running count of ones within each
# Relationship2 group: that is the cumulative total per tweet type.
recsys12_u_ctg <- arrange(recsys12_u_ctg, Tweet.Date..UTC.)
recsys12_u_ctg$cumttype_count <- ave(rep(1, nrow(recsys12_u_ctg)),
                                     recsys12_u_ctg$Relationship2, FUN = cumsum)

p4 <- ggplot(recsys12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumttype_count,
                group = Relationship2, colour = Relationship2)) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-09", "2012-09-14")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p4 <- p4 +
  labs(title = "#recsys2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p4 <- p4 + theme(axis.text.x = element_text(size = 8),
                 axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8),
                 axis.title.y = element_text(size = 10))
# print() explicitly: a bare `p4` is only drawn by top-level auto-printing,
# so under source() the PNG below would come out empty.
print(p4)
png("recsys12-4.png", width = 900, height = 800, res = 120)
print(p4)
dev.off()
# ========= RECSYS plot time series: cumulative tweets per user category over time
#df_twt_category <- read.csv('dbdump_twitter_account.csv',  header = TRUE, sep = ",", colClasses=c("numeric",rep("factor",3) ) )
#twt_cat_recsys12 <- df_twt_category[df_twt_category$conference == 'recsys12_iconf2012',]
#recsys12_u_ctg <- merge(recsys12,twt_cat_recsys12, by.x = "Vertex.1", by.y = "twitter")

# recsys12_u_ctg already carries the author's category (merged in earlier).
# Sort chronologically and take a running count within each category.
recsys12_u_ctg <- arrange(recsys12_u_ctg, Tweet.Date..UTC.)
recsys12_u_ctg$cumcategory_count <- ave(rep(1, nrow(recsys12_u_ctg)),
                                        recsys12_u_ctg$category, FUN = cumsum)

p5 <- ggplot(recsys12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumcategory_count,
                group = category, colour = category), size = 1.2) +
  scale_x_datetime(breaks = date_breaks("1 day"),
                   minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-09", "2012-09-14")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p5 <- p5 +
  labs(title = "#recsys2012 Tweet Activity") +
  xlab("Tweet date/time") +
  ylab("Cummulative tweet count") +
  scale_colour_discrete(name = "User Category") +
  scale_size(guide = "none")
p5 <- p5 + theme(axis.text.x = element_text(size = 9),
                 axis.title.x = element_text(size = 11),
                 axis.text.y = element_text(size = 9),
                 axis.title.y = element_text(size = 11))
p5 <- p5 + theme(panel.border = element_rect(colour = "black", size = 1.2, linetype = "solid"))
# print() explicitly so the file is also written when the script is source()d.
print(p5)
png("recsys12-5.png", width = 900, height = 800, res = 120)
print(p5)
dev.off()

# == Which user category is on the receiving end (mentioned, retweeted, replied to)?
# Here the category is joined on Vertex.2 instead of on Vertex.1.
recsys12_u_ctg_2 <- merge(
  recsys12, twt_cat_recsys12,
  by.x = "Vertex.2", by.y = "twitter"
)
# category (rows) x tweet type (columns), with totals
recsys12_ttype_ucat_rec <- addmargins(
  table(recsys12_u_ctg_2$category, recsys12_u_ctg_2$Relationship2)
)
write.table(
  recsys12_ttype_ucat_rec,
  file = "recsys12_ttype_ucat_rec.csv", sep = ",", row.names = TRUE
)
# For comparison: the table joined on Vertex.2, followed by the same
# cross-tabulation for the data joined on Vertex.1 (recsys12_u_ctg).
recsys12_ttype_ucat_rec
addmargins(
  table(recsys12_u_ctg$category, recsys12_u_ctg$Relationship2)
)

# =======================================================================================================================

# ECTEL2012
# Dates considered: from Sept-18 05:20 UTC to Sept-21 05:20 UTC
# check fields "Twitter Page for Tweet" and "imported ID"
# The file is read as 15 factor, 6 character and 3 numeric columns.
ectel12 <- read.csv(
  "ectel12_iconf2012.csv",
  header = TRUE, sep = ",",
  colClasses = c(rep("factor", 15), rep("character", 6), rep("numeric", 3))
)
# Turn the two timestamp columns into POSIXct (month/day/year hour:minute).
# NOTE(review): no tz is given, so the values are interpreted in the session's
# time zone even though the columns are labelled UTC - confirm this is intended.
ectel12$Relationship.Date..UTC. <- as.POSIXct(
  ectel12$Relationship.Date..UTC.,
  format = "%m/%d/%Y %H:%M"
)
ectel12$Tweet.Date..UTC. <- as.POSIXct(
  ectel12$Tweet.Date..UTC.,
  format = "%m/%d/%Y %H:%M"
)

# Plot tweets as date/time v/s user.
# NOTE: ht12f, tw.dfx, tw.dfxa and p3 are names carried over from the HT2012
# section at the top of the file; in this section they hold ECTEL 2012 data.
ht12f <- ectel12
# Sort the users by the time they started participating: per user, keep the
# row(s) that carry that user's earliest tweet timestamp.
tw.dfx <- ddply(ht12f, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
# March 12th 2013 fix ~ remove duplicated rows
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
# Re-level the user factor in order of first tweet; this fixes the y-axis order.
ht12f$Vertex.1 <- factor(ht12f$Vertex.1, levels = tw.dfxa$Vertex.1)
# and plot the result; users are labelled 1..n instead of by screen name
p3 <- ggplot(ht12f) +
  geom_point(aes(x = Tweet.Date..UTC., y = Vertex.1, colour = Relationship2, shape = Relationship2, size = 3)) +
  scale_x_datetime(breaks = date_breaks("1 day"), minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-18", "2012-09-22")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  scale_y_discrete(labels = seq_along(levels(ht12f$Vertex.1))) +
  theme_bw()
p3 <- p3 + scale_shape_manual(name = "Tweet Type", values = c(1, 2, 3, 4)) +
  labs(title = "#ectel2012 Tweet Activity") + xlab("Tweet date/time") + ylab("User ID") +
  scale_colour_discrete(name = "Tweet Type") + scale_size(guide = "none")
p3 <- p3 + theme(axis.text.x = element_text(size = 8), axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8), axis.title.y = element_text(size = 10))
# on-screen preview (auto-printed when run interactively)
p3

# Write the figure to disk. print() is explicit because a bare `p3` is not
# auto-printed when the script is run through source(), leaving an empty PNG.
png("ectel12-1.png", width = 900, height = 800, res = 120)
print(p3)
dev.off()

# Tweets per user, sorted from the most to the least active user.
# NOTE: ht12_users is a name carried over from the HT2012 section; ht12f holds
# the ECTEL 2012 data at this point.
ht12_users <- as.data.frame(table(ht12f$Vertex.1))
colnames(ht12_users) <- c("user", "tweets")
ht12_users <- ht12_users[sort.list(ht12_users$tweets, decreasing = TRUE), ]
# plot it: x is simply the rank of the user in the sorted table
p <- ggplot(ht12_users) +
  geom_point(aes(seq_len(nrow(ht12_users)), tweets), color = "salmon") +
  theme_bw() +
  labs(title = "#ectel2012 tweets per user") +
  xlab("User ID - sorted by amount of tweets") +
  ylab("#tweets")
# add lines and labels (disabled)
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="mean", linetype = "dashed") + annotate("text", x=40, y=5, hjust=0, label="Mean nbr of tweets/user", size=3)
#p <- p + geom_hline(aes(1:nrow(ht12_users),tweets), stat="hline", yintercept="median", linetype = "dashed") + annotate("text", x=40, y=1, hjust=0, vjust= -0.35, label="Median nbr of tweets/user", size=3)
#p <- p + geom_vline(aes(1:nrow(ht12_users),tweets), stat="vline", xintercept=30, linetype = "dashed") +  annotate("text", x=29.5, y=38, hjust=1, vjust =0, angle = 90, label="50% of users",size=3) 
# Write the figure to disk. print() is explicit because a bare `p` is not
# auto-printed when the script is run through source(), leaving an empty PNG.
png("ectel12-2.png", width = 900, height = 850, res = 120)
print(p)
dev.off()

# STATS of ECTEL 2012
# 1) number of distinct tweeting users (Vertex.1)
length(unique(ectel12$Vertex.1))
# 2) total number of rows, reported as total tweets
nrow(ectel12)
# 3) rows per tweet type (Relationship2)
table(ectel12$Relationship2)
# same as before, as a share of all rows
table(ectel12$Relationship2) / nrow(ectel12)
# distinct users that were retweeted, relative to the distinct tweeting users
length(unique(with(ectel12, Vertex.2[Relationship2 == "Retweet"]))) /
  length(unique(ectel12$Vertex.1))
# distinct Vertex.2 users over all rows that are not a plain "Tweet",
# relative to the distinct tweeting users
length(unique(with(ectel12, Vertex.2[Relationship2 != "Tweet"]))) /
  length(unique(ectel12$Vertex.1))
# % Tweets made by Top-20% contributors
# One row per user (Var1) with the number of rows in ectel12 (Freq),
# ordered from the most to the least active user.
ectel12_users <- as.data.frame(table(ectel12$Vertex.1))
ectel12_users <- arrange(ectel12_users, desc(Freq))
# Size of a 20% group, rounded up. Computed once and used with head()/tail(),
# which, unlike a 1:k row index, select nothing when there are no users.
ectel12_n20 <- ceiling(nrow(ectel12_users) / 5)
# users in the top 20% by activity (factor() drops the unused levels)
ectel12_top <- factor(head(ectel12_users, ectel12_n20)$Var1)
sum(head(ectel12_users, ectel12_n20)$Freq) / sum(ectel12_users$Freq)
# % Tweets made by bottom-20% contributors
sum(tail(ectel12_users, ectel12_n20)$Freq) / sum(ectel12_users$Freq)
# =========
# users in the bottom 20% by activity
ectel12_bot <- factor(tail(ectel12_users, ectel12_n20)$Var1)
#  20% earliest contributors
#  a) list of the 20% earliest contributors
# Per user, keep the row(s) that carry that user's earliest tweet timestamp.
tw.dfx <- ddply(ectel12, .variables = "Vertex.1", .fun = function(x) {
  subset(x, Tweet.Date..UTC. %in% min(Tweet.Date..UTC.),
         select = c(Vertex.1, Tweet.Date..UTC.))
})
# March 12th 2013 fix ~ remove duplicated rows
tw.dfx <- tw.dfx[!duplicated(tw.dfx), ]
# users ordered by the time of their first tweet
tw.dfxa <- arrange(tw.dfx, Tweet.Date..UTC.)
ectel12_early <- factor(head(tw.dfxa, ceiling(nrow(ectel12_users) / 5))$Vertex.1)
# Number of early users that are also top-20% contributors, relative to the
# size of the top-20% group. Counting the matches directly does not depend on
# both vectors having the same length.
sum(as.character(ectel12_early) %in% as.character(ectel12_top)) / length(ectel12_top)
#  20% latest contributors
#  a) list of the 20% latest contributors
ectel12_late <- factor(tail(tw.dfxa, ceiling(nrow(ectel12_users) / 5))$Vertex.1)
# Number of late users with exactly one row in ectel12 (Freq == 1), relative
# to the size of the bottom-20% group.
# NOTE(review): the numerator uses "Freq == 1" rather than membership in
# ectel12_bot - confirm this is the intended definition of "lurkers".
sum(as.character(ectel12_late) %in%
      as.character(ectel12_users$Var1[ectel12_users$Freq == 1])) / length(ectel12_bot)

# PLOT: tweet-type mix of the earliest 20% vs. the latest 20% of users
df_ectel12_early <- ectel12[ectel12$Vertex.1 %in% ectel12_early, ]
df_ectel12_late <- ectel12[ectel12$Vertex.1 %in% ectel12_late, ]
# Early 20%: one bar per tweet type, bar height = share of that group's rows.
p_early <- ggplot(df_ectel12_early, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "#ectel2012: Tweet Types for Early 20%",
    x = "Tweet Type",
    y = "Percentage"
  ) +
  theme(
    axis.text.x = element_text(size = 8),
    axis.title.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    axis.title.y = element_text(size = 10)
  )
# Late 20%: same layout as above.
# check https://github.com/hadley/ggplot2/wiki/-opts%28%29-List for opts() list
p_late <- ggplot(df_ectel12_late, aes(Relationship2)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = percent_format()) +
  labs(
    title = "#ectel2012: Tweet Types for Late 20%",
    x = "Tweet Type",
    y = "Percentage"
  ) +
  theme(
    axis.text.x = element_text(size = 8),
    axis.title.x = element_text(size = 10),
    axis.text.y = element_text(size = 8),
    axis.title.y = element_text(size = 10)
  )
# Both panels side by side: first on the current device, then into a PNG.
grid.arrange(p_early, p_late, nrow = 1)
png("ectel12-3.png", width = 900, height = 400, res = 100)
grid.arrange(p_early, p_late, nrow = 1)
dev.off()

# The values behind the two bar charts: share of each tweet type among the
# rows of the late and of the early group.
ectel12_ttype_late <- table(df_ectel12_late$Relationship2) /
  length(df_ectel12_late$Relationship2)
ectel12_ttype_early <- table(df_ectel12_early$Relationship2) /
  length(df_ectel12_early$Relationship2)


# ================== ECTEL: which category mentions which other category
df_twt_category_2 <- read.csv('dbdump_twitter_account.csv', header = TRUE, sep = ",",
                              colClasses = c("numeric", rep("factor", 3)))
# accounts categorised for ECTEL 2012
twt_cat2_ectel12 <- df_twt_category_2[df_twt_category_2$conference == 'ectel12_iconf2012', ]
# Category of the author (Vertex.1); plain "Tweet" rows are left out.
ectel12_u_ctg1 <- merge(ectel12[ectel12$Relationship2 != 'Tweet', ], twt_cat2_ectel12,
                        by.x = "Vertex.1", by.y = "twitter")
# Category of the target (Vertex.2). This uses twt_cat2_ectel12, the table built
# just above; twt_cat_ectel12 is only created further down in the script, so
# referring to it here fails when the file is run top to bottom.
ectel12_u_ctg2 <- merge(ectel12_u_ctg1, twt_cat2_ectel12,
                        by.x = "Vertex.2", by.y = "twitter")
# author category (category.x, rows) x target category (category.y, columns),
# with totals; built once, shown, then written to disk
inter_comm <- addmargins(table(ectel12_u_ctg2$category.x, ectel12_u_ctg2$category.y))
inter_comm
write.table(inter_comm, file = 'inter_category_ectel2012.csv', sep = ',', row.names = TRUE)
# ===========================================================================================


# ========= ECTEL plot time series : cumulative tweet type over time
df_twt_category <- read.csv('dbdump_twitter_account.csv', header = TRUE, sep = ",",
                            colClasses = c("numeric", rep("factor", 3)))
# accounts categorised for ECTEL 2012
twt_cat_ectel12 <- df_twt_category[df_twt_category$conference == 'ectel12_iconf2012', ]
# use merge to add the user category (of the tweet author, Vertex.1) to the tweets
ectel12_u_ctg <- merge(ectel12, twt_cat_ectel12, by.x = "Vertex.1", by.y = "twitter")

# sort chronologically so that the running count below follows time
ectel12_u_ctg <- arrange(ectel12_u_ctg, Tweet.Date..UTC.)
# running number of rows within each tweet type (Relationship2)
ectel12_u_ctg$cumttype_count <- ave(rep(1, nrow(ectel12_u_ctg)),
                                    ectel12_u_ctg$Relationship2, FUN = cumsum)

p4 <- ggplot(ectel12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumttype_count, group = Relationship2, colour = Relationship2)) +
  scale_x_datetime(breaks = date_breaks("1 day"), minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-18", "2012-09-22")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p4 <- p4 + labs(title = "#ectel2012 Tweet Activity") + xlab("Tweet date/time") +
  ylab("Cummulative tweet count") + scale_colour_discrete(name = "Tweet Type") +
  scale_size(guide = "none")
p4 <- p4 + theme(axis.text.x = element_text(size = 8), axis.title.x = element_text(size = 10),
                 axis.text.y = element_text(size = 8), axis.title.y = element_text(size = 10))
# on-screen preview (auto-printed when run interactively)
p4
# Write the figure to disk. print() is explicit because a bare `p4` is not
# auto-printed when the script is run through source(), leaving an empty PNG.
png("ectel12-4.png", width = 900, height = 800, res = 120)
print(p4)
dev.off()

# ========= ECTEL plot time series : cumulative tweets per user category over time
# ectel12_u_ctg (tweets merged with the author's category) is reused from the
# tweet-type section above; the commented lines show how it is built.
#df_twt_category <- read.csv('dbdump_twitter_account.csv',  header = TRUE, sep = ",", colClasses=c("numeric",rep("factor",3) ) )
#twt_cat_ectel12 <- df_twt_category[df_twt_category$conference == 'ectel12_iconf2012',]
#ectel12_u_ctg <- merge(ectel12,twt_cat_ectel12, by.x = "Vertex.1", by.y = "twitter")

# sort chronologically so that the running count below follows time
ectel12_u_ctg <- arrange(ectel12_u_ctg, Tweet.Date..UTC.)
# running number of rows within each user category
ectel12_u_ctg$cumcategory_count <- ave(rep(1, nrow(ectel12_u_ctg)),
                                       ectel12_u_ctg$category, FUN = cumsum)

p5 <- ggplot(ectel12_u_ctg) +
  geom_line(aes(x = Tweet.Date..UTC., y = cumcategory_count, group = category, colour = category), size = 1.2) +
  scale_x_datetime(breaks = date_breaks("1 day"), minor_breaks = date_breaks("2 hour"),
                   limits = as.POSIXct(c("2012-09-18", "2012-09-22")),
                   labels = date_format("%b %d\n%H:%M:%S")) +
  theme_bw()
p5 <- p5 + labs(title = "#ectel2012 Tweet Activity") + xlab("Tweet date/time") +
  ylab("Cummulative tweet count") + scale_colour_discrete(name = "User Category") +
  scale_size(guide = "none")
p5 <- p5 + theme(axis.text.x = element_text(size = 9), axis.title.x = element_text(size = 11),
                 axis.text.y = element_text(size = 9), axis.title.y = element_text(size = 11))
p5 <- p5 + theme(panel.border = element_rect(colour = 'black', size = 1.2, linetype = 'solid'))
# on-screen preview (auto-printed when run interactively)
p5
# Write the figure to disk. print() is explicit because a bare `p5` is not
# auto-printed when the script is run through source(), leaving an empty PNG.
png("ectel12-5.png", width = 900, height = 800, res = 120)
print(p5)
dev.off()

# == Which user category is on the receiving end (mentioned, retweeted, replied to)?
# Here the category is joined on Vertex.2 instead of on Vertex.1.
ectel12_u_ctg_2 <- merge(
  ectel12, twt_cat_ectel12,
  by.x = "Vertex.2", by.y = "twitter"
)
# category (rows) x tweet type (columns), with totals
ectel12_ttype_ucat_rec <- addmargins(
  table(ectel12_u_ctg_2$category, ectel12_u_ctg_2$Relationship2)
)
write.table(
  ectel12_ttype_ucat_rec,
  file = "ectel12_ttype_ucat_rec.csv", sep = ",", row.names = TRUE
)
# For comparison: the table joined on Vertex.2, followed by the same
# cross-tabulation for the data joined on Vertex.1 (ectel12_u_ctg).
ectel12_ttype_ucat_rec
addmargins(
  table(ectel12_u_ctg$category, ectel12_u_ctg$Relationship2)
)
