Optimizing add_trace() in a for loop? - r

I'm using the add_trace() function in a for loop to create lines for a 3d network graph in plotly's scatter3d mode. Each add_trace draws an individual line between two nodes in the network. The method is working, but with large number of loops, the speed of the individual loops seems to be slowing down very quickly.
Example data can be downloaded here: https://gist.github.com/pravj/9168fe52823c1702a07b
library(igraph)
library(plotly)
G <- read.graph("karate.gml", format = c("gml"))
L <- layout.circle(G)
vs <- V(G)
es <- as.data.frame(get.edgelist(G))
Nv <- length(vs)
Ne <- length(es[1]$V1)
Xn <- L[,1]
Yn <- L[,2]
network <- plot_ly(type = "scatter3d", x = Xn, y = Yn, z = rep(0, Ne), mode = "markers", text = vs$label, hoverinfo = "text", showlegend = F)
for(i in 1:Ne) {
v0 <- es[i,]$V1
v1 <- es[i,]$V2
x0 <- Xn[v0]
y0 <- Yn[v0]
x1 <- Xn[v1]
y1 <- Yn[v1]
df <- data.frame(x = c(x0, x1), y = c(y0, y1), z = c(0, 0))
network <- add_trace(network, data = df, x = x, y = y, z = z, type = "scatter3d", mode = "lines", showlegend = F,
marker = list(color = '#030303'), line = list(width = 0.5))
}
This example is fairly quick, but when I include a few hundred edges or more, the execution of the individual loops start to slow down radically. I tried different optimization methods (vectorisation etc), but there seems to be no working around the slowness of the add_trace function itself.
Any suggestions?

The most efficient way to add many line segments in plotly is not as a separate trace each, but to use only a single trace that contains all the line segments. You can do this by constructing a data frame with the x,y coordinates of each node to be connected, interspersed with NA's between each line segment. Then use connectgaps=FALSE to break the trace into separate segments at each NA. You can see another example of this approach, applied to spaghetti plots in this answer.
es$breaks <- NA
lines <- data.frame(node=as.vector(t(es)), x=NA, y=NA, z=0)
lines[which(!is.na(lines$node)),]$x <- Xn[lines[which(!is.na(lines$node)),]$node]
lines[which(!is.na(lines$node)),]$y <- Yn[lines[which(!is.na(lines$node)),]$node]
network <- plot_ly(type = "scatter3d", x = Xn, y = Yn, z = rep(0, Ne),
mode = "markers", text = vs$label, hoverinfo = "text",
showlegend = F) %>%
add_trace(data=lines, x=x, y=y, z=z, showlegend = FALSE,
type = 'scatter3d', mode = 'lines+markers',
marker = list(color = '#030303'), line = list(width = 0.5),
connectgaps=FALSE)
Reproducible data for this question
For convenience, here are the data for this question. The OP required downloading a .gml file from github, and installing library(igraph) to process the data into these.
es <- structure(list(
V1 = c(1, 1, 2, 1, 2, 3, 1, 1, 1, 5, 6, 1, 2, 3, 4, 1, 3, 3, 1, 5, 6, 1, 1, 4, 1, 2, 3, 4, 6, 7, 1, 2, 1, 2,
1, 2, 24, 25, 3, 24, 25, 3, 24, 27, 2, 9, 1, 25, 26, 29, 3, 9, 15, 16, 19, 21, 23, 24, 30, 31, 32, 9, 10, 14, 15, 16, 19, 20,
21, 23, 24, 27, 28, 29, 30, 31, 32, 33),
V2 = c(2, 3, 3, 4, 4, 4, 5, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 11, 11, 11, 12, 13, 13,
14, 14, 14, 14, 17, 17, 18, 18, 20, 20, 22, 22, 26, 26, 28, 28, 28, 29, 30, 30, 31, 31, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33,
33, 33, 33, 33, 33, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34)),
.Names = c("V1", "V2"), row.names = c(NA, -78L), class = "data.frame")
theta <- seq(0,2,length.out=35)[1:34]
Xn <- cospi(theta)
Yn <- sinpi(theta)
Nv <- NROW(Xn)
Ne <- NROW(es)
vs <- data.frame(label = as.character(1:Nv))

Related

I want to calculate a formula in R

I have a dataset that starts like this:
In dput it is
structure(list(20, TRUE, c(0, 0, 1, 1, 1, 1, 2, 3, 4, 4, 4, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 7, 7), c(8, 1, 0, 8, 9, 5,
8, 10, 10, 5, 7, 4, 11, 12, 6, 13, 14, 15, 16, 17, 18, 4, 5,
19, 4, 17), c(1, 0, 2, 5, 3, 4, 6, 7, 9, 10, 8, 11, 14, 12, 13,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25), c(2, 1, 11, 21,
24, 5, 9, 22, 14, 10, 0, 3, 6, 4, 7, 8, 12, 13, 15, 16, 17, 18,
19, 25, 20, 23), c(0, 2, 6, 7, 8, 11, 21, 24, 26, 26, 26, 26,
26, 26, 26, 26, 26, 26, 26, 26, 26), c(0, 1, 2, 2, 2, 5, 8, 9,
10, 13, 14, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26), list(c(1,
0, 1), structure(list(), names = character(0)), list(name = c("1",
"3", "5", "6", "8", "9", "12", "19", "2", "4", "7", "10", "11",
"14", "15", "16", "17", "18", "20", "13")), list(`Number of messages` = c(157,
1058, 2481, 833, 178, 119, 66, 222, 20, 343, 3, 4991, 47, 11,
83, 26, 10, 19, 33, 84, 51, 589, 79, 37, 110, 55))), <environment>), class = "igraph")
so far I have the following codelines:
Datensatz <- read_xlsx("...")
Netzwerkgraph <- graph.data.frame(Datensatz[,1:3], directed = TRUE)
actors<-Datensatz$From
relations<-Datensatz$To
weight<-Datensatz$`Number of messages`
How can I calculate the following formula in R with my data set?
I´ve tried the following code
Function <- function(i,j,x,y,z){
i <- actors
j <- relations
w <- weight
for(i in 1:20)
print (-1/(cumsum 1:length(actors, i)(w,i+1))logb(x,base=2)*1/(cumsum 1:length(actors, i)*w,i+1))
}
It isn't entirely clear how you wish to apply the given formula to your example data set, that is, exactly what inputs you are using and what outputs you wish to achieve. Hence, it also isn't clear if the following approach will be sufficient for your purposes. Here is my interpretation thus far.
If one interprets each unique value in the "from" column as being a node i, then it appears that you wish to calculate the sum of messages to each j in the "to" column for each sender i in the "from" column. One approach might then be to calculate all such sums by sender first and then run them all through a simple function that accepts the sum along with some lambda constant.
I used a lambda value of "2" below arbitrarily for illustrative purposes. Additionally, while the formula references a time t, there does not appear to be a time component in your example data set; time isn't represented in this approach. The output would presumably represent the expression for each node at a single point in time.
#written in R version 4.2.1
require(data.table)
##Example data frame
df = data.frame(from = c(1,1,3,3,3), to = c(2,3,1,2,4),nm = c(157,1058,2481,833,178))
df = data.table(df)
df
from to nm
1: 1 2 157
2: 1 3 1058
3: 3 1 2481
4: 3 2 833
5: 3 4 178
##Calculate the sum of messages by sender in "from" column
nf = df[,sum(nm), by = from]
colnames(nf) = c("from","message_total")
nf
from message_total
1: 1 1215
2: 3 3492
## Function
## inputs to function are the total number of messages of a sender in
## "from" column (called cit) and some lambda constant
icit = function(cit,lambda = 2){
-(1/(cit + lambda))*log(1/((cit + lambda)), base = 2)
}
##Find vector of values for each sender in the data set
ans = NULL
for(i in 1:dim(nf)[1]){
ans[i] = icit(nf$message_total[i])
}
ans
[1] 0.008421622 0.003368822

How to add or annotate Latex formula as annotation in boxplot() in R?

I want to annotate my boxplot (create in Base R) with some text and latex formula's, I tried with $..formula..$, but didn't work. Does anyone know a solution?
i = c(1:20)
X = c(13, 18, 25, 58, 25, 31, 39, 42, 17, 35, 46, 22, 18, 20, 26, 14, 33, 19, 20, 21)
df = data.frame(i, X)
boxplot(df$X, data=df, main="Belminuten Data",
xlab=" ", ylab="Aantal Belminuten",
frame = FALSE,
ylimit = c(10, 60),
range=3)
text(x = c(1.3), y = 60, "n = 20") # n should be in italic or in formula style
text(x = c(.7), y = 23.5, "Med = 23.5")
text(x = c(.7), y = 18.5, "Q_1 = 18.5")
library(latex2exp)
i = c(1:20)
X = c(13, 18, 25, 58, 25, 31, 39, 42, 17, 35, 46, 22, 18, 20, 26, 14,
33, 19, 20, 21)
df = data.frame(i, X)
boxplot(df$X, data=df, main="Belminuten Data",
xlab=" ", ylab="Aantal Belminuten",
frame = FALSE,
ylimit = c(10, 60),
range=3)
text(x = c(1.3), y = 60, TeX('$n = 20$'))
text(x = c(.7), y = 13.0, TeX('$Min = 13$'))
text(x = c(.7), y = 18.5, TeX('$Q_1 = 18.5$'))
text(x = c(.7), y = 23.5, TeX('$Med = 23.5$'))
text(x = c(.7), y = 34.0, TeX('$Q_3 = 34$'))
text(x = c(.7), y = 58.0, TeX('$Max = 58$'))

How to get the perfect "Before-After" graph with connected dots and paired U test using ggplot2?

My data looks like this:
mydata <- data.frame(ID = c(1, 2, 3, 5, 6, 7, 9, 11, 12, 13), #patient ID
t1 = c(37, 66, 28, 60, 44, 24, 47, 44, 33, 47), #evaluation before
t4 = c(33, 45, 27, 39, 24, 29, 24, 37, 27, 42), #evaluation after
sexe = c(1, 2, 2, 1, 1, 1, 2, 2, 2, 1)) #subset
I would like to do a simple before-after graph.
So far, I managed to get this:
With this:
library(ggplot2)
ggplot(mydata) +
geom_segment(aes(x = 1, xend = 2, y = t1, yend = t4), size=0.6) +
scale_x_discrete(name = "Intervention", breaks = c("1", "2"), labels = c("T1", "T4"), limits = c(1, 2)) +
scale_y_continuous(name = "Var") + theme_bw()
I am facing multiple issues, can you help me to...
add black circle at the begining and the end of every line? (geom_point() doesn't work)
make line smoother (look how pixelated they are, especially the second one)?
decrease blank space on left and right side of the graph?
add median for T1 and T4 (in red), link those points, compare them with paired mann whitney test and print p-value on the graph?
I would like not to reformat my database to long format I have a lot of other variable and timepoint (not shown here).
I have read other posts (such as here) but solution provided look so complicated for something that seems simple (yet i can't do it...).
Huge thanks for your help!
I will update the graph along with progression :)
EDIT
I would like not to reformat my database to long format as I have a lot of other variables and timepoints (not shown here)...
Here what i would do! Please feel free to ask questions regarding what's going on here.
library(tidyverse)
mydata <- data.frame(ID = c(1, 2, 3, 5, 6, 7, 9, 11, 12, 13), #patient ID
t1 = c(37, 66, 28, 60, 44, 24, 47, 44, 33, 47), #evaluation before
t4 = c(33, 45, 27, 39, 24, 29, 24, 37, 27, 42), #evaluation after
sexe = c(1, 2, 2, 1, 1, 1, 2, 2, 2, 1))
pval <- wilcox.test(x = mydata$t1,y = mydata$t4, paired = T,exact = F)$p.value %>% round(2)
df <- mydata %>%
pivot_longer(2:3,names_to = "Time") %>% # Pivot into long-format
mutate(sexe = as.factor(sexe),
Time = as.factor(Time)) # Make factors
ggplot(df,aes(Time,value,color = sexe,group = ID)) +
geom_point() +
geom_line() +
stat_summary(inherit.aes = F,aes(Time,value),
geom = "point", fun = "median", col = "red",
size = 3, shape = 24,fill = "red"
) +
annotate("text", x = 1.7, y = 60, label = paste('P-Value is',pval)) +
coord_cartesian(xlim = c(1.4,1.6)) +
theme_bw()
Also be aware that it is common to have some variables which repeat through time, in addition to the long format data. See example here:
mydata <- data.frame(ID = c(1, 2, 3, 5, 6, 7, 9, 11, 12, 13), #patient ID
t1 = c(37, 66, 28, 60, 44, 24, 47, 44, 33, 47), #evaluation before
t4 = c(33, 45, 27, 39, 24, 29, 24, 37, 27, 42), #evaluation after
sexe = c(1, 2, 2, 1, 1, 1, 2, 2, 2, 1),
var1 = c(1:10),
var2 = c(1:10),
var3 = c(1:10))
df <- mydata %>%
pivot_longer(2:3,names_to = "Time") %>% # Pivot into long-format
mutate(sexe = as.factor(sexe),
Time = as.factor(Time))
I can address (1) black circles issue:
First, you should tidy your data, so one column holds information of one variable (now 'Var' values on the plot are stored in two columns: 't1' and 't4'). You can achive this with tidyr package.
library(tidyr)
mydata_long <- pivot_longer(mydata, c(t1, t4), names_to = "t")
Now creating points is easy, and the rest of the code becomes a lot clearer:
We can tell ggplot that we want 't' groups on x-axis, their values on y-axis and in case of lines, we want them separate for every 'ID'.
ggplot(mydata_long) +
geom_line(aes(x = t, y = value, group = ID)) + #ploting lines
geom_point(aes(x = t, y = value)) + #ploting points
labs(x = "Intervention", y = "Var") + #changing labels
theme_bw()

Names cut on the x axis of the boxplot

I'm trying to save a boxplot in .tiif format using the code below:
sample_01 <- c(6, 1, 6, 8, 9, 8, 7, 3, 4, 9)
sample_02 <- c(13, 17, 16, 22, 18, 14, 20, 20, 11, 19)
sample_03 <- c(25, 23, 26, 29, 29, 22, 30, 27, 26, 21)
sample_04 <- c(31, 37, 40, 36, 33, 34, 31, 32, 37, 35)
sample_05 <- c(41, 44, 43, 47, 45, 50, 41, 45, 43, 50)
tiff(file = "temp.tiff", width = 3200, height = 3200, units = "px", res = 300)
box <- boxplot(sample_01,sample_02,sample_03,sample_04,sample_05,
names = c("sample_01","sample_02","sample_03","sample_04","sample_05"),
ylab = 'Relative Abundance (%)',
ylim = c(0,55),
col = c('red','green','blue','orange','purple'),
las=2,
cex.axis = 1.5,
cex.lab = 1.5
)
dev.off()
However, variable names are always cut on the graph's x-axis. I tried to use the parameter par(mar = c ()) in several different ways but I was unable to solve the problem. I also changed the height and width values, but without success either. How can I proceed so that the names of the x-axis are saved whole.
You can set the margin of your plot by using par(mar = c(bottom,left,top,right)).
As pointed it out by #AndersonNBarbosa, par(mar(...) need to be specified after tiff(...):
tiff(file = "temp.tiff", width = 3200, height = 3200, units = "px", res = 300)
par(mar = c(8,5,2,2))
box <- boxplot(sample_01,sample_02,sample_03,sample_04,sample_05,
names = c("sample_01","sample_02","sample_03","sample_04","sample_05"),
ylab = 'Relative Abundance (%)',
ylim = c(0,55),
col = c('red','green','blue','orange','purple'),
las=2,
cex.axis = 1.5,
cex.lab = 1.5
)
dev.off()
dc37, you noticed me a mistake you were making. In my script, I was using the command par(mar=c()) before tiff() and this was causing error in the graph with the example below:
par(mar = c(8,5,2,2))
tiff(...)
boxplot(...)
dev.off()
Therefore, when saving the image, the command par(mar=c()) must come after tiff() to be all right. As I show below:
tiff(...)
par(mar = c(8,5,2,2))
boxplot(...)
dev.off()

Why are my error bars on my graph out of place?

I have a graph that I'm trying to make with ggplot and gridExtra, but my error bars are out of place. I want the error bars to be at the top of each bar, not where they are now. What can I do to correct them?
Also, what ggsave parameters will generate a graph with the same pixel parameters that I am using with the r png base function? ggsave seems to work more consistently than this function, so I need to use it.
Data:
###Open packages###
library(readxl)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
#Dataframes
set1 <- data.frame(type = c(1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3),
flowRate = c(24,
24,
24,
45,
45,
45,
58,
58,
58,
24,
24,
24,
45,
45,
45,
58,
58,
58,
24,
24,
24,
45,
45,
45,
58,
58,
58),
speed = c(0.563120137230256,
0.301721535875508,
0.170683367727845,
0.698874950490133,
0.158488731250147,
0.162788814307903,
0.105943103772245,
0.682354871986346,
0.17945825301837,
0.806637519498752,
0.599304186634932,
0.268788206619179,
0.518615600601962,
0.907628477211427,
0.144209408332705,
0.161586044320138,
0.946354993801663,
0.488881557759483,
0.497120443885793,
0.666120238846602,
0.264813203831783,
0.717007333314455,
0.95119232422312,
0.833669574933742,
0.450082932184122,
0.309570971522678,
0.732874401666482))
set2 <- data.frame(type = c(1,
1,
1,
1,
1,
1,
1,
1,
1,
2,
2,
2,
2,
2,
2,
2,
2,
2,
3,
3,
3,
3,
3,
3,
3,
3,
3),
flowRate = c(24,
24,
24,
45,
45,
45,
58,
58,
58,
24,
24,
24,
45,
45,
45,
58,
58,
58,
24,
24,
24,
45,
45,
45,
58,
58,
58),
speed = c(0.489966876244169,
0.535542121502899,
0.265940150225231,
0.399521957817437,
0.0831661276630631,
0.302201301891001,
0.78194419406759,
0.202331797255324,
0.192182716686147,
0.163038660094618,
0.658020173938572,
0.735633308902771,
0.480982144690572,
0.749452781972296,
0.491759702396918,
0.459610541236644,
0.397660083986082,
0.939983924945833,
0.128956722185581,
0.998492083119223,
0.440514184126494,
0.242917958355044,
0.350643319960552,
0.02613674288471,
0.71625407018877,
0.589325978787179,
0.649116781211748))
Code:
#Standard error of the mean function
sem <- function(x) sd(x)/sqrt(length(x))
#Aggregate dataframes, mean and Standard Error
mean_set1 <- aggregate(set1, by=list(set1$flowRate, set1$speed), mean)
mean_set1 <- select(mean_set1, -Group.1, -Group.2)
mean_set1 <- arrange(mean_set1, type, flowRate)
sem_set1 <- aggregate(set1, by=list(set1$flowRate, set1$speed), sem)
sem_set1 <- as.data.frame(sem_set1)
sem_set1 <- cbind(mean_set1$type, mean_set1$flowRate, sem_set1$Group.2)
sem_set1 <- as.data.frame(sem_set1)
mean_set2 <- aggregate(set2, by=list(set2$flowRate, set2$speed), mean)
mean_set2 <- select(mean_set2, -Group.1, -Group.2)
mean_set2 <- arrange(mean_set2, type, flowRate)
sem_set2 <- aggregate(set2, by=list(set2$flowRate, set2$speed), sem)
sem_set2 <- as.data.frame(sem_set2)
sem_set2 <- cbind(mean_set2$type, mean_set2$flowRate, sem_set2$Group.2)
sem_set2 <- as.data.frame(sem_set2)
#Graph sets
set1_graph <- ggplot(mean_set1, aes(x=type, y=speed, fill=factor(flowRate)))+
geom_bar(stat="identity",width=0.6, position="dodge", col="black")+
scale_fill_discrete(name="Flow Rate")+
xlab("type")+ylab("Speed")+
geom_errorbar(aes(ymin= mean_set1$speed,ymax=mean_set1$speed+sem_set1$V3), width=0.2, position = position_dodge(0.6))
set2_graph <- ggplot(mean_set2, aes(x=type, y=speed, fill=factor(flowRate)))+
geom_bar(stat="identity",width=0.6, position="dodge", col="black")+
scale_fill_discrete(name="Speed")+
xlab("type")+ylab("Flow Rate")+
geom_errorbar(aes(ymin= mean_set2$speed,ymax=mean_set2$speed+sem_set2$V3), width=0.2, position = position_dodge(0.6))
#Grid.arrange and save image
png("image.png", width = 1000, height = 700)
grid.arrange(set1_graph, set2_graph,nrow=1, ncol=2)
dev.off()

Resources