create directed arrow plots of two variables using ggplot2 in R - r

I have two variables (V1,V2) measured on same subject (id) at two time points (timepoint). I want to have a scatterplot with arrow paths to show how values moved from T1 to T2 for the same subject.
In my example, some subjects do not have change in V1 nor V2, it would be ideal to show just as one dot for those sub (sub 1 for example), but I am OK with two dots for two visits, since they will be overlap. There are also sub with a decrease in either V1 or V2 (sub 2 for example), those sub were shown in red arrow above. The third group of subjects show an increase in either V1 or V2 (sub 6 and 7): these sub were in green.
However, what I really need is all arrows point from T1 to T2. That is I hope the green arrow change direction.
The dataset can be generated by:
datatest <- data.frame(timepoint =rep(seq(2,1),8),
id = c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8),
V1= c( 30.29, 30.29, 21.60, 31.43, 20.75,20.75, 21.60, 30.03, 21.60, 31.30, 31.60, 21.72, 31.6, 20.02, 11.60, 20.16),
V2=c(40, 40, 30.78, 41.63, 40.41, 40.41,30.78, 40.97, 20.78, 40.84, 41.85, 41.85, 40.78, 31.79,20.78, 30.23))
which looks like this:
timepoint id V1 V2
1 2 1 30.29 40.00
2 1 1 30.29 40.00
3 2 2 21.60 30.78
4 1 2 31.43 41.63
5 2 3 20.75 40.41
6 1 3 20.75 40.41
7 2 4 21.60 30.78
8 1 4 30.03 40.97
9 2 5 21.60 20.78
10 1 5 31.30 40.84
11 2 6 31.60 41.85
12 1 6 21.72 41.85
13 2 7 31.60 40.78
14 1 7 20.02 31.79
15 2 8 11.60 20.78
16 1 8 20.16 30.23
To generate the (wrong) plot I currently have, please run the codes below:
library(ggplot2)
library(lemon)
ggplot(datatest, aes(V1,V2,color=as.factor(timepoint),group=id)) +ggtitle("V2 vs V1 from T1 to T2")+
geom_pointline(linesize=1, size=2, distance=4, arrow = arrow(angle = 30, length = unit(0.1, "inches"), ends = "first", type = "open") )+
scale_x_continuous(limits = c(0,33), breaks=seq(0,30,10), expand = c(0, 0)) +
scale_y_continuous(limits = c(0,43), breaks=seq(0,44,10),expand = c(0, 0))+
scale_color_manual(values=c("green","red"))+labs(color = "Timepoint")
The plot currently looks like this:
Thank you!

Would this get you closer?
library(dplyr)
library(tidyr)
library(ggplot2)
data <- data.frame(timepoint =rep(seq(2,1),8),
id = c(1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8),
V1= c( 30.29, 30.29, 21.60, 31.43, 20.75,20.75, 21.60, 30.03, 21.60, 31.30, 31.60, 21.72, 31.6, 20.02, 11.60, 20.16),
V2=c(40, 40, 30.78, 41.63, 40.41, 40.41,30.78, 40.97, 20.78, 40.84, 41.85, 41.85, 40.78, 31.79,20.78, 30.23))
data <- data %>%
mutate(row_id = paste0("T", timepoint)) %>%
pivot_wider(id_cols = id,
names_from = row_id,
values_from = c(V1, V2)) %>%
mutate(colour = ifelse((V1_T1 > V1_T2) | (V2_T1 > V2_T2), "red", "green"))
ggplot(data = data) +
geom_point(aes(x = V1_T1, y = V2_T1)) +
geom_point(aes(x = V1_T2, y = V2_T2)) +
geom_segment(aes(x = V1_T1, xend = V1_T2, y = V2_T1 , yend = V2_T2, colour = colour),
arrow = arrow(length = unit(0.3,"cm"))) +
scale_x_continuous(
limits = c(0, 33),
breaks = seq(0, 30, 10),
expand = c(0, 0)
) +
scale_y_continuous(
limits = c(0, 43),
breaks = seq(0, 44, 10),
expand = c(0, 0)
)
You can filter the object data to remove those lines where V1 and V2 do not change and not draw the lines with length zero.

Related

data restructure for ggplot geom_bar() stacked bar plotting

avg data frame
structure(list(cluster = 1:10, `B cells` = c(0.0369711424087593,
0.00526325696315245, 0.0601665087700304, 0.0231936137674591,
0.00766480549892195, 0.0285649960414246, 0.0044030329888148,
0.00345795624392323, 0.00309644760567017, 0.00757469580646642
), DCreg = c(0.0304752063136609, 0.174423402403555, 0.0163287878795231,
0.0192154395050034, 0.124511133655915, 0.0296144152010606, 0.205920199256583,
0.114542510479173, 0.485649315606826, 0.0260997195368302), `Dendritic cells` = c(0.156500506395882,
0.0106345235402551, 0.185348445999056, 0.395476210792188, 0.0719924126421944,
0.104614178324861, 0.0226961213600642, 0.00292885066859525, 0.0122661582750054,
0.118394797602606), `Dendritic cells CD103` = c(0.0482626330670718,
0.0140976438812366, 0.030373962919268, 0.0614351282717271, 0.189884617234425,
0.35658217311524, 0.0170390739879794, 0.0042469791834164, 0.0233514821789908,
0.0619204360724114), Endothelium = c(0.11337268119519, 0.027025412632833,
0.43869939276274, 0.0662483745710424, 0.0331520081202891, 0.164940771021627,
0.050135082662031, 0.00351285357934976, 0.0201434603120533, 0.0658151087814588
), Epithelium = c(0.00418217375070304, 0.000413203430326014,
0.0104665752013841, 0.00525017082076173, 0.00415698684351819,
0.0333637286413386, 0.000431569929321054, 0, 0.0011976402913935,
0.000419107154908937), Fibroblasts = c(0.00612607297867521, 0.0116371963351148,
0.0108995123396445, 0.0117009481628146, 0.00674570810846355,
0.0145571600114712, 0.0120879220427041, 0.00272604244680674,
0.00772202564316953, 0.0272894372187893), `Macrophages other` = c(0.00101589948056542,
0.000645130694683314, 0, 0, 0.000639755622911849, 0, 0.000197788594031649,
0.00136588418173722, 0, 0.000420171738310913), `Macrophages type 1` = c(0.221136736926214,
0.0101728310491049, 0.0295121583899105, 0.0455316207473085, 0.0230660380060092,
0.0222078529371378, 0.015179095607796, 0.00459851371158574, 0.0112212936162074,
0.02937463664781), `Macrophages type 2` = c(0.0411011962682536,
0.0522714029078864, 0.012334445025602, 0.0568282306829578, 0.0453391303748083,
0.0181451496347937, 0.239616155787136, 0.0115489617356957, 0.04981525808734,
0.462030477544264), Neutrophils = c(0.0766806635700175, 0.00442125133471751,
0.0476726698091672, 0.0236749605376406, 0.00911361867045396,
0.0236169696110325, 0.00537803767758349, 0.0032239571528306,
0.00201957474248881, 0.0160311845078706), `NK cells` = c(0, 0,
0.000108464194313773, 0, 0, 8.99698299254026e-05, 0.000114169258081956,
0, 4.57749702462694e-05, 2.78396436525612e-05), `T cells CD4` = c(0.0330641154468336,
0.0213946654236908, 0.0323515137814534, 0.148686432010321, 0.0500449048718068,
0.0685338874314457, 0.0273478878575203, 0.00472971607890761,
0.0328998359523529, 0.0354818425253482), `T cells CD8` = c(0.0172498783937768,
0.00877876825324442, 0.0156948623402281, 0.0207354640030442,
0.0145536348676947, 0.0146643634343241, 0.0155197086731341, 0.00171509323694132,
0.0135851481885585, 0.0159896002840603), `T reg cells` = c(0.00451599932441037,
0.0058712074137469, 0.00274652046695111, 0.0167445990360021,
0.0127422536359504, 0.0142171857157357, 0.00996063310868601,
0.00089148571457417, 0.0113706843090688, 0.00663049091849752),
Tumour = c(0.0765887917753441, 0.651476092235795, 0.0173767962070959,
0.0647526184622169, 0.395840854655601, 0.0472273714361081,
0.368387800802699, 0.839842321316499, 0.323145170321728,
0.111585860905902), Unclassified = c(0.132756302704642, 0.00147401150065844,
0.0899193839136316, 0.0405261886295129, 0.0105521371910369,
0.0590598276124738, 0.00558572040583437, 0.000668874269964592,
0.00247072989889988, 0.0149145931108126)), class = "data.frame", row.names = c(NA,
-10L))
cluster B cells DCreg Dendritic cells Dendritic cells CD103 Endothelium Epithelium Neutrophils NK cells T cells CD4 T cells CD8 T reg cells Tumour Unclassified
1 1 0.036971142 0.03047521 0.156500506 0.048262633 0.113372681 0.0041821738 0.076680664 0.000000e+00 0.033064115 0.017249878 0.0045159993 0.07658879 0.1327563027
2 2 0.005263257 0.17442340 0.010634524 0.014097644 0.027025413 0.0004132034 0.004421251 0.000000e+00 0.021394665 0.008778768 0.0058712074 0.65147609 0.0014740115
3 3 0.060166509 0.01632879 0.185348446 0.030373963 0.438699393 0.0104665752 0.047672670 1.084642e-04 0.032351514 0.015694862 0.0027465205 0.01737680 0.0899193839
4 4 0.023193614 0.01921544 0.395476211 0.061435128 0.066248375 0.0052501708 0.023674961 0.000000e+00 0.148686432 0.020735464 0.0167445990 0.06475262 0.0405261886
5 5 0.007664805 0.12451113 0.071992413 0.189884617 0.033152008 0.0041569868 0.009113619 0.000000e+00 0.050044905 0.014553635 0.0127422536 0.39584085 0.0105521372
6 6 0.028564996 0.02961442 0.104614178 0.356582173 0.164940771 0.0333637286 0.023616970 8.996983e-05 0.068533887 0.014664363 0.0142171857 0.04722737 0.0590598276
7 7 0.004403033 0.20592020 0.022696121 0.017039074 0.050135083 0.0004315699 0.005378038 1.141693e-04 0.027347888 0.015519709 0.0099606331 0.36838780 0.0055857204
8 8 0.003457956 0.11454251 0.002928851 0.004246979 0.003512854 0.0000000000 0.003223957 0.000000e+00 0.004729716 0.001715093 0.0008914857 0.83984232 0.0006688743
9 9 0.003096448 0.48564932 0.012266158 0.023351482 0.020143460 0.0011976403 0.002019575 4.577497e-05 0.032899836 0.013585148 0.0113706843 0.32314517 0.0024707299
10 10 0.007574696 0.02609972 0.118394798 0.061920436 0.065815109 0.0004191072 0.016031185 2.783964e-05 0.035481843 0.015989600 0.0066304909 0.11158586 0.0149145931
I have the above data frame and am trying to create a stacked bar using ggplot geom_bar() where each bar = 1 cluster (10 clusters, so 10 bars) and each bar is filled with the proportions of each cell type contributing to a cluster (proportion values for each cluster add up to 1).
I have started by changing the layout of the data :
avgt = avg %>% pivot_longer(cols = -cluster)
Which gave me this layout:
cluster name value
1 1 B cells 0.0370
2 1 DCreg 0.0305
3 1 Dendritic cells 0.157
4 1 Dendritic cells CD103 0.0483
5 1 Endothelium 0.113
6 1 Epithelium 0.00418
7 1 Fibroblasts 0.00613
8 1 Macrophages other 0.00102
9 1 Macrophages type 1 0.221
10 1 Macrophages type 2 0.0411
However I am not sure what to do next as if I use the 'cluster' column as X and 'name' column for the 'fill' I, as expected, get equal proportions for each cell type
p = ggplot(avgt, aes(x = as.factor(cluster), fill = as.factor(name)))+
geom_bar(position = "fill") +
theme_classic()+
scale_y_continuous(labels = scales::percent) +
coord_flip() +
theme(axis.text.y = element_text(size = 20),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20),
axis.text=element_text(size=20)) +
theme(legend.text = element_text(size = 20)) +
xlab("Community")+
ylab("Percentage distribution") +
labs( fill = "")
p
geom_bar() stacked plot result
Any ideas of how I can get this to work?
Thanks in advance

plotly: Map size (shape) to value colum in scatterplot

I really appreaciate the 'plotly' r-package. Currently I run into an issue, where I want to visualize a data frame as points and map the point size (as well as the shape potentially) to a dimension of the data frame.
The problem I run into with my own dataset is, that the sizes are somehow "mixed up" in the sense, that the bigger points don't correspond to the bigger values.
I haven't fully understood the options I have with plotly (sizeref and other marker-options; the fundamental difference between mapping the dimension directly or in the marker arguments; etc) , so this is my best shot as a minimal example right here.
(The second plot is closer to what I currently do. If this one could be fixed, it would be preferable to me)
Your thoughts are greatly appreciated. :)
library(plotly)
set.seed(1)
df <- data.frame(x = 1:10,
y = rep(c("id1", "id2"), 5),
col = factor(sample(3, 10, replace = TRUE)))
df$size <- c(40, 40, 40, 30, 30, 30, 20, 20, 20, 10)
df
#> x y col size
#> 1 1 id1 1 40
#> 2 2 id2 2 40
#> 3 3 id1 2 40
#> 4 4 id2 3 30
#> 5 5 id1 1 30
#> 6 6 id2 3 30
#> 7 7 id1 3 20
#> 8 8 id2 2 20
#> 9 9 id1 2 20
#> 10 10 id2 1 10
# Mapping looks right, but the size may not be correct
plot_ly(df,
x = ~x,
y = ~y,
color = ~col,
size = ~size,
type = 'scatter',
mode = 'markers',
hoverinfo = "text",
text = ~paste('</br> x: ', x,
'</br> y: ', y,
'</br> col: ', col,
'</br> size: ', size)
# , marker = list(size = ~size)
)
# Size looks right, but mapping to points is wrong
plot_ly(df,
x = ~x,
y = ~y,
color = ~col,
# size = ~size,
type = 'scatter',
mode = 'markers',
hoverinfo = "text",
text = ~paste('</br> x: ', x,
'</br> y: ', y,
'</br> col: ', col,
'</br> size: ', size)
, marker = list(size = ~size)
)
devtools::session_info() # excerpt
#> plotly * 4.8.0

r plotly chart based on multiple columns [duplicate]

This question already has answers here:
Format axis tick labels to percentage in plotly
(2 answers)
Closed 2 years ago.
I have a df which can have 2 or more columns with the first one month always fixed.I am trying to plot them using plotly r. As of now it has three columns: month,apple,orange. Based on analysis it can have another column banana. Below is the code I am using right now but it even takes the column month for y-axis. How do I fix this:
> sample_test
month apple orange
2 Aug-17 2 1
3 Dec-17 2 1
4 Feb-18 2 1
5 Jan-18 2 1
6 Jul-17 2 1
7 Jun-17 2 1
8 May-17 2 1
9 Nov-17 2 1
10 Oct-17 2 1
11 Sep-17 2 1
p<- plot_ly(sample_test, x = sample_test$month, name = 'alpha', type = 'scatter', mode = 'lines',
line = list(color = 'rgb(24, 205, 12)', width = 4)) %>%
layout(#title = "abbb",
xaxis = list(title = "Time"),
yaxis = list (title = "Percentage"))
for(trace in colnames(sample_test)){
p <- p %>% plotly::add_trace(y = as.formula(paste0("~`", trace, "`")), name = trace)
}
p
The output looks like this :
Does this help?
sample_test <- read.table(
text = ' month apple orange
2 Aug-17 2 1
3 Dec-17 2 1
4 Feb-18 2 1
5 Jan-18 2 1
6 Jul-17 2 1
7 Jun-17 2 1
8 May-17 2 1
9 Nov-17 2 1
10 Oct-17 2 1
11 Sep-17 2 1'
)
sample_test$month <- as.Date(paste('01', sample_test$month, sep = '-'), format = '%d-%b-%y')
library(plotly)
p <- plot_ly(sample_test, type = 'scatter', mode = 'lines',
line = list(color = 'rgb(24, 205, 12)', width = 4)) %>%
layout(#title = "abbb",
xaxis = list(title = "Time"),
yaxis = list (title = "Percentage", tickformat = '%'))
for(trace in colnames(sample_test)[2:ncol(sample_test)]){
p <- p %>% plotly::add_trace(x = sample_test[['month']], y = sample_test[[trace]], name = trace)
}
p
There are couple of things to note here -
While dealing with dates, it's best to format them as dates. This can save a lot of headache later on. It is also useful as most if not all functions that require dealing with dates have methods built to handle them.
While adding traces in a for loop, always reference the vector to be plotted explicitly like data$vector or data[['vector']] and not like y = ~vector, because plotly for some reason ends up plotting just one trace over and over again.
You can specify a trace for the first y element, which will give you your raw counts. Next you can add a format for your y-axis using tickformat, which will convert to percentages.
sample_test <- data.frame(month = c("Aug-17", "Dec-17", "Feb-18"), apple = c(2,2,2), orange = c(1,1,1))
p <- plot_ly(sample_test, x = sample_test$month, y = ~apple, name = 'alpha', type = 'scatter', mode = 'lines',
line = list(color = 'rgb(24, 205, 12)', width = 4)) %>%
layout(xaxis = list(title = "Time")) %>%
layout(yaxis = list(tickformat = "%", title = "Percentage"))
Although for some reason this appears to just multiply by 100 and add a % label for some reason, rather than actually calculate a percentage. From this SO answer, looks like that's all it does. I don't really use plotly, but in ggplot you can do this if you reshape your data to long and map your categorical variable (in this case fruit) as a percent.
Edit: per OP's comment, removed month from being traced.
p <- plot_ly(type = 'scatter', mode = 'lines') %>%
layout(yaxis = list(tickformat = "%", title = "Percentage"))
colNames <- names(sample_test)
colNames <- colNames[-which(colNames == 'month')]
for(trace in colNames){
p <- p %>% plotly::add_trace(data = sample_test, x = ~ month, y = as.formula(paste0("~`", trace, "`")), name = trace)
print(paste0("~`", trace, "`"))
}
p

Multiple line chart using plotly r

I have a data frame which I am trying to plot using plotly as multiple line chart.Below is how the dataframe looks like:
Month_considered pct.x pct.y pct
<fct> <dbl> <dbl> <dbl>
1 Apr-17 79.0 18.4 2.61
2 May-17 78.9 18.1 2.99
3 Jun-17 77.9 18.7 3.42
4 Jul-17 77.6 18.5 3.84
5 Aug-17 78.0 18.3 3.70
6 Sep-17 78.0 18.9 3.16
7 Oct-17 77.6 18.9 3.49
8 Nov-17 77.6 18.4 4.01
9 Dec-17 78.5 18.0 3.46
10 Jan-18 79.3 18.4 2.31
11 2/1/18 78.9 19.6 1.48
When I iterate through to plot multiple lines below is the code used.
colNames <- colnames(delta)
p <-
plot_ly(
atc_seg_master,
x = ~ Month_considered,
type = 'scatter',
mode = 'line+markers',
line = list(color = 'rgb(205, 12, 24)', width = 4)
)
for (trace in colNames) {
p <-
p %>% plotly::add_trace(y = as.formula(paste0("~`", trace, "`")), name = trace)
}
p %>%
layout(
title = "Trend Over Time",
xaxis = list(title = ""),
yaxis = list (title = "Monthly Count of Products Sold")
)
p
This is how the output looks like
My question is how to remove trace 0 and month_considered to remove from the chart even though its not in colnames which I loop through to add the lines.
It looks like you were getting tripped up by two things:
When you initially defined p and included the data and x arguments, a trace was created -- trace 0. You can define a plot without providing any data or x values to start by just using p <- plot_ly() along with any desired layout features.
When you loop through the column names, your x axis column, Month_Considered is part of the set. You can exclude this by using setdiff() (part of base R) to create a vector with all of your column names except for Months_Considered
Putting those two things together, one way (of many possible) to accomplish what you're going for is as follows:
library(plotly)
df <- data.frame(Month_Considered = seq.Date(from = as.Date("2017-01-01"), by = "months", length.out = 12),
pct.x = seq(from = 70, to = 80, length.out = 12),
pct.y = seq(from = 30, to = 40, length.out = 12),
pct = seq(from = 10, to = 20, length.out = 12))
## Define a blank plot with the desired layout (don't add any traces yet)
p <- plot_ly()%>%
layout(title = "Trend Over Time",
xaxis = list(title = ""),
yaxis = list (title = "Monthly Count of Products Sold") )
## Make sure our list of columns to add doesnt include the Month Considered
ToAdd <- setdiff(colnames(df),"Month_Considered")
## Add the traces one at a time
for(i in ToAdd){
p <- p %>% add_trace(x = df[["Month_Considered"]], y = df[[i]], name = i,
type = 'scatter',
mode = 'line+markers',
line = list(color = 'rgb(205, 12, 24)', width = 4))
}
p

Draw a graph in R with header elaborate on two columns

I have a table with header expanded on two columns. How to draw a 3D graph on this table OR what would be a way to draw a graph on tables having elaborated headers. Kindly suggest me alternate ways to achieve this (if any)
Crime Table:
year
2014 2015 2016
Reported Detected Reported Detected Reported Detected
Murder 221 208 178 172 26 20
Murder(Gain) 20 16 11 9 1 1
Dacoity 51 45 44 36 5 1
Robbery 538 316 351 201 23 10
Chain Snatching 528 394 342 229 23 0
Code:
library(tables)
#CLASS 1 CRIMES 2014
c14 <- structure(list(`Reported` = c(221, 20, 51,
538, 528), `Detected` = c(208, 16, 45, 316, 394)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity", "Robbery", "Chain Snatching"), class = "data.frame")
c14
#CLASS 1 CRIMES 2015
c15 <- structure(list(`Reported` = c(178, 11, 44,
351, 342), `Detected` = c(172, 9,
36, 201, 229)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity",
"Robbery", "Chain Snatching"), class = "data.frame")
c15
#CLASS 1 CRIMES 31-01-2016
c16 <- structure(list(`Reported` = c(26, 1, 5,
23, 23), `Detected` = c(20, 1,
1, 10, 0)), .Names = c("Reported",
"Detected"), row.names = c("Murder", "Murder(Gain)", "Dacoity",
"Robbery", "Chain Snatching"), class = "data.frame")
c16
# rbind with rownames as a column
st <- rbind(
data.frame(c14, year = '2014', what = factor(rownames(c14), levels = rownames(c14)),
row.names= NULL, check.names = FALSE),
data.frame(c15,year = '2015',what = factor(rownames(c15), levels = rownames(c15)),
row.names = NULL,check.names = FALSE),
data.frame(c16,year = '2016',what = factor(rownames(c16), levels = rownames(c16)),
row.names = NULL,check.names = FALSE)
)
crimetable <- tabular(Heading()*what ~ year*(`Reported` +`Detected`)*Heading()*(identity),data=st)
crimetable
As I hate 3D plots for 3-way tables and I like ggplot2, I suggest this:
Gather your data into "long" format:
library(tidyr)
st_long = gather(st, type, count, -c(year, what))
head(st_long, 3)
# year what type count
# 1 2014 Murder Reported 221
# 2 2014 Murder(Gain) Reported 20
# 3 2014 Dacoity Reported 51
As you can see, both Detected and Reported columns are now included in the same column called type. This is useful for ggplot2, as it can easily create facets. Facets are separate elements within the plot that share the same aesthetic components but work with on different groups of data:
library(ggplot2)
ggplot(st_long, aes(year, count, group = what, color = what)) +
geom_line() +
facet_wrap(~ type)
(I am not saying that line plot is the only/best plot here, but it is often used when comparing frequencies across different time-points.)

Resources