Reorder not working in ggplot with multiple facets - r

I have the data below:
LETTER ID NUMBER
1 A 805qhau1hbnm1 0.001
2 A 47s11wwxy8x7c 0.521
3 A 92g6022uvxtmf 0.036
4 A 92pkgg5y0gvkk 0.002
5 B gxx44abszy02j 0.066
6 B agupupsu0gq26 0.001
7 B 92g6022uvxtmf 0.003
8 B 92g6022uvxtmf 0.003
9 B agupupsu0gq26 0.004
10 B dwvprfgxafqct 0.058
11 B 92pkgg5y0gvkk 0.161
12 B 2264vrpp4b02v 0.444
13 B 92g6022uvxtmf 0.084
14 B 1ypga6ay26dyk 0.018
15 B 9tkrv34jdmvtk 0.414
16 B agupupsu0gq26 0.001
17 B agupupsu0gq26 0.002
18 B gxx44abszy02j 0.065
19 B 0mtz8hnvvm63r 0.012
20 B 9ta79k8xtyzdy 0.006
21 B 92g6022uvxtmf 0.014
22 A 47s11wwxy8x7c 0.539
23 A 92g6022uvxtmf 0.028
24 A 92pkgg5y0gvkk 0.003
25 A 92pkgg5y0gvkk 0.002
26 A 805qhau1hbnm1 0.001
27 A fmubqnkxnj16f 0.451
28 B 448pxv1p0ffjp 0.040
29 B 3cj2kj0rx311k 0.012
30 B 9ta79k8xtyzdy 0.006
31 B gxx44abszy02j 0.064
32 B agupupsu0gq26 0.002
33 B agupupsu0gq26 0.001
34 A 92pkgg5y0gvkk 0.002
35 A 65a353h1x9yfd 0.055
36 B dbrx980zu7bmk 0.009
And I have the ggplot code below:
# Bar chart of NUMBER per ID with one facet per LETTER; the x axis is ordered
# with reorder(ID, NUMBER, sum).
# NOTE(review): reorder() is evaluated once on the whole data set, so each ID is
# ranked by its NUMBER total across every LETTER, not by its total inside a
# single facet. With the data above, 92pkgg5y0gvkk totals 0.009 within A but
# 0.170 overall, which matches the "fourth instead of second" symptom.
l_myPlot <- ggplot(data=l_data, aes(x=reorder(x=ID, X=NUMBER, sum, order=T), y = NUMBER))+
geom_bar(stat='identity' )+
# scales="free_x" lets each facet show only the IDs that occur in it
facet_wrap(~ LETTER, scales="free_x")+
# rotate the long ID labels so they do not overlap
theme(axis.text.x=element_text(angle=90, hjust=1))+
scale_y_continuous()
As you can see, I am reordering the x axis by the sum of the NUMBER column. The issue is that the sorting is not being done properly: in the A facet, ID 92...vkk (92pkgg5y0gvkk) should be the second bar in the order, not the fourth.

My approach is to use factors to order a new identifier created out of LETTER and ID and then use scale_x_discrete(labels =) to alter the x axis labels.
library(ggplot2)
library(dplyr)

# Summarise the data: one row per (LETTER, ID) pair holding the total NUMBER.
# summarise() collapses each group directly, so no de-duplication step is
# needed, and ungroup() leaves a plain (ungrouped) table behind.
ld <- l_data %>%
  group_by(LETTER, ID) %>%
  summarise(sum = sum(NUMBER)) %>%
  ungroup() %>%
  # Sort in the order the bars should appear: by facet, then ascending total
  arrange(LETTER, sum)

# Build an identifier that is unique per facet (LETTER + ID) and freeze the
# sorted row order into its factor levels; ggplot draws bars in level order.
ld$new_ID <- factor(paste(ld$LETTER, ld$ID),
                    levels = paste(ld$LETTER, ld$ID))

# Axis labels: show the plain ID instead of the combined key. The vector is
# named by new_ID so every break is matched to its label by name rather than
# by position.
id_labels <- setNames(as.character(ld$ID), ld$new_ID)

# Plot
l_myPlot <- ggplot(ld, aes(x = new_ID, y = sum)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ LETTER, scales = "free_x") +
  scale_x_discrete(labels = id_labels) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_y_continuous()
l_myPlot

Related

sorting data in geom_tile()

My data:
# Take the first ten rows and columns 2 to 4 of `bothdata`, then melt to long
# format using Entrez.Symbol as the identifier column.
GFP_mywide <- bothdata[seq_len(10), 2:4]
GFP_long <- melt(GFP_mywide, id = "Entrez.Symbol")
looks like this:
>GFP_long
Entrez.Symbol variable value
1 TRIP11 GFP_my 1.015
2 SIN3B GFP_my 0.336
3 SF3B1 GFP_my 0.315
4 PSMD14 GFP_my 0.254
5 RAD51 GFP_my 0.286
6 BARD1 GFP_my 0.157
7 BRCA1 GFP_my 0.275
8 BRCA1 GFP_my 0.230
9 U5200KD GFP_my 0.772
10 SETD5 GFP_my 0.364
11 TRIP11 GFP_wide 0.020
12 SIN3B GFP_wide 0.055
13 SF3B1 GFP_wide 0.071
14 PSMD14 GFP_wide 0.102
15 RAD51 GFP_wide 0.109
16 BARD1 GFP_wide 0.139
17 BRCA1 GFP_wide 0.146
18 BRCA1 GFP_wide 0.146
19 U5200KD GFP_wide 0.151
20 SETD5 GFP_wide 0.179
I want to create a heatmap where the values are sorted according to GFP_wide, so that in the plot the GFP_wide row goes gradually from green to red, and the GFP_my row is ordered by the same Entrez.Symbol order. So far I have a heatmap, but I can't find a way to sort the values based on GFP_wide. How do I do that?
This is my code and result:
# Heatmap: one tile per (Entrez.Symbol, variable) pair, filled by value on a
# gradient running from red (low) to green (high).
ggplot(
  data = GFP_long,
  mapping = aes(x = Entrez.Symbol, y = variable, fill = value)
) +
  geom_tile(mapping = aes(fill = value)) +
  scale_fill_gradient(low = "red", high = "green")
Using the data from your other question:
# please check these load successfully
library(dplyr)
library(forcats)

# Sort with the GFP_wide rows first (descending `variable`) and, within each
# variable, from the largest value down. fct_inorder() then turns the symbols
# into a factor whose levels follow that row order, which fixes the x axis.
GFP_sorted <- mutate(
  arrange(GFP_long, desc(variable), desc(value)),
  Entrez.Symbol = fct_inorder(Entrez.Symbol)
)

ggplot(GFP_sorted, aes(x = Entrez.Symbol, y = variable, fill = value)) +
  geom_tile(aes(fill = value)) +
  # I'm modifying here to show the top row is ordered; hard to see in original
  scale_fill_gradient2(
    low = "red", mid = "gray80", high = "green", midpoint = 0.2
  )
Data: how to order column by a different column value r
# Example data in long format: ten gene symbols measured under two variables
# (GFP_my and GFP_wide), one row per (Entrez.Symbol, variable) pair.
# Columns are named with `=` inside data.frame(); writing `<-` there would
# assign three stray variables in the calling environment and leave the
# columns with mangled names that then have to be patched via colnames().
GFP_long <- data.frame(
  Entrez.Symbol = c(
    "TRIP11", "SIN3B", "SF3B1", "PSMD14", "RAD51",
    "BARD1", "BRCA1", "BRCA1", "U5200KD", "SETD5",
    "TRIP11", "SIN3B", "SF3B1", "PSMD14", "RAD51",
    "BARD1", "BRCA1", "BRCA1", "U5200KD", "SETD5"
  ),
  # first ten rows are GFP_my, last ten are GFP_wide
  variable = rep(c("GFP_my", "GFP_wide"), each = 10),
  value = c(
    1.015, 0.336, 0.315, 0.254, 0.286, 0.157, 0.275, 0.230, 0.772, 0.364,
    0.020, 0.055, 0.071, 0.102, 0.109, 0.139, 0.146, 0.146, 0.151, 0.179
  )
)

How to graph an ANOVA w/ conditional p-values and confidence intervals in R?

I have a data frame (shown here as df1; the code below refers to it as df):
Rate Dogs MHI_2018 Points Level AGE65_MORE P_Elderly
1 0.10791173 0.00000000 59338 236.4064 C 8653 15.56267
2 0.06880040 0.00000000 57588 229.4343 C 44571 20.44335
3 0.08644537 0.00000000 50412 200.8446 C 10548 18.23651
4 0.29591635 0.00000000 29267 116.6016 A 1661 16.38390
5 0.05081301 0.00000000 37365 148.8645 B 3995 20.29980
6 0.02625200 0.00000000 45400 180.8765 D 20247 17.71748
7 0.80321285 0.02974862 39917 159.0319 D 6562 19.52105
8 0.07682852 0.00000000 42132 167.8566 D 5980 22.97173
9 0.18118814 0.00000000 47547 189.4303 B 7411 16.78482
10 0.07787555 0.00000000 39907 158.9920 B 2953 22.99665
11 0.15065913 0.00000000 39201 156.1793 C 2751 20.72316
12 0.33362247 0.00000000 46495 185.2390 B 2915 19.45019
13 0.03652168 0.00000000 49055 195.4382 B 10914 19.92988
14 0.27998133 0.00000000 42423 169.0159 A 2481 23.15446
15 0.05407451 0.00000000 40203 160.1713 A 7790 21.06202
16 0.07233796 0.00000000 39057 155.6056 A 2629 19.01765
17 0.08389061 0.00000000 45796 182.4542 B 15446 18.51106
18 0.05220569 0.00000000 34035 135.5976 B 6921 18.06578
19 0.05603418 0.00000000 39491 157.3347 B 12322 17.26133
20 0.15875536 0.00000000 60367 240.5060 C 12400 15.14282
With
# One-way ANOVA of Rate by Level, followed by Tukey's HSD pairwise comparisons.
AOV <- aov(formula = Rate ~ Level, data = df)
TukeyHSD(x = AOV)
$Level
diff lwr upr p adj
B-A -0.066558621 -0.3783957 0.2452784 0.9272012
C-A -0.061063140 -0.4026635 0.2805372 0.9551663
D-A 0.126520253 -0.2624089 0.5154494 0.7890519
C-B 0.005495482 -0.2848090 0.2958000 0.9999404
D-B 0.193078874 -0.1516699 0.5378277 0.4049948
D-C 0.187583392 -0.1843040 0.5594708 0.4923479
I would now like to make a plot of this data with means and confidence intervals. I would also like to plot the p_adj between variables if it is < 0.50. Output would look like:
One solution is to use ggsignif package but first you need to prepare the output of TukeyHSD for its use in ggsignif:
library(tidyverse)

# Fit the one-way ANOVA and extract the Tukey HSD table for the Level factor.
# Its row names encode each comparison as "<group>-<group>", e.g. "B-A".
AOV <- aov(Rate ~ Level, data = df)
tukey_tbl <- as.data.frame(TukeyHSD(AOV)$Level)

# Largest observed Rate per Level; used to place brackets above the points.
MAX <- df %>%
  group_by(Level) %>%
  summarise(Max = max(Rate))

# One row per pairwise comparison with the two group names, each group's
# maximum Rate, and a bracket height `ypos`.
T1 <- tukey_tbl %>%
  rownames_to_column("Group") %>%
  mutate(
    # Split the comparison label on the hyphen instead of taking the first and
    # last character, so level names longer than one character also work.
    Start = sub("-.*$", "", Group),
    End = sub("^.*-", "", Group)
  ) %>%
  left_join(MAX, by = c("Start" = "Level")) %>%
  left_join(MAX, by = c("End" = "Level")) %>%
  mutate(End = factor(End)) %>%
  # rowwise() makes max() compare the two maxima of each row separately
  rowwise() %>%
  # Raise the bracket by 25% per level of End so brackets do not overlap
  mutate(ypos = max(Max.x, Max.y) * (1 + 0.25 * as.numeric(End)))
Source: local data frame [6 x 10]
Groups: <by row>
# A tibble: 6 x 10
Group diff lwr upr `p adj` Start End Max.x Max.y ypos
<chr> <dbl> <dbl> <dbl> <dbl> <chr> <fct> <dbl> <dbl> <dbl>
1 B-A -0.0666 -0.378 0.245 0.927 B A 0.334 0.296 0.417
2 C-A -0.0611 -0.403 0.281 0.955 C A 0.159 0.296 0.370
3 D-A 0.127 -0.262 0.515 0.789 D A 0.803 0.296 1.00
4 C-B 0.00550 -0.285 0.296 1.00 C B 0.159 0.334 0.500
5 D-B 0.193 -0.152 0.538 0.405 D B 0.803 0.334 1.20
6 D-C 0.188 -0.184 0.559 0.492 D C 0.803 0.159 1.41
Now, you can plot your data and add significance based on the T1 dataset:
library(ggsignif)
library(ggplot2)

# Raw points per Level, the group mean with its confidence interval in red,
# and a bracket for every Tukey comparison in T1 whose adjusted p-value is
# below 0.5.
ggplot(df, aes(x = Level, y = Rate)) +
  geom_jitter(width = 0.2) +
  # Vertical red bar: confidence interval of the mean (mean_cl_normal)
  stat_summary(
    fun.data = "mean_cl_normal", geom = "errorbar",
    width = 0, color = "red"
  ) +
  # Horizontal red tick at the mean: an error bar with ymin == ymax.
  # after_stat(y) replaces the deprecated `..y..` notation.
  stat_summary(
    fun = "mean", geom = "errorbar",
    aes(ymax = after_stat(y), ymin = after_stat(y)),
    color = "red", width = 0.5
  ) +
  # manual = TRUE: bracket ends, height and label are read from T1's columns
  geom_signif(
    data = subset(T1, `p adj` < 0.5), manual = TRUE,
    aes(
      xmax = End, xmin = Start, y_position = ypos,
      annotations = round(`p adj`, 3)
    )
  )

How to calculate the Bonferroni Lower and Upper limits in R?

With the following data, I am trying to calculate the Chi Square and the Bonferroni lower and upper confidence intervals. The column "Data_No" identifies the dataset (as the calculations need to be done separately for each dataset).
Data_No Area Observed
1 3353 31
1 2297 2
1 1590 15
1 1087 16
1 817 2
1 847 10
1 1014 28
1 872 29
1 1026 29
1 1215 21
2 3353 31
2 2297 2
2 1590 15
3 1087 16
3 817 2
The code I used is
# Question code as posted: per-dataset chi-square terms and Bonferroni limits.
# This version does not run; see the notes below.
library(dplyr)
setwd("F:/GIS/July 2019/")
total_data <- read.csv("test.csv")
result_data <- NULL
# Process the rows of each Data_No separately and stack the results.
for(i in unique(total_data$Data_No)){
# NOTE(review): the next line holds two statements with nothing separating
# them, probably a line break lost when the post was copied; confirm.
# NOTE(review): inside mutate(), count() receives the Data_No column rather
# than a data frame, which looks like the source of the 'summarise_' error
# quoted below. Also, 0.05/2*k evaluates as (0.05/2)*k, not 0.05/(2*k).
data <- total_data[which(total_data$Data_No == i),] data <- data %>%
mutate(RelativeArea = Area/sum(Area), Expected = RelativeArea*sum(Observed), OminusE = Observed-Expected, O2 = OminusE^2, O2divE = O2/Expected, APU = Observed/sum(Observed), Alpha = 0.05/2*count(Data_No),
Zvalue = qnorm(Alpha,lower.tail=FALSE), lower = APU-Zvalue*sqrt(APU*(1-APU)/sum(Observed)), upper = APU+Zvalue*sqrt(APU*(1-APU)/sum(Observed)))
result_data <- rbind(result_data,data) }
write.csv(result_data,file='final_result.csv')
And the error message I get is:
Error in UseMethod("summarise_") : no applicable method for
'summarise_' applied to an object of class "c('integer', 'numeric')"
The column that I am calling "Alpha" is the alpha value 0.05/(2k), where k is the number of categories. In my example, I have 10 categories for the first dataset (rows with "Data_No" = 1), so "Alpha" needs to be 0.05/20 = 0.0025, and its corresponding Z value is 2.807. The second dataset has 3 categories (so 0.05/6) and the third has 2 categories (0.05/4) in my example table ("Data_No" column). Using the values from the newly calculated "Alpha" column, I then need to calculate the Zvalue column (Zvalue = qnorm(Alpha,lower.tail=FALSE)), which I then use to calculate the lower and upper confidence intervals.
From the above data, here are the results that I should get, but note that I have had to manually calculate Alpha column and Zvalue, rather than insert those calculations within the R code:
Data_No Area Observed RelativeArea Alpha Z value lower upper
1 3353 31 0.237 0.003 2.807 0.092 0.247
1 2297 2 0.163 0.003 2.807 -0.011 0.033
1 1590 15 0.113 0.003 2.807 0.025 0.139
1 1087 16 0.077 0.003 2.807 0.029 0.146
1 817 2 0.058 0.003 2.807 -0.011 0.033
1 847 10 0.060 0.003 2.807 0.007 0.102
1 1014 28 0.072 0.003 2.807 0.078 0.228
1 872 29 0.062 0.003 2.807 0.083 0.234
1 1026 29 0.073 0.003 2.807 0.083 0.234
1 1215 21 0.086 0.003 2.807 0.049 0.181
2 3353 31 0.463 0.008 2.394 0.481 0.811
2 2297 2 0.317 0.008 2.394 -0.027 0.111
2 1590 15 0.220 0.008 2.394 0.152 0.473
3 1087 16 0.571 0.013 2.241 0.723 1.055
3 817 2 0.429 0.013 2.241 -0.055 0.277
Please note that I only included some of the columns generated from the code.
# You need to check the closing bracket for lower c.f. sqrt value. Following code should work.
# Read the observations and append the chi-square terms plus the lower/upper
# confidence limits; the z value 2.394 is hard-coded in this version.
data <- read.csv("test.csv") %>%
  mutate(
    RelativeArea = Area / sum(Area),
    Expected = RelativeArea * sum(Observed),
    OminusE = Observed - Expected,
    O2 = OminusE^2,
    O2divE = O2 / Expected,
    APU = Observed / sum(Observed),
    lower = APU - 2.394 * sqrt(APU * (1 - APU) / sum(Observed)),
    upper = APU + 2.394 * sqrt(APU * (1 - APU) / sum(Observed))
  )
#Answer to follow-up question.
#Sample Data
Data_No Area Observed
1 3353 31
1 2297 2
2 1590 15
2 1087 16
#Code to run
total_data <- read.csv("test.csv")

# Compute the chi-square terms and confidence limits separately for every
# Data_No (in order of first appearance), giving one data frame per dataset.
# The z value 2.394 is hard-coded in this version.
per_dataset <- lapply(unique(total_data$Data_No), function(dataset_id) {
  dataset_rows <- total_data[which(total_data$Data_No == dataset_id), ]
  mutate(
    dataset_rows,
    RelativeArea = Area / sum(Area),
    Expected = RelativeArea * sum(Observed),
    OminusE = Observed - Expected,
    O2 = OminusE^2,
    O2divE = O2 / Expected,
    APU = Observed / sum(Observed),
    lower = APU - 2.394 * sqrt(APU * (1 - APU) / sum(Observed)),
    upper = APU + 2.394 * sqrt(APU * (1 - APU) / sum(Observed))
  )
})

# Stack the per-dataset pieces and write them out.
result_data <- do.call(rbind, per_dataset)
write.csv(result_data, file = "final_result.csv")
#Issue in calculating Alpha. I have updated the code.
library(dplyr)
# Working directory from the original post; adjust to where test.csv lives.
setwd("F:/GIS/July 2019/")
total_data <- read.csv("test.csv")

# Chi-square terms and Bonferroni confidence limits, computed separately for
# every dataset (Data_No) in order of first appearance.
per_dataset <- lapply(unique(total_data$Data_No), function(i) {
  data <- total_data[which(total_data$Data_No == i), ]
  data %>%
    mutate(
      # k = number of categories = number of rows in this dataset. Counting
      # the rows replaces the hand-entered 10 / 3 / 2 so any Data_No works.
      NO_OF_CATEGORIES = n(),
      RelativeArea = Area / sum(Area),
      Expected = RelativeArea * sum(Observed),
      OminusE = Observed - Expected,
      O2 = OminusE^2,
      O2divE = O2 / Expected,
      APU = Observed / sum(Observed),
      # Bonferroni-adjusted alpha 0.05 / (2k) and its upper-tail z value
      Alpha = 0.05 / (2 * NO_OF_CATEGORIES),
      Zvalue = qnorm(Alpha, lower.tail = FALSE),
      lower = APU - Zvalue * sqrt(APU * (1 - APU) / sum(Observed)),
      upper = APU + Zvalue * sqrt(APU * (1 - APU) / sum(Observed))
    )
})

# Bind once at the end rather than growing result_data inside a loop.
result_data <- do.call(rbind, per_dataset)
write.csv(result_data, file = "final_result.csv")

Create and plot a table which preserves the ordering of the factor

When creating and plotting a table the names are numeric values and I would like for them to stay in numeric order.
Code :
library(plyr)
set.seed(1234)

# Draw 1000 category letters (A to K) with unequal probabilities.
number_of_categories <- 11
probability_of_each_category <- c(
  0.1, 0.05, 0.05, 0.08, 0.01,
  0.1, 0.2, 0.3, 0.01, 0.02, 0.08
)
number_of_samples <- 1000
x <- sample(
  LETTERS[seq_len(number_of_categories)],
  size = number_of_samples,
  replace = TRUE,
  prob = probability_of_each_category
)

# A vector of zeros and ones, one per sample.
outcome <- rbinom(n = number_of_samples, size = 1, prob = 0.4)

# Recode the letters as 1, 2, ..., 11 so the table shows the ordering issue:
# the categories come out sorted as text (1, 10, 11, 2, ...).
x <- mapvalues(
  x,
  from = LETTERS[seq_len(number_of_categories)],
  to = seq_len(number_of_categories)
)

# The table shows the ordering.
prop.table(table(x))
plot(table(x, outcome))
Table :
> prop.table(table(x))
x
1 10 11 2 3 4 5 6 7 8 9
0.105 0.023 0.078 0.044 0.069 0.083 0.018 0.097 0.195 0.281 0.007
Plot :
I would like the plot and the table in the order
1 2 3 4 5 ... 10 11
Rather than
1 10 11 2 3 4 5 6 7 8 9
You can either convert x to numeric before feeding it to table
# Converting x to numeric first makes table() order the categories by number
# rather than as text.
plot(table(as.numeric(x), outcome))
Or order the table's rows by the as.numeric of the rownames
# Cross-tabulate, then put the rows in numeric order of their labels.
counts <- table(x, outcome)
numeric_row_order <- order(as.numeric(rownames(counts)))
counts <- counts[numeric_row_order, ]
plot(counts)
A simple way to solve this problem is to format the numbers to include a leading zero during mapvalues(), using sprintf().
# Zero-padded labels ("01" to "11") sort the same way as text and as numbers.
padded_labels <- sprintf("%02d", seq_len(number_of_categories))
x <- mapvalues(
  x,
  from = LETTERS[seq_len(number_of_categories)],
  to = padded_labels
)

# The table shows the ordering.
prop.table(table(x))
plot(table(x, outcome))
...and the output:
> prop.table(table(x))
x
01 02 03 04 05 06 07 08 09 10 11
0.104 0.067 0.038 0.073 0.019 0.112 0.191 0.291 0.011 0.019 0.075

R multiple line plot using a dataframe [duplicate]

This question already has answers here:
Plot multiple lines (data series) each with unique color in R
(10 answers)
Closed 6 years ago.
I can't seem to figure out how to create a graph for multiple line plots.
This is my dataframe:
topics before_event after_event current
1 1 0.057 0.044 0.064
2 2 0.059 0.055 0.052
3 3 0.058 0.037 0.044
4 4 0.036 0.055 0.044
5 5 0.075 0.064 0.066
6 6 0.047 0.045 0.045
7 7 0.043 0.043 0.041
8 8 0.042 0.041 0.046
9 9 0.049 0.046 0.039
10 10 0.043 0.060 0.045
11 11 0.054 0.054 0.062
12 12 0.065 0.056 0.068
13 13 0.042 0.045 0.048
14 14 0.067 0.054 0.055
15 15 0.049 0.052 0.053
The variables in the dataframe are all numeric vectors, example:
# Example column: a plain numeric vector holding the values 1 through 15.
topics <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15)
I know I should be using 'ggplot2' and 'reshape' but I can't seem to work out the correct code to represent topics on the x-axis, the scale 0-1 on the y-axis, and each var (before_event, after_event, current) as three individual lines.
Any help would be really appreciated!
We can use matplot from base R
# One line per event column plotted against the first column (topics).
# The line types are set explicitly and reused in the legend so each legend
# entry shows the same line style and colour as its series. Point symbols
# (pch) are not drawn with type = "l", so they are omitted here.
matplot(df1[, 1], df1[-1], type = "l", xlab = "topics", ylab = "event",
        col = 2:4, lty = 1:3)
legend("topright", legend = names(df1)[-1], lty = 1:3, col = 2:4)
we can use ggplot and geom_line
library(ggplot2)
# melt() below is not part of ggplot2; load the package that provides it
# (the older 'reshape' package offers an equivalent melt()).
library(reshape2)

# Example data: 15 topics with three randomly generated measurement columns.
topics <- seq(1, 15, 1)
before_event <- runif(15, min = 0.042, max = 0.070)
after_event <- runif(15, min = 0.040, max = 0.065)
current <- runif(15, min = 0.041, max = 0.066)

# Create a data frame from the above vectors.
df <- data.frame(topics, before_event, after_event, current)

# Melt the data frame using topics as id: one row per (topics, variable) pair
# with the measurement in `value`.
df.m <- melt(df, id.vars = "topics")

# Plot the lines using ggplot and geom_line, one coloured line per variable.
ggplot(data = df.m,
       aes(x = topics, y = value, group = variable, color = variable)) +
  geom_line(size = 2)

Resources