I wanted to loop a piece of code that produces graph by ggplot2. My data set looks like the example "prot", juts prot is one accession and in the original data I have many more accessions.
For single accession it looks nice. Just with the looping, I thought that it will place one iteration per one page in .pdf
but it doesn't. This single plot already creates combined plots, so I don't know now how and where to place facet_wrap or facet_grid?
or maybe there is other solution?
help, please help.
'library(ggplot2)
ggplot(prot, aes(factor(genotype), value, fill = Light)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1")
'> prot
Accession genotype variable value Light
966 AT1G01050 WT ML_WT_Dejan_05 219971.1 ML
2828 AT1G01050 WT ML_WT_Dejan_06 286308.6 ML
4690 AT1G01050 WT ML_WT_Dejan_14 1177873.5 ML
6552 AT1G01050 m ML_m_Dejan_08 861982.0 ML
8414 AT1G01050 m ML_m_Dejan_10 3786163.0 ML
10276 AT1G01050 m ML_m_Dejan_11 1289267.7 ML
12138 AT1G01050 f ML_f_Dejan_01 400419.3 ML
14000 AT1G01050 f ML_f_Dejan_04 929297.2 ML
15862 AT1G01050 f ML_f_Dejan_09 12245991.9 ML
17724 AT1G01050 ntrc ML_ntrc_Dejan_02 785773.5 ML
19586 AT1G01050 ntrc ML_ntrc_Dejan_03 971133.1 ML
21448 AT1G01050 ntrc ML_ntrc_dejan7 592207.0 ML
23310 AT1G01050 ntrc ML_ntrc_Dejan_12R 347127.5 ML
25204 AT1G01050 WT FL_WT_Dejan_20 131817.0 FL
27134 AT1G01050 WT FL_WT_Dejan_39 560424.7 FL
29064 AT1G01050 WT FL_WT_Dejan_33 9304183.7 FL
30994 AT1G01050 WT FL_WT_Dejan_34 647452.4 FL
32924 AT1G01050 m FL_m_Dejan_21 712381.5 FL
34854 AT1G01050 m FL_m_Dejan_26 6089158.8 FL
36784 AT1G01050 m FL_m_Dejan_28 11341334.1 FL
38714 AT1G01050 f FL_f_Dejan_19 13140258.2 FL
40644 AT1G01050 f FL_f_Dejan_31 11256554.9 FL
42574 AT1G01050 f FL_f_Dejan_35 1621509.9 FL
44504 AT1G01050 f FL_f_Dejan37 392228.2 FL
46434 AT1G01050 ntrc FL_ntrc_Dejan_30 9069074.8 FL
48364 AT1G01050 ntrc FL_ntrc_Dejan_38 562403.6 FL
50294 AT1G01050 ntrc FL_ntrc_Dejan29 175258.6 FL
79347 AT1G01050 WT LL_WT_Dejan_41 2443625.6 LL
81783 AT1G01050 WT LL_WT_Dejan_43 8529143.7 LL
84219 AT1G01050 WT LL_WT_Dejan_49 11054552.6 LL
86655 AT1G01050 m LL_m_Dejan_44 14325152.0 LL
89091 AT1G01050 m LL_m_Dejan_45 13114486.4 LL
91527 AT1G01050 m LL_m_Dejan_54 8250430.1 LL
93963 AT1G01050 f LL_f_Dejan_47 12431354.5 LL
96399 AT1G01050 f LL_f_Dejan_48 11884118.5 LL
98835 AT1G01050 f LL_f_Dejan_53 8408509.1 LL
101271 AT1G01050 ntrc LL_ntrc_Dejan_46 12214783.1 LL
103707 AT1G01050 ntrc LL_ntrc_Dejan_50 1286828.3 LL
106143 AT1G01050 ntrc LL_ntrc_Dejan_42 1819043.9 LL
plots<- list()
pdf("TEST_boxplot.pdf")
IDs<-unique(prot$Accession)
for (i in 1:length(IDs)){
temp <- prot[(prot$Accession)==IDs[i],]
p<- ggplot(temp, aes(factor(genotype), value, fill = Light)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1")+
ggtitle(as.character(i))
plots[[i]] <- p
#plots[[paste(i)]] = p
#multiplot(plotlist = plots, cols = 1)
}
dev.off()
I generated a prot toy dataset with two levels for Accession.
The code below prints 2 graphs on two pages of the TEST_boxplot.pdf file.
Here is the file generated.
library(ggplot2)
prot1 <- read.table(text="
n Accession genotype variable value Light
966 AT1G01050 WT ML_WT_Dejan_05 219971.1 ML
2828 AT1G01050 WT ML_WT_Dejan_06 286308.6 ML
4690 AT1G01050 WT ML_WT_Dejan_14 1177873.5 ML
6552 AT1G01050 m ML_m_Dejan_08 861982.0 ML
8414 AT1G01050 m ML_m_Dejan_10 3786163.0 ML
10276 AT1G01050 m ML_m_Dejan_11 1289267.7 ML
12138 AT1G01050 f ML_f_Dejan_01 400419.3 ML
14000 AT1G01050 f ML_f_Dejan_04 929297.2 ML
15862 AT1G01050 f ML_f_Dejan_09 12245991.9 ML
17724 AT1G01050 ntrc ML_ntrc_Dejan_02 785773.5 ML
19586 AT1G01050 ntrc ML_ntrc_Dejan_03 971133.1 ML
21448 AT1G01050 ntrc ML_ntrc_dejan7 592207.0 ML
23310 AT1G01050 ntrc ML_ntrc_Dejan_12R 347127.5 ML
25204 AT1G01050 WT FL_WT_Dejan_20 131817.0 FL
27134 AT1G01050 WT FL_WT_Dejan_39 560424.7 FL
29064 AT1G01050 WT FL_WT_Dejan_33 9304183.7 FL
30994 AT1G01050 WT FL_WT_Dejan_34 647452.4 FL
32924 AT1G01050 m FL_m_Dejan_21 712381.5 FL
34854 AT1G01050 m FL_m_Dejan_26 6089158.8 FL
36784 AT1G01050 m FL_m_Dejan_28 11341334.1 FL
38714 AT1G01050 f FL_f_Dejan_19 13140258.2 FL
40644 AT1G01050 f FL_f_Dejan_31 11256554.9 FL
42574 AT1G01050 f FL_f_Dejan_35 1621509.9 FL
44504 AT1G01050 f FL_f_Dejan37 392228.2 FL
46434 AT1G01050 ntrc FL_ntrc_Dejan_30 9069074.8 FL
48364 AT1G01050 ntrc FL_ntrc_Dejan_38 562403.6 FL
50294 AT1G01050 ntrc FL_ntrc_Dejan29 175258.6 FL
79347 AT1G01050 WT LL_WT_Dejan_41 2443625.6 LL
81783 AT1G01050 WT LL_WT_Dejan_43 8529143.7 LL
84219 AT1G01050 WT LL_WT_Dejan_49 11054552.6 LL
86655 AT1G01050 m LL_m_Dejan_44 14325152.0 LL
89091 AT1G01050 m LL_m_Dejan_45 13114486.4 LL
91527 AT1G01050 m LL_m_Dejan_54 8250430.1 LL
93963 AT1G01050 f LL_f_Dejan_47 12431354.5 LL
96399 AT1G01050 f LL_f_Dejan_48 11884118.5 LL
98835 AT1G01050 f LL_f_Dejan_53 8408509.1 LL
101271 AT1G01050 ntrc LL_ntrc_Dejan_46 12214783.1 LL
103707 AT1G01050 ntrc LL_ntrc_Dejan_50 1286828.3 LL
106143 AT1G01050 ntrc LL_ntrc_Dejan_42 1819043.9 LL
", header=T)
prot2 <- prot1
prot2$Accession <- "AT3G53620"
prot <- rbind(prot1,prot2)
plots <- list()
pdf("TEST_boxplot.pdf", onefile=T)
IDs<-unique(prot$Accession)
for (i in 1:length(IDs)){
temp <- prot[(prot$Accession)==IDs[i],]
p<- ggplot(temp, aes(factor(genotype), value, fill = Light)) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1")+
ggtitle(as.character(i))
plots[[i]] <- p
print(p)
}
dev.off()
Related
I have a modelsummary of three fixed effects regressions like so:
remotes::install_github("lrberge/fixest")
remotes::install_github("vincentarelbundock/modelsummary")
library(fixest)
library(modelsummary)
mod1 <- feols(mpg ~ hp | cyl, data = mtcars)
mod2 <- feols(mpg ~ wt | cyl, data = mtcars)
mod3 <- feols(mpg ~ drat | cyl, data = mtcars)
modelsummary(list(mod1, mod2, mod3), output = "markdown")
Model 1
Model 2
Model 3
hp
-0.024
(0.015)
wt
-3.206
(1.188)
drat
1.793
(1.564)
Num.Obs.
32
32
32
R2
0.754
0.837
0.745
R2 Adj.
0.727
0.820
0.718
R2 Within
0.080
0.392
0.048
R2 Within Adj.
0.047
0.371
0.014
AIC
167.9
154.6
169.0
BIC
173.8
160.5
174.9
RMSE
2.94
2.39
2.99
Std.Errors
by: cyl
by: cyl
by: cyl
FE: cyl
X
X
X
Instead of having the table show merely whether certain fixed effects were present, is it possible to show the number of fixed effects that were estimated instead?
The raw models do contain this information:
> mod1
OLS estimation, Dep. Var.: mpg
Observations: 32
Fixed-effects: cyl: 3
Standard-errors: Clustered (cyl)
Estimate Std. Error t value Pr(>|t|)
hp -0.024039 0.015344 -1.56664 0.25771
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RMSE: 2.94304 Adj. R2: 0.727485
Within R2: 0.07998
Yes, you’ll need to define a glance_custom.fixest() method. See this section of the docs for detailed instructions and many examples:
https://vincentarelbundock.github.io/modelsummary/articles/modelsummary.html#customizing-existing-models-part-i
And here’s an example with fixest:
library(fixest)
library(tibble)
library(modelsummary)
models <- list(
feols(mpg ~ hp | cyl, data = mtcars),
feols(mpg ~ hp | am, data = mtcars),
feols(mpg ~ hp | cyl + am, data = mtcars)
)
glance_custom.fixest <- function(x, ...) {
tibble::tibble(`# FE` = paste(x$fixef_sizes, collapse = " + "))
}
modelsummary(models, gof_map = c("nobs", "# FE"))
(1)
(2)
(3)
hp
-0.024
-0.059
-0.044
(0.015)
(0.000)
(0.016)
Num.Obs.
32
32
32
# FE
3
2
3 + 2
I have seen several examples how it might be possible to select desired pairwise comparisons, but unfortunately do not know how to apply that to my data.
Here is my abbreviated data set: https://www.dropbox.com/s/x9xjc9o0222rg0w/df.csv?dl=0
# FIXED effects: age and brain_area
df$age <- factor(df$age)
df$brain_area <- factor(df$brain_area)
# RANDOM effects: subject_ID and section
df$subject_ID <- factor(df$subject_ID)
df$section <- factor(df$section)
# dependent variable: DV
# ___________________ mixed TWO-way ANOVA
require(lme4)
require(lmerTest)
require(emmeans)
model = lmer(DV ~ age * brain_area + (1 | subject_ID), data = df)
anova(model) # significant interaction and both main effects
# ____________________ ALL pairwise comparisons
emmeans(model, pairwise~brain_area|age, adj='fdr')
# ____________________ I marked below comparisons that I would like to exclude (but keep all others)
$contrasts
age = old:
contrast estimate SE df t.ratio p.value
a - b 0.0412 0.0158 174 2.603 0.0125
a - c -0.0566 0.0158 174 -3.572 0.0007
a - control 0.3758 0.0158 174 23.736 <.0001 # exclude
a - d -0.0187 0.0158 174 -1.182 0.2387
b - c -0.0978 0.0158 174 -6.175 <.0001
b - control 0.3346 0.0158 174 21.132 <.0001 # exclude
b - d -0.0599 0.0158 174 -3.786 0.0004
c - control 0.4324 0.0158 174 27.308 <.0001
c - d 0.0378 0.0158 174 2.389 0.0199
control - d -0.3946 0.0158 174 -24.918 <.0001 # exclude
age = young:
contrast estimate SE df t.ratio p.value
a - b 0.0449 0.0147 174 3.063 0.0032
a - c -0.0455 0.0147 174 -3.105 0.0032
a - control 0.2594 0.0147 174 17.694 <.0001 # exclude
a - d 0.0202 0.0147 174 1.377 0.1702
b - c -0.0904 0.0147 174 -6.169 <.0001
b - control 0.2145 0.0147 174 14.631 <.0001 # exclude
b - d -0.0247 0.0147 174 -1.686 0.1040
c - control 0.3049 0.0147 174 20.799 <.0001
c - d 0.0657 0.0147 174 4.483 <.0001
control - d -0.2392 0.0147 174 -16.317 <.0001 # exclude
# ____________________ The line below seems to work BUT completely excludes 'control' level from factor 'brain_area'. I do not wish to completely exclude it...
emmeans(model, specs=pairwise~brain_area| age,
at = list(brain_area = c("a", "b", "c", "d")), adj='fdr' )
You need to provide the contrast coefficients manually. In this case, it's fairly simple to obtain all of them, then remove the ones you don't want; something like this:
EMM <- emmeans(model, ~ brain_area | age)
EMM # show the means
coef <- emmeans:::pairwise.emmc(levels(EMM)[["brain_area"]])
coef <- coef[-c(3, 6, 10)]
contrast(EMM, coef, adjust = "fdr")
I was trying to run a regression models on multiple subgroups of a dataframe using purrr::map_dfr(), but somehow I get this somewhat weird error.
library(dplyr)
library(purrr)
# Create some data
test_df = map_dfr(seq_len(5), ~mtcars, .id = 'group')
# Run regression on subgroups
map_dfr(seq_len(5),
~ function(.x){
glm(am ~ mpg + cyl + disp + hp + drat + wt + qsec + vs + gear + carb,
family = binomial,
data = test_df[group == .x,]) %>%
coefficients()
},
.id = 'group')
Error: Argument 1 must be a data frame or a named atomic vector.
Run `rlang::last_error()` to see where the error occurred.
Any suggestion will be appreciated.
If we are using function(x), there is no need for ~ or viceversa. It is a lambda function compact syntax in tidyverse
map_dfr(seq_len(5),
~ {
glm(am ~ mpg + cyl + disp + hp + drat + wt + qsec + vs + gear + carb,
family = binomial,
data = test_df[test_df$group == .x,]) %>%
coefficients()
},
.id = 'group')
-output
# A tibble: 5 x 12
group `(Intercept)` mpg cyl disp hp drat wt qsec vs gear carb
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 -11.6 -0.881 2.53 -0.416 0.344 23.2 7.44 -7.58 -47.0 42.9 -21.6
2 2 -11.6 -0.881 2.53 -0.416 0.344 23.2 7.44 -7.58 -47.0 42.9 -21.6
3 3 -11.6 -0.881 2.53 -0.416 0.344 23.2 7.44 -7.58 -47.0 42.9 -21.6
4 4 -11.6 -0.881 2.53 -0.416 0.344 23.2 7.44 -7.58 -47.0 42.9 -21.6
5 5 -11.6 -0.881 2.53 -0.416 0.344 23.2 7.44 -7.58 -47.0 42.9 -21.6
NOTE: output is the same as the input example was using the same data
To create the data frame:
num <- sample(1:25, 20)
x <- data.frame("Day_eclosion" = num, "Developmental" = c("AP", "MA",
"JU", "L"), "Replicate" = 1:5)
model <- glmer(Day_eclosion ~ Developmental + (1 | Replicate), family =
"poisson", data= x)
I get this return from:
a <- lsmeans(model, pairwise~Developmental, adjust = "tukey")
a$contrasts
contrast estimate SE df z.ratio p.value
AP - JU 0.2051 0.0168 Inf 12.172 <.0001
AP - L 0.3009 0.0212 Inf 14.164 <.0001
AP - MA 0.3889 0.0209 Inf 18.631 <.0001
JU - L 0.0958 0.0182 Inf 5.265 <.0001
JU - MA 0.1839 0.0177 Inf 10.387 <.0001
L - MA 0.0881 0.0222 Inf 3.964 0.0004
I am looking for a simple way to turn this output (just p values) into:
AP MA JU L
AP - <.0001 <.0001 <.0001
MA - - <.0001 0.0004
JU - - - <.0001
L - - -
I have about 20 sets of these that I need to turn into tables, so the simpler and more general the better.
Bonus points if the output is tab-deliminated, etc, so that I can easily paste into word/excel.
Thanks!
Here's a function that works...
pvmat = function(emm, ...) {
emm = update(emm, by = NULL) # need to work harder otherwise
pv = test(pairs(emm, reverse = TRUE, ...)) $ p.value
fmtpv = sprintf("%6.4f", pv)
fmtpv[pv < 0.0001] = "<.0001"
lbls = do.call(paste, emm#grid[emm#misc$pri.vars])
n = length(lbls)
mat = matrix("", nrow = n, ncol = n, dimnames = list(lbls, lbls))
mat[upper.tri(mat)] = fmtpv
idx = seq_len(n - 1)
mat[idx, 1 + idx] # trim off last row and 1st col
}
Illustration:
require(emmeans)
> warp.lm = lm(breaks ~ wool * tension, data = warpbreaks)
> warp.emm = emmeans(warp.lm, ~ wool * tension)
> warp.emm
wool tension emmean SE df lower.CL upper.CL
A L 44.6 3.65 48 37.2 51.9
B L 28.2 3.65 48 20.9 35.6
A M 24.0 3.65 48 16.7 31.3
B M 28.8 3.65 48 21.4 36.1
A H 24.6 3.65 48 17.2 31.9
B H 18.8 3.65 48 11.4 26.1
Confidence level used: 0.95
> pm = pvmat(warp.emm, adjust = "none")
> print(pm, quote=FALSE)
B L A M B M A H B H
A L 0.0027 0.0002 0.0036 0.0003 <.0001
B L 0.4170 0.9147 0.4805 0.0733
A M 0.3589 0.9147 0.3163
B M 0.4170 0.0584
A H 0.2682
Notes
As provided, this does not support by variables. Accordingly, the first line of the function disables them.
Using pairs(..., reverse = TRUE) generates the P values in the correct order needed later for upper.tri()
you can pass arguments to test() via ...
To create a tab-delimited version, use the clipr package:
clipr::write_clip(pm)
What you need is now in the clipboard and ready to paste into a spreadsheet.
Addendum
Answering this question inspired me to add a new function pwpm() to the emmeans package. It will appear in the next CRAN release, and is available now from the github site. It displays means and differences as well as P values; but the user may select which to include.
> pwpm(warp.emm)
wool = A
L M H
L [44.6] 0.0007 0.0009
M 20.556 [24.0] 0.9936
H 20.000 -0.556 [24.6]
wool = B
L M H
L [28.2] 0.9936 0.1704
M -0.556 [28.8] 0.1389
H 9.444 10.000 [18.8]
Row and column labels: tension
Upper triangle: P values adjust = “tukey”
Diagonal: [Estimates] (emmean)
Upper triangle: Comparisons (estimate) earlier vs. later
Dear Stackoverflow Users,
here is an example table, simillar to what I have, only mine has over 1000 protein and here I've placed 2:
`#for stack overflow#
Accession <- rep(c("AT1G01320.1", "AT1G01050.1"), each =14)
Description<- rep(c("protein1", "protein2"), each = 14)
genotype <- c("WT", "WT","WT", "WT", "m", "m", "m", "f", "f", "f", "f", "ntrc", "ntrc", "ntrc")
genotype <- c("WT", "WT","WT", "WT", "m", "m", "m", "f", "f", "f", "f", "ntrc", "ntrc", "ntrc")
variable <- c("WT1", "WT2","WT3", "WT4", "m1", "m2", "m3", "f1", "f2", "f3", "f4", "ntrc1", "ntrc2", "ntrc3", "WT1", "WT2","WT3", "WT4", "m1", "m2", "m3", "f1", "f2", "f3", "f4", "ntrc1", "ntrc2", "ntrc3")
value <- c(5535705, 8034106, 4879639, 6817736, 23109581, 3778870, 6020611, 4480108, 6131362, 4210275, 27630841, 4702864,2966520, 9065916, 151903.67, 417423.81, 2895121.80, 810620.92, 822284.83, 6477122.14, 12266704.79, 11196940.77, 12143974.82, 1040832.60, 136497.86, 9294097.54, 506386.62, 32266.71)
prot<- data.frame(Accession, Description, genotype, variable, value)
> prot
Accession Description genotype variable value
1 AT1G01320.1 protein1 WT WT1 5535705.00
2 AT1G01320.1 protein1 WT WT2 8034106.00
3 AT1G01320.1 protein1 WT WT3 4879639.00
4 AT1G01320.1 protein1 WT WT4 6817736.00
5 AT1G01320.1 protein1 m m1 23109581.00
6 AT1G01320.1 protein1 m m2 3778870.00
7 AT1G01320.1 protein1 m m3 6020611.00
8 AT1G01320.1 protein1 f f1 4480108.00
9 AT1G01320.1 protein1 f f2 6131362.00
10 AT1G01320.1 protein1 f f3 4210275.00
11 AT1G01320.1 protein1 f f4 27630841.00
12 AT1G01320.1 protein1 ntrc ntrc1 4702864.00
13 AT1G01320.1 protein1 ntrc ntrc2 2966520.00
14 AT1G01320.1 protein1 ntrc ntrc3 9065916.00
15 AT1G01050.1 protein2 WT WT1 151903.67
16 AT1G01050.1 protein2 WT WT2 417423.81
17 AT1G01050.1 protein2 WT WT3 2895121.80
18 AT1G01050.1 protein2 WT WT4 810620.92
19 AT1G01050.1 protein2 m m1 822284.83
20 AT1G01050.1 protein2 m m2 6477122.14
21 AT1G01050.1 protein2 m m3 12266704.79
22 AT1G01050.1 protein2 f f1 11196940.77
23 AT1G01050.1 protein2 f f2 12143974.82
24 AT1G01050.1 protein2 f f3 1040832.60
25 AT1G01050.1 protein2 f f4 136497.86
26 AT1G01050.1 protein2 ntrc ntrc1 9294097.54
27 AT1G01050.1 protein2 ntrc ntrc2 506386.62
28 AT1G01050.1 protein2 ntrc ntrc3 32266.71
>
I want to write a loop that will first subset the original data frame containing >1000 entries into subsets based on single protein ID, than do one way ANOVA and Tukeys HSD, get p adj from Tukeys, than print it into pdf.
so far I have:
`IDs<-unique((prot$Accession))
tukey_fullAA <- list()
table_fullAA <- NULL
for (i in 1:length(IDs)){
temp <- prot[(prot$Accession)==IDs[i],]
AV<- summary(aov(temp$value ~ temp$genotype))
tukey_fullAA <- list(TukeyHSD(aov(temp$value ~ temp$genotype)))
}
for(j in 1:length(tukey_fullAA))## important loop over whole list
{
tukey <- tukey_fullAA[[j]]
factor_table <- unlist(lapply(tukey, function(x) nrow(x)))
factor_table <- rep(names(factor_table), factor_table)
tukey_bound <- NULL
for (k in 1:length(tukey))
{
tukey_bound <- rbind(tukey_bound, tukey[[k]])
}
pairs <- rownames(tukey_bound)
rownames(tukey_bound) <- NULL
tukey_bound <- as.data.frame(tukey_bound)
tukey_bound$parameter <- factor_table
tukey_bound$pairs <- pairs
table_fullAA <- rbind(table_fullAA, tukey_bound)
}
as it is now it doesn't loop, I struggled to get the Tukey HSD into table, When I have it I want to find the significant values p adj, I get also confused about adding a column that would say what protein are these values for. I imagine it as a column first column containing one name for as many rows as it needs for out put.
thanks a lot!