Essentially what I want to do is automate this for all my data:
plants_A<-plants_sorted[plants_sorted[, 'treatment']== 'A', ]
plants_A1<-plants_A[plants_A[, 'replicate']== '1', ]
lm(weight~time, data = plants_A1)
From 'plants' I want to make lm's for all treatment and replicate combinations.
I have also managed to split the data using:
plants_treat_repl <- split(plants, paste(plants$treatment, plants$replicate))
But I can't seem to make lm's from this split data.
# sample data
structure(list(treatment = structure(c(1L, 1L, 1L, 1L, 1L, 2L ), .Label = c("A",
"B", "C", "D"), class = "factor"), replicate = c(1, 2, 3, 4, 5, 1), time =
structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c("6", "8", "10", "12", "14",
"16"), class = "factor"), weight = c(2, 0, 0, 0, 0.5, 2.6), trtrep =
structure(c(1L, 5L, 9L, 13L, 17L, 2L), .Label = c("A.1", "B.1", "C.1", "D.1",
"A.2", "B.2", "C.2", "D.2", "A.3", "B.3", "C.3", "D.3", "A.4", "B.4", "C.4",
"D.4", "A.5", "B.5", "C.5", "D.5"), class = "factor")), row.names = c(NA, 6L),
class = "data.frame")
Let's transition this to a reproducible example using built-in data. (Feel free to share reproducible sample data in the question - dput(your_data[1:10, ]) is a great way to share data reproducibly, pick an appropriate subset.)
## split the data by group
mt_split = split(mtcars, mtcars$cyl)
## fit models to each group
mods = lapply(mt_split, lm, formula = mpg ~ wt)
## extract the coefficients from each model
lapply(mods, coef)
# $`4`
# (Intercept) wt
# 39.571196 -5.647025
#
# $`6`
# (Intercept) wt
# 28.408845 -2.780106
#
# $`8`
# (Intercept) wt
# 23.868029 -2.192438
Try lapply function:
lms <- lapply(split(plants, paste(plants$treatment,plants$replicate)),
function(x)lm(weight~time, data = x))
lms is a list of all posibles lm's.
You haven't told us exactly what doesn't work, but:
plants <- transform(plants, trtrep = interaction(treatment, replicate)
lme4::lmList(weight~time | trtrep, data=plants)
should work. You might get away with defining the replicate on the fly:
lme4::lmList(weight~time | interaction(treatment, replicate), data=plants)
Related
I'm trying to add significance annotations to an errorbar plot with a factor x-axis and dodged groups within each level of the x-axis. It is a similar but NOT identical use case to this
My base errorbar plot is:
library(ggplot2)
library(dplyr)
pres_prob_pd = structure(list(x = structure(c(1, 1, 1, 2, 2, 2, 3, 3, 3), labels = c(`1` = 1,
`2` = 2, `3` = 3)), predicted = c(0.571584427222816, 0.712630712634987,
0.156061969566517, 0.0162388386564817, 0.0371877245103279, 0.0165022541901018,
0.131528946944238, 0.35927812866896, 0.0708662221985375), std.error = c(0.355802875027348,
0.471253661425626, 0.457109887762665, 0.352871728451576, 0.442646879181155,
0.425913568532558, 0.376552208691762, 0.48178172708116, 0.451758041335245
), conf.low = c(0.399141779923204, 0.496138837620712, 0.0701919316506831,
0.00819832576725402, 0.0159620304815404, 0.00722904089045731,
0.0675129352870401, 0.17905347369819, 0.030504893442457), conf.high = c(0.728233665534388,
0.861980236164486, 0.311759350126477, 0.031911364587827, 0.0842227723261319,
0.0372248587668487, 0.240584344249407, 0.590437963881823, 0.156035177669385
), group = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain",
"neutral", "uncertain"), class = "factor"), group_col = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("certain", "neutral",
"uncertain"), class = "factor"), language = structure(c(2L, 2L,
2L, 1L, 1L, 1L, 3L, 3L, 3L), .Label = c("english", "dutch", "german"
), class = "factor"), top = c(0.861980236164486, 0.861980236164486,
0.861980236164486, 0.0842227723261319, 0.0842227723261319, 0.0842227723261319,
0.590437963881823, 0.590437963881823, 0.590437963881823)), row.names = c(NA,
-9L), groups = structure(list(language = structure(1:3, .Label = c("english",
"dutch", "german"), class = "factor"), .rows = structure(list(
4:6, 1:3, 7:9), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
#dodge
pd = position_dodge(.75)
#plot
p = ggplot(pres_prob_pd,aes(x=language,y=predicted,color=group,shape=group)) +
geom_point(position=pd,size=2) +
geom_errorbar(aes(ymax=conf.high,ymin=conf.low),width=.125,position=pd)
p
What I want to do is annotate the plot such that the contrasts between group within each level of language are annotated for significance. I've plotted points representing the relevant contrasts and (toy) sig. annotations as follows:
#bump function
f = function(x){
v = c()
bump=0.025
constant = 0
for(i in x){
v = c(v,i+constant+bump)
bump = bump + 0.075
}
v
}
#create contrasts
combs = data.frame(gtools::combinations(3, 2, v=c("certain", "neutral", "uncertain"), set=F, repeats.allowed=F)) %>%
mutate(contrast=c("cont_1","cont_2","cont_3"))
combs = rbind(combs %>% mutate(language = 'english'),
combs %>% mutate(language='dutch'),
combs %>% mutate(language = "german")) %>%
left_join(select(pres_prob_pd,language:top)%>%distinct(),by='language') %>%
group_by(language)
#long transform and calc y_pos
combs_long = mutate(combs,y_pos=f(top)) %>% gather(long, probability, X1:X2, factor_key=TRUE) %>% mutate(language=factor(language,levels=c("english","dutch","german"))) %>%
arrange(language,contrast)
#back to wide
combs_wide =combs_long %>% spread(long,probability)
combs_wide$p = rep(c('***',"*","ns"),3)
#plot
p +
geom_point(data=combs_long,
aes(x = language,
color=probability,
shape=probability,
y=y_pos),
inherit.aes = T,
position=pd,
size=2) +
geom_text(data=combs_wide,
aes(x=language,
label=p,
y=y_pos+.025,
group=X1),
color='black',
position=position_dodge(.75),
inherit.aes = F)
What I am failing to achieve is plotting a line connecting each of the contrasts of group within each level of language, as is standard when annotating significant group-wise differences. Any help much appreciated!
We have a survey that asks for 'select all that apply' so the result is a string inside quotes with the values separated by commas. i.e. "red, black,green"
There are other question about income so I have a factor with 'low, medium, high'
I want to be able to answer questions: What percent selected 'Red', then group that by income.
I can split the string with
'''df4 <- c("black,silver,green")'''
I can create a data frame with a timestamp and the split string with
'''t2 <- as.data.frame(c(df2[2],l2))'''
I am not able to understand how to do this for all rows at one time.
Here is a DPUT of the input:
structure(list(RespData = structure(1:2, .Label = c("1/20/2020",
"1/21/2020"), class = "factor"), CarColor = c("red,blue,green,yellow",
"black,silver,green")), row.names = c(NA, -2L), class = "data.frame")
and here is a DPUT of the desired output:
structure(list(RespData = structure(c(1L, 1L, 1L, 1L, 2L, 2L,
2L), .Label = c("1/20/2020", "1/21/2020"), class = "factor"),
Cars = structure(c(3L, 1L, 2L, 4L, 5L, 6L, 2L), .Label = c("blue",
"green", "red", "yellow", "black", "silver"), class = "factor")), row.names = c(NA,
-7L), class = "data.frame")
Example of Function:
MySplitFunc <- function(ListIn) {
# build an empty data frame and set the column names
x1.all <- ListIn[0,]
names(x1.all) <- c("ResponseTime", "Descriptive")
# for each row build the data and combine to growing list
for(x in 1:nrow(ListIn)) {
#print(x)
r1 <- ListIn[x,1]
c1 <- strsplit(ListIn[x,2],",")
x1 <- as.data.frame(c(r1,c1))
# set the names and combine to all
names(x1) <- c("ResponseTime", "Descriptive")
x1.all <- rbind(x1.all,x1)
}
# strip the whitespace
x1.all <- data.frame(lapply(x1.all, trimws), stringsAsFactors = TRUE)
return(x1.all)
}
I have a dataset with age as continuous and as a factor, sex as a factor and 4 groups.
structure(list(Age = c(9, 12, 16, 57), Age_1 = structure(c(2L,
3L, 3L, 7L), .Label = c("8", "1", "2", "3", "4", "5", "6", "7"
), class = "factor"), Sex = structure(c(2L, 1L, 2L, 1L), .Label = c("M",
"F", "U"), class = "factor"), N = structure(c(2L, 2L, 2L,
2L), .Label = c("0", "1"), class = "factor"), G = structure(c(1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), L_1 =
structure(c(1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), C_1 =
structure(c(1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), G_1 =
structure(c(1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), m = structure(c(1L,
1L, 1L, 1L), .Label = c("0", "1"), class = "factor"), A = c(1,
1, 1, 1)), row.names = c(NA, 4L), class = "data.frame")
I want to do logistic regression for each variable (Age, Age_1 and sex) for each of the groups (N, G, L_1, C_1, G_1, m).
eg.
mylogit <- glm(N ~ Sex, data = logistic_s, family = "binomial")
mylogit <- glm(N ~ Age, data = logistic_s, family = "binomial")
I am using gtsummary for combining the variables in a table.
library(gtsummary)
tbl_n <-
tbl_uvregression(
logistic_s[c("N", "Age", "sex", "Age_1")],
method = glm,
y = N,
method.args = list(family = binomial),
exponentiate = TRUE
)
tbl_n
This produces the output for one group (eg. N) with the variables Age, Age_1, Sex.
I want to repeat this with each of the groups (eg N, G, L_1 etc) and then combine the tables to make one combined table.
I am open to using different packages if there are other options that suit this better. I want to make a table that can be exported in word.
I agree that some reproducible code would be helpful. I am not 100% certain what kind of output you're looking to get. You want to build some univariate logistic regression models, separately for 2 or more groups?
If that is correct, here is one way to go about it:
I will use the trial data set in the gtsummary package as an example. I'll make the grouping variable treatment (trt).
library(gtsummary)
library(tidyverse)
trial_subset <-
trial %>%
select(trt, response, age, marker, grade)
We'll begin by constructing univariate regression tables stratified by trt using the tbl_uvregression() function from gtsummary package. They will be stored in a new column in the data frame called tbl_uv.
df_uv <-
trial_subset %>%
# group by trt, and nest data within group
group_by(trt) %>%
nest() %>%
# build univariate logistic regression models separately within grouping variable
mutate(
tbl_uv = map(
data,
~tbl_uvregression(
data = .x,
y = response,
method = glm,
method.args = list(family = binomial),
exponentiate = TRUE
)
)
)
#> # A tibble: 2 x 3
#> # Groups: trt [2]
#> trt data tbl_uv
#> <chr> <list> <list>
#> 1 Drug A <tibble [98 x 4]> <tbl_vrgr>
#> 2 Drug B <tibble [102 x 4]> <tbl_vrgr>
We can now use the tables saved in df_uv to merge them into a single table using the tbl_merge() function.
tbl_merge(
tbls = df_uv$tbl_uv, # list of the univariate logistic regression tables
tab_spanner = paste0("**", df_uv$trt, "**") # adding stars to bold the header
)
This produces the table below. I hope this is helpful!
I am trying to create this type of chart from the data on the left (arbitrary values for simplicity):
The goal is to plot variable X on the x-axis with the mean on the Y-axis and error bars equal to the standard error se.
The problem is that values 1-10 should be each be represented individually (blue curve), and that the values for A and B should be plotted on each of the 1-10 values (green and red line).
I can draw the curve if I manually save the data and manually copy the values for A and B to each value for X but this is not very time efficient. Is there a more elegant way to do this?
Thanks in advance!
EDIT: As suggested the code:
df <- structure(list(X = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 2L, 11L, 12L), .Label = c("1", "10", "2", "3", "4", "5",
"6", "7", "8", "9", "A", "B"), class = "factor"), mean = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 5.5, 6.5), sd = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), se = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("X", "mean", "sd", "se"), class = "data.frame", row.names = c(NA,-12L))
df<-as.data.frame(df)
df$X<-factor(df$X)
plot <- ggplot(df, aes(x=df$X, y=df$mean)) + geom_point() + geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1)
plot
Im afraid I don't know ggplot, but hopefully this is what you want (it might also aid others in understanding your question).
You want a ggplot with three lines,
1. df$X,df$mean
2. df$X,df$row_A_mean
3. df$X,df$row_B_mean
4. error bars of the SE column
df <- structure(list(X = structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 2L, 11L, 12L), .Label = c("1", "10", "2", "3", "4", "5",
"6", "7", "8", "9", "A", "B"), class = "factor"), mean = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 5.5, 6.5), sd = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), se = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("X", "mean", "sd", "se"), class = "data.frame", row.names = c(NA,-12L))
df<-as.data.frame(df)
df$X<-factor(df$X)
plot <- ggplot(df, aes(x=df$X, y=df$mean)) + geom_point() + geom_errorbar(aes(ymin=mean-se, ymax=mean+se), width=.1)
plot
#row A mean
df$row_A_mean<-rep(df[11,]$mean,nrow(df))# note that this could also be replaces by a horizontal line, unless the mean changes
#row A sd
df$row_A_sd<-rep(df[11,]$sd,nrow(df))
plot(as.numeric(df$X),df$mean,type="p",col="red")
lines(as.numeric(df$X),df$row_A_mean,col="green")
If we use a subset to define the data elements of the ggplot, we can come up with one solution using geom_hline:
theme_set(theme_bw())
ggplot(data = df[1:10,])+
geom_errorbar(aes(x = X, ymin = mean - se, ymax = mean + se))+
geom_point(aes(x = X, y = mean))+
geom_line(aes(x = X, y = mean), group = 1)+
geom_hline(data = df[11,], aes(yintercept = mean, colour = 'A'))+
geom_hline(data = df[12,], aes(yintercept = mean, colour = 'B'))
It's helpful to reorient your data into long form so that you can really utilize the aesthetic part of ggplot. Generally I would use reshape2::melt for this, but your data the way it's currently formatted doesn't really lend itself to it. I'll show you what I mean by long form and you can get the idea what we're shooting for:
#setting variables for your classes so it's a bit more scalable - reset as applicable
x.seriesLength <- 10
x.class.name <- "X" #name of the main series class; X in your example
a.vec <- c(5.5, 1, 1, "A")
b.vec <- c(6.5, 1, 1, "B")
#trimming df so we can reshape
df <- df[1:x.seriesLength, 2:4]
df$class <- x.class.name #adding class column
#converting your static A and B values to long form, sending to a data.frame and adding to df
add <- matrix(c(rep(a.vec, times = x.seriesLength),
rep(b.vec, times = x.seriesLength)),
byrow = T,
ncol = 4)
colnames(add) <- c("mean", "sd", "se", "class")
df <- rbind(df, add)
print(df)
Then we need to do a bit more cleaning:
df$rownum <- rep(1:x.seriesLength, times = 3)
df[,1:3] <- sapply(df[,1:3], as.numeric) #casting as numeric
df$barmin <- df$mean - df$sd
df$barmax <- df$mean + df$sd
Now we have a long form data frame with the required data. We can then use the new class column to plot and color multiple series.
#use class column to tell ggplot which points belong to which series
g <- ggplot(data = df) +
geom_point(aes(x = rownum, y = mean, color = class)) +
geom_errorbar(aes(x = rownum, ymin=barmin, ymax=barmax, color = class), width=.1)
g
Edit: If you want lines instead of points, just replace geom_point with geom_line.
My data looks something like this:
example <- structure(list(ID = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L), .Label = c("A1", "A2", "A3"), class = "factor"), y = c(44.1160205053166,
33.0574407376116, 50.5295183433918, 44.1160205053166, 33.0574407376116,
50.5295183433918, 44.1160205053166, 33.0574407376116, 50.5295183433918
), day = structure(c(1392647220, 1392733620, 1392820020, 1392647220,
1392733620, 1392820020, 1392647220, 1392733620, 1392820020), class = c("POSIXct",
"POSIXt"), tzone = ""), P = c(16.345885329647, 6.21615618292708,
9.89848991157487, 14.4955473870505, 8.47820783441421, 2.36668747442309,
10.4325918923132, 9.26802998466883, 14.8380589560838), o = c(25.6364896567538,
10.5067015672103, 12.0306829502806, 25.6364896567538, 10.5067015672103,
12.0306829502806, 25.6364896567538, 10.5067015672103, 12.0306829502806
)), .Names = c("ID", "y", "day", "P", "x"), row.names = c(NA,
-9L), class = "data.frame")
I want to ran a regression of Y on P on day 1, 2, and 3. That is
y ~ p[1] + p[2] + p[3] + x
What is the best way of doing this? Do I need to create a new data frame with these variables before running lm? Or there is a better way?
Thanks!
Use substet parameter in lm function
lm(Y ~ P, data=df, subset=df$P %in% 1:3)