Optimizing degrees of freedom in spline regression - r

I have two gene-expression time-course data sets:
First, gene expression was measured over 14 time points from 4 groups:
df1 <- structure(list(val = c(-0.1, -0.13, -0.4, -0.3, -0.3, -0.2, -0.24,
0.1, 0.2, 0.13, 0, 0.63, 0.83, 0.85, -0.07, -0.07, -0.27, -0.2,
-0.2, -0.1, 0.2, 0.1, 0.07, 0.17, 0.6, 0.75, 1.1, 1.1, -0.13,
-0.15, -0.26, -0.25, -0.14, 0.04, 0.2, 0.24, 0.23, 0.2, 0.1,
0.73, 1, 1.3, 0, 0.06, -0.24, -0.17, -0.17, -0.04, 0.16, 0.1,
0.14, 0.27, 0.34, 0.9, 0.97, 1.04),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,4L),
.Label = c("a", "b", "c", "d"), class = "factor")), .Names = c("val","time", "group"),
row.names = c(NA, -56L), class = "data.frame")
df1$group <- factor(df1$group,levels=c("a","b","c","d"))
which looks like this (adding a loess smoothed trend line):
library(ggplot2)
# Scatter plot of val against time, coloured by group, with a smoothed trend
# line per group and no confidence band. `FALSE` is spelled out because `F`
# is an ordinary variable that can be reassigned.
ggplot(df1, aes(x = time, y = val, color = group)) +
  geom_point() +
  theme_minimal() +
  geom_smooth(se = FALSE) +
  theme(legend.position = "top", legend.title = element_blank())
Second, gene expression was measured over similar 14 time points but now from 2 different groups, each represented by the two sexes:
df2 <- structure(list(val = c(-0.23, -0.01, -0.14, -0.01, -0.21, -0.16,
-0.24, -0.11, 0.02, -0.11, -0.01, -0.25, -0.47, -1.25, 0.02,
-0.3, -0.02, 0.14, 0.25, -0.05, 0.15, 0.11, -0.24, -0.18, -0.39,
-0.49, -0.5, -0.65, -0.06, 0.09, 0.1, 0.15, 0.08, 0.15, 0.4,
0.24, 0.07, 0.08, -0.18, -0.35, -0.19, -0.81, -0.16, 0.29, -0.05,
0.14, 0.14, 0.48, 0.34, 0.11, -0.07, -0.13, -0.41, -0.22, -0.54,
-0.76, 0.35, 0.34, -0.06, 0.21, 0.14, 0.14, 0.25, 0.22, 0.25,
0.16, 0.3, 0.44, 0.08, 0.48, 0.1, 0.16, -0.03, -0.22, 0.2, 0.01,
-0.09, -0.02, -0.01, 0.06, -0.13, 0.19, 0.11, -0.04, -0.39, 0.03,
-0.01, 0.09, 0.1, -0.14, -0.12, -0.1, 0.36, 0.08, 0.09, 0.09,
0.42, 0.37, -0.14, 0.12, 0.09, 0.03, 0.06, -0.25, 0.2, -0.06,
-0.44, 0.23, 0.03, 0.16, 0.81, 0.83),
time = c(-1, 0, 1, 1.58,2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0,1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17,7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58,6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58,5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17,4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39,
-1, 0, 1, 1.58, 2, 2.58, 3, 3.32, 3.58, 4.17, 4.58, 5.58, 6.17, 7.39),
sex = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
.Label = c("F", "M"), class = "factor"), group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
.Label = c("a", "b"), class = "factor")), .Names = c("val", "time", "sex", "group"), row.names = c(NA, -112L), class = "data.frame")
df2$sex <- ordered(df2$sex,levels=c("M","F"))
df2$group <- ordered(df2$group,levels=c("a","b"))
df2$col <- factor(paste0(df2$group,":",df2$sex))
which looks like this (adding a loess smoothed trend line):
# Scatter plot of val against time, coloured by the group:sex label in `col`,
# with a smoothed trend line per label and no confidence band. `FALSE` is
# spelled out because `F` is an ordinary variable that can be reassigned.
ggplot(df2, aes(x = time, y = val, color = col)) +
  geom_point() +
  theme_minimal() +
  geom_smooth(se = FALSE) +
  theme(legend.position = "top", legend.title = element_blank())
For df1, I would like to estimate the effect of time on val, adjusting for group.
For df2, I would like to estimate the effect of time:group on val, adjusting for sex.
Looking at the data I thought using spline regressions would be appropriate so I used the gam function from the mgcv package, which as far as I understand optimizes the degrees of freedom of the splines fitted to the data.
This is what I fitted for df1:
mgcv1.fit <- mgcv::gam(val ~ group+s(time),data=df1)
Which gives:
Family: gaussian
Link function: identity
Formula:
val ~ group + s(time)
Estimated degrees of freedom:
7.18 total = 11.18
GCV score: 0.01258176
But 7.18 degrees of freedom seems too much for these data.
For df2:
mgcv2.fit <- mgcv::gam(val ~ sex+s(time,by=group),data=df2)
which gives:
Family: gaussian
Link function: identity
Formula:
val ~ sex + s(time, by = group)
Estimated degrees of freedom:
1.72 total = 3.72
GCV score: 0.08522094
I guess that in this case I'd imagine the degrees of freedom to be slightly higher.
One more point. Plotting the fitted values for these two data sets:
# Store the fitted values of the first model alongside the data (via the
# fitted() accessor rather than reaching into the model object), then plot
# them against time, coloured by group, with a smoothed trend line.
df1$mgcv <- fitted(mgcv1.fit)
ggplot(df1, aes(x = time, y = mgcv, color = group)) +
  geom_point() +
  theme_minimal() +
  geom_smooth(se = FALSE) +
  theme(legend.position = "top", legend.title = element_blank())
which looks fine.
But for df2
# Store the fitted values of the second model alongside the data (via the
# fitted() accessor rather than reaching into the model object), then plot
# them against time, coloured by the group:sex label in `col`.
df2$mgcv <- fitted(mgcv2.fit)
ggplot(df2, aes(x = time, y = mgcv, color = col)) +
  geom_point() +
  theme_minimal() +
  geom_smooth(se = FALSE) +
  theme(legend.position = "top", legend.title = element_blank())
Looks like it flipped the group labels.
So my questions are:
Am I using mgcv::gam correctly for optimizing the spline degrees of freedom for my questions?
Does mgcv reorder the samples in its fitted.values?

First of all, mgcv does the right thing on the factor levels. If you check str(df2$sex), you will see that "M" (male) is the first level and "F" (female) is the second. But it seems from str(df2$col) that "F" is the first, so you get some mislabeling when making the plot.
Secondly, your second model has not been specified correctly.
The spline s(time) is under a centering constraint when there is no "by" variable, or when the "by" variable is a factor. So you need to provide your "by" variable group as a separate term in your model formula to capture its marginal effect;
Since the "by" variable group is an ordered variable, mgcv applies contrasts on it, dropping the first level "a" when constructing the s(time, by = group). So you need to provide a separate s(time) as the baseline smooth.
Your current mgcv2.fit is a rather poor model (not surprising), giving an explained deviance of 9%. But if you do the following you get 64%.
gam(val ~ sex + s(time) + group + s(time, by = group), data = df2, method = "REML")
The ggplot now looks right (I haven't changed df2$col so the coloring is still probably reversed).
gam defaults to use "GCV.Cp" as smoothing parameter selection method. But it is recommended to use "REML" as it is less prone to overfitting.
Remark 1
If the "by" variable group is a (non-ordered) factor, it is not subject to contrasts. So the model formula should be:
val ~ sex + group + s(time, by = group)
The following is quoted from 'by' variables section of ?gam.models:
If a ‘by’ variable is a ‘factor’ then it generates an indicator
vector for each level of the factor, unless it is an ‘ordered’
factor. In the non-ordered case, the model matrix for the smooth
term is then replicated for each factor level, and each copy has
its rows multiplied by the corresponding rows of its indicator
variable. The smoothness penalties are also duplicated for each
factor level. In short a different smooth is generated for each
factor level (the ‘id’ argument to ‘s’ and ‘te’ can be used to
force all such smooths to have the same smoothing parameter).
‘ordered’ ‘by’ variables are handled in the same way, except that
no smooth is generated for the first level of the ordered factor
(see ‘b3’ example below). This is useful for setting up
identifiable models when the same smooth occurs more than once in
a model, with different factor ‘by’ variables.
Remark 2
I am not here to judge your model, but there seems to be a clear within-group difference between "F" and "M". From your data we see that "F" and "M" have a bigger difference in group "b" than in group "a". At the moment the effect of sex is identical in both groups, and it is just a vertical shift. You can observe this in the above ggplot in this answer. It is up to you to decide the model in the end, but just in case you want to model this sex-group interaction, you can do
df2$sex_group <- with(df2, interaction(sex, group)) ## the new variable is unordered
test <- gam(val ~ sex + group + s(time, by = sex_group), data = df2, method = "REML")
Note how I provide two factor variables to by. An auxiliary variable sex_group is created.

Related

Apply rules to growing window

I want to loop through the dataframe Out using a window that:
Grows one increment at a time (so the rear of the window is fixed and the front of the window grows - window gets bigger)
At each increment, the following rules should be run over the window:
if (mean(Speed_out) <= 0.152682)
Behaviour <- Lying
else if (Movement_Out == “left”) <= 20.8 && (mean(Speed_Out) >=
0.200921)
Behaviour <- Grazing
If no rules are met then the window should grow one increment at a time until a rule is met.
Once a rule is met, all of the previous increments should be labelled with the Behaviour assigned to that rule above.
The next window should then start at the next element after where the last window terminated.
The initial window size should be adjustable (the window size at the start and after each terminated window).
Notes:
The condition (Movement_Out == “left”) <= 20.8 means that "left" occupies at most 20.8% of the rows in the window.
Example:
Here's a short example of the output I'd like from the data provided below where the starting window size was set to 4:
Speed_Out Movement_Out Behaviour
1 0.220 left Lying
2 0.155 left Lying
3 0.120 forward Lying
4 0.090 non-moving Lying <== window terminates here
5 0.125 non-moving Grazing <== new window starts here
6 0.125 non-moving Grazing
7 0.155 non-moving Grazing
8 0.340 forward Grazing
9 0.370 forward Grazing <== window terminates here
10 0.185 forward Grazing <== new window starts here
11 0.155 right Grazing
12 0.220 non-moving Grazing
13 0.220 non-moving Grazing
14 0.280 non-moving Grazing <== window terminates here
15 0.215 non-moving Grazing <== new window starts here
16 0.060 right Grazing
17 0.340 non-moving Grazing
18 0.555 forward Grazing <== window terminates here
19 0.275 right And so on..
20 0.215 forward
Dataframe for your use
Out <- structure(list(Speed_Out = c(0.22, 0.155, 0.12, 0.09, 0.125,
0.125, 0.155, 0.34, 0.37, 0.185, 0.155, 0.22, 0.22, 0.28, 0.215,
0.06, 0.34, 0.555, 0.275, 0.215, 0.185, 0.06, 0.245, 0.31, 0.345,
0.375, 0.375, 0.87, 1.025, 0.405, 0, 0.185, 0.31, 0.155, 0.125,
0.22, 0.375, 0.345, 0.345, 0.405, 0.31, 0.34, 0.245, 0.155, 0.19,
0.22, 0.185, 0.12, 0.185, 0.155, 0.245, 0.31, 0.155, 0.155, 0.25,
0.215, 0.09, 0.06, 0.245, 0.495, 0.495, 0.34, 0.28, 0.31, 0.28,
0.25, 0.25, 0.185, 0.155, 0.25, 0.28, 0.28, 0.34, 0.215, 0.125,
0.155, 0.34, 0.34, 0.09, 0.59, 1.71, 1.18, 0.185, 0.215, 0.185,
0.185, 0.155, 0.19, 0.19, 0.19, 0.87, 2.045, 2.73, 1.585, 0.22,
0.25, 0.435, 0.405, 0.405, 0.405, 0.715, 0.62, 0.37, 0.4, 0.185,
0.375, 0.59, 0.525, 0.245, 0.495, 0.495, 0.68, 0.775, 0.25, 0.31,
0.34, 0.28, 0.28, 0.25, 1.55, 2.695, 1.705, 1.21, 0.87, 0.25,
1.52, 1.52, 0.405, 0.81, 2.08, 2.915, 1.705, 0.435, 0.22, 0.78,
1.215, 0.84, 0.495, 0.495, 0.56, 0.375, 0.28, 0.715, 1.025, 0.495,
0.65, 1.18, 1.09, 0.995, 0.87, 0.435, 0.125, 0.435, 0.555, 0.775,
1.12, 1.555, 1.15, 0.25, 0.87, 0.93, 0.28, 0.31, 0.31, 0.375,
0.78, 0.655, 0.53, 0.62, 0.525, 0.37, 0.555, 1.025, 0.655, 1.12,
1.585, 0.715, 0.155, 0.28, 1.12, 2.11, 1.645, 0.715, 0.465, 0.84,
0.81, 0.655, 0.84, 0.435, 0.28, 0.215, 0.93, 1.335, 0.65, 0.185,
0.155, 0.34, 0.4, 0.37, 0.435, 0.405, 0.28, 0.28, 0.25, 0.25,
0.745, 1.24, 0.805, 1.055, 1.085, 0.465, 0.375, 0.5, 0.59, 0.37,
0.185, 0.34, 0.37, 0.435, 0.405, 0.06, 0.125, 0.25, 0.31, 0.405,
0.78, 0.56, 0.215, 0.495, 0.87, 1.025, 0.62, 0.405, 0.405, 0.405,
0.31, 0.215, 0.465, 0.435, 0.34, 0.275, 0.215, 0.25, 0.22, 0.22,
0.125, 0.245, 0.34, 0.31, 0.37, 0.31, 0.31, 0.245, 0.185, 0.25,
0.22, 0.22, 0.31, 0.28, 0.22, 0.28, 0.53, 0.655, 0.375, 0.19,
0.405, 0.435, 0.28, 0.215, 0.77, 0.96, 1.865, 1.83, 0.495, 0.655,
1.615, 1.395, 0.31, 0.31, 0.25, 0.28, 0.34, 0.34), Movement_Out = structure(c(2L,
2L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 4L, 3L, 3L, 3L, 3L, 4L, 3L,
1L, 4L, 1L, 1L, 2L, 2L, 3L, 4L, 3L, 2L, 4L, 1L, 2L, 1L, 3L, 3L,
1L, 3L, 2L, 4L, 3L, 1L, 3L, 1L, 1L, 1L, 4L, 3L, 3L, 3L, 3L, 1L,
3L, 3L, 3L, 2L, 4L, 3L, 3L, 4L, 2L, 3L, 1L, 1L, 2L, 4L, 1L, 2L,
4L, 3L, 3L, 4L, 3L, 3L, 2L, 4L, 2L, 1L, 2L, 4L, 4L, 2L, 4L, 2L,
1L, 2L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 1L, 3L, 3L,
2L, 2L, 3L, 1L, 2L, 4L, 3L, 4L, 2L, 3L, 1L, 4L, 4L, 3L, 1L, 2L,
1L, 1L, 4L, 1L, 2L, 4L, 2L, 1L, 1L, 2L, 4L, 2L, 2L, 4L, 1L, 1L,
2L, 4L, 2L, 4L, 2L, 1L, 2L, 2L, 4L, 2L, 4L, 2L, 4L, 3L, 1L, 4L,
2L, 1L, 1L, 2L, 4L, 2L, 4L, 2L, 4L, 4L, 2L, 4L, 1L, 1L, 4L, 2L,
4L, 4L, 3L, 4L, 4L, 2L, 1L, 1L, 1L, 4L, 1L, 1L, 4L, 4L, 2L, 2L,
4L, 1L, 2L, 2L, 4L, 4L, 4L, 2L, 2L, 1L, 4L, 4L, 2L, 3L, 1L, 2L,
2L, 4L, 4L, 1L, 2L, 4L, 4L, 2L, 2L, 4L, 2L, 4L, 2L, 4L, 1L, 1L,
2L, 1L, 4L, 4L, 3L, 4L, 2L, 4L, 3L, 1L, 1L, 2L, 1L, 1L, 4L, 2L,
4L, 2L, 4L, 3L, 1L, 4L, 1L, 1L, 2L, 4L, 2L, 1L, 4L, 1L, 4L, 3L,
2L, 3L, 2L, 4L, 3L, 3L, 2L, 1L, 3L, 1L, 1L, 3L, 2L, 3L, 3L, 3L,
1L, 2L, 4L, 2L, 3L, 2L, 1L, 4L, 3L, 2L, 4L, 4L, 2L, 4L, 1L, 1L,
2L, 2L, 4L, 1L, 2L, 4L, 2L, 4L, 3L, 4L), .Label = c("forward",
"left", "non-moving", "right"), class = "factor")), .Names = c("Speed_Out",
"Movement_Out"), row.names = c(NA, 283L), class = "data.frame")
Ok, I have to say that this has been less trivial than I expected. My answer is ugly and most likely not optimal, but it seems to work.
There seem to be a few spots where even when the rest of the data was taken into account, none of the conditions were met, so the behaviour for those stayed at NA.
library(dplyr)

# Label rows of `Out` with a Behaviour using a growing window:
#   * a window starts at row i with `window_size` rows;
#   * if the window's mean speed / share of "left" movements matches a rule,
#     every row in the window gets that Behaviour and the next window starts
#     on the row after the window's last row;
#   * otherwise the window grows by one row; once it reaches the last row
#     without a match, the start row is advanced by one and the window resets.
# Rows that never fall inside a matching window keep Behaviour = NA.

# Create id variable used to join results later
Out <- Out %>%
  mutate(id = row_number())

# Initial window size (also the size every new window restarts with)
window_size <- 4

# Loop state: i = first row of the current window, w = current window length,
# window_cnt = number assigned to the next matching window
n_rows <- nrow(Out)
w <- window_size
i <- 1
window_cnt <- 1

# Matched windows are collected in a list and bound once after the loop,
# instead of growing a data frame with rbind() on every match. The typed
# empty frame keeps the join below valid even if no window ever matches.
results <- list(
  data.frame(id = integer(), Behaviour = character(), window_nr = numeric(),
             stringsAsFactors = FALSE)
)

while (i <= n_rows) {
  print(paste0("Row: ", i, ", Window Size: ", w))

  # Clamp the window to the data: indexing past the last row would add
  # all-NA rows, turning the window mean into NA so that the final
  # (< window_size) rows could never match a rule.
  last <- min(i + w - 1, n_rows)

  win <- Out[i:last, ] %>%
    mutate(mean_sp = mean(Speed_Out),
           # share of rows in the window whose movement is "left"
           mvmt = mean(Movement_Out == "left")) %>%
    mutate(Behaviour = case_when(mean_sp <= 0.152682 ~ "Lying",
                                 mvmt <= 0.208 & mean_sp >= 0.200921 ~ "Grazing",
                                 TRUE ~ NA_character_),
           window_nr = window_cnt)

  if (!all(is.na(win$Behaviour))) {
    # A rule matched: keep the labelled rows and start a fresh window
    # right after this one.
    results[[length(results) + 1]] <- win %>% select(id, Behaviour, window_nr)
    i <- last + 1
    w <- window_size
    window_cnt <- window_cnt + 1
  } else if (last < n_rows) {
    # No rule matched yet and there is still data left: grow the window.
    w <- w + 1
  } else {
    # The window already reaches the last row without a match: give up on
    # row i, move the start forward by one and reset the window size.
    w <- window_size
    i <- i + 1
  }
  rm(win, last)
}

out_behaviour <- bind_rows(results)

# Join Behaviour column back to original data frame
Out <- left_join(Out, out_behaviour, by = "id") %>% select(-id)

# Clean up workspace
rm(i, w, n_rows, window_size, window_cnt, results, out_behaviour)
And the first 20 outputs
Speed_Out Movement_Out Behaviour window_nr
1 0.220 left Lying 1
2 0.155 left Lying 1
3 0.120 forward Lying 1
4 0.090 non-moving Lying 1
5 0.125 non-moving Grazing 2
6 0.125 non-moving Grazing 2
7 0.155 non-moving Grazing 2
8 0.340 forward Grazing 2
9 0.370 forward Grazing 2
10 0.185 forward Grazing 3
11 0.155 right Grazing 3
12 0.220 non-moving Grazing 3
13 0.220 non-moving Grazing 3
14 0.280 non-moving Grazing 3
15 0.215 non-moving Grazing 4
16 0.060 right Grazing 4
17 0.340 non-moving Grazing 4
18 0.555 forward Grazing 4
19 0.275 right Grazing 5
20 0.215 forward Grazing 5
I know the code is a mess, so let me know if it needs some extra commenting.

Too many geom_points after facetting in ggplot2

Running the following script I was hoping to have one data point for each of the six terms, with different colors depending on the dataset, facetted by adjustment. However, I get three or four points for each term in each facet. Any idea how this can happen when I only have 24 rows in the dataset?
library(ggplot2)
tb5 <- structure(list(term = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 1L,
2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L, 6L, 1L, 2L, 3L, 4L, 5L,
6L), .Label = c("A", "B", "C", "D", "E", "F"), class = "factor"),
coef = c(-1.444, 0.035, -0.034, 0.005, 0.001, 2.43, -1.032,
0.032, -0.024, 0.025, 0.003, 1.758, -1.148, 0.02, 0.003,
0.027, 0.003, 12.713, -1.494, 0.028, -0.021, 0.007, 0.004,
13.499), ci.lb = c(-1.826, 0.025, -0.087, -0.011, -0.004,
0.3, -1.293, 0.026, -0.061, 0.016, -0.001, -0.273, -1.48,
0.011, -0.045, 0.014, -0.003, 11.858, -1.931, 0.015, -0.08,
-0.014, -0.002, 12.624), ci.ub = c(-1.071, 0.045, 0.019,
0.022, 0.007, 7.305, -0.775, 0.038, 0.012, 0.035, 0.007,
6.613, -0.816, 0.029, 0.051, 0.039, 0.008, 13.569, -1.056,
0.04, 0.038, 0.027, 0.01, 14.375), Adjusted = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Adjusted", "Unadjusted"
), class = "factor"), Dataset = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("a", "b"), class = "factor")), .Names = c("term",
"coef", "ci.lb", "ci.ub", "Adjusted", "Dataset"), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -24L))
ggplot(data = tb5,aes(x=term,y=coef,color=Dataset))+geom_point()+
facet_grid(facets = ~Adjusted)+
geom_jitter(height = .8)

Rotate a faceted, grouped bar plot

**UPDATED BELOW
I have created a plot, I literally need it horizontal, but the coord_flip() leaves the facets on the bottom instead of having nested groups on the left.
The data:
srvc_data <- structure(list(dept = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Distribution Centre Services",
"IT", "Marketing", "Merchandise & Inventory", "Operations and Communication"
), class = "factor"), label = c("test5", "test7", "test3", "test10",
"test4", "test6", "test2", "test1", "test11", "test12", "test9",
"test8", "test18", "test19", "test15", "test17", "test13", "test16",
"test20", "test14", "test22", "test21", "test25", "test23", "test24",
"test27", "test26", "test28", "test29", "test31", "test33", "test30",
"test32", "test38", "test36", "test37", "test43", "test34", "test35",
"test40", "test39", "test42", "test41", "test5", "test7", "test3",
"test10", "test4", "test6", "test2", "test1", "test11", "test12",
"test9", "test8", "test18", "test19", "test15", "test17", "test13",
"test16", "test20", "test14", "test22", "test21", "test25", "test23",
"test24", "test27", "test26", "test28", "test29", "test31", "test33",
"test30", "test32", "test38", "test36", "test37", "test43", "test34",
"test35", "test40", "test39", "test42", "test41"), Gap = c(-0.07,
-0.13, -0.15, -0.16, -0.16, -0.21, -0.22, -0.24, -0.24, -0.25,
-0.3, -0.3, -0.18, -0.19, -0.24, -0.29, -0.3, -0.34, -0.36, -0.41,
-0.46, -0.63, -0.16, -0.18, -0.21, -0.22, -0.27, -0.29, -0.31,
-0.31, -0.35, -0.39, -0.42, -0.15, -0.15, -0.2, -0.21, -0.22,
-0.27, -0.29, -0.29, -0.31, -0.36, -0.07, -0.13, -0.15, -0.16,
-0.16, -0.21, -0.22, -0.24, -0.24, -0.25, -0.3, -0.3, -0.18,
-0.19, -0.24, -0.29, -0.3, -0.34, -0.36, -0.41, -0.46, -0.63,
-0.16, -0.18, -0.21, -0.22, -0.27, -0.29, -0.31, -0.31, -0.35,
-0.39, -0.42, -0.15, -0.15, -0.2, -0.21, -0.22, -0.27, -0.29,
-0.29, -0.31, -0.36), impeff = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Importance", "Effectiveness"), class = "factor"),
score = c(0.87, 0.79, 0.78, 0.82, 0.81, 0.81, 0.92, 0.92,
0.78, 0.81, 0.86, 0.91, 0.79, 0.79, 0.87, 0.93, 0.9, 0.9,
0.82, 0.95, 0.91, 0.95, 0.77, 0.79, 0.82, 0.8, 0.83, 0.9,
0.91, 0.94, 0.89, 0.94, 0.91, 0.82, 0.74, 0.78, 0.81, 0.83,
0.85, 0.82, 0.81, 0.8, 0.83, 0.8, 0.66, 0.63, 0.66, 0.65,
0.6, 0.7, 0.68, 0.54, 0.56, 0.56, 0.61, 0.61, 0.6, 0.63,
0.64, 0.6, 0.56, 0.46, 0.54, 0.45, 0.32, 0.61, 0.61, 0.61,
0.58, 0.56, 0.61, 0.6, 0.63, 0.54, 0.55, 0.49, 0.67, 0.59,
0.58, 0.6, 0.61, 0.58, 0.53, 0.52, 0.49, 0.47)), row.names = c(NA,
-86L), .Names = c("dept", "label", "Gap", "impeff", "score"), class = "data.frame")
And the code:
ggplot(data = srvc_data, aes(x = label, y = score)) +
geom_bar( aes(fill = impeff),stat = "identity", position = "dodge",width = 1) +
facet_grid(~dept, switch = "x", scales = "free", space = "free") +
#coord_flip()+
The plot (without the flip) looks like the below, I need it horizontal, with the facet categories on the far left. How does the coord_flip() work? Why wouldn't it also flip/move the facet strips? Please ignore the crammed formatting!
**UPDATE
So thanks to #neilfws I have fixed the plot, by switching the order of the data.
ggplot(data = srvc_data, aes(x = label, y = score)) +
geom_bar( aes(fill = impeff),stat = "identity", position = "dodge",width = 1) +
facet_grid(dept~., switch = "y", scales = "free_y", space = "free") +
coord_flip()
Now I have the correctly oriented plot, but there is lots of unused space for all the labels that are unused in each facet. Within the facet_grid call, setting scales = "free" doesn't work, nor does drop = T. Any ideas? Plot below for reference.
If you coord_flip, you also need to reverse the faceting relationship (~), to place it on the side, and the switch, to place it on the y-axis. Does this get you close to what you want?
ggplot(srvc_data, aes(label, score)) +
geom_bar( aes(fill = impeff), stat = "identity", position = "dodge", width = 1) +
facet_grid(dept ~ ., switch = "y", scales = "free", space = "free") + coord_flip()

How to get geom_smooth() ignore my colour grouping

I'm trying to make a plot with fitted lines for the two levels of my factor (grouped by color). I used shapes to group another variable, but when I try to fit a smoother, I end up with 4 lines while I only need two lines in total (1 per color).
Here is the data and code I use:
data <- structure(list(K = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2s", "4s"), class = "factor"),
q = c(0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04,
0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08, 0.07, 0.06,
0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1, 0.09, 0.08,
0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.12, 0.11, 0.1,
0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01), rarity = c(0.907,
0.9206, 0.9359, 0.9321, 0.9405, 0.9344, 0.9449, 0.9106, 0.8844,
0.8829, 0.8989, 0.798, 0.7464, 0.8225, 0.877, 0.8521, 0.9127,
0.9317, 0.9245, 0.9595, 0.9628, 0.9573, 0.9423, 0.9428, 0.5802,
0.6414, 0.5123, 0.57, 0.587, 0.5655, 0.5231, 0.517, 0.4694,
0.5459, 0.3745, 0.3274, 0.7936, 0.7821, 0.7297, 0.7227, 0.6814,
0.6608, 0.6721, 0.6202, 0.5924, 0.5659, 0.5448, 0.6138),
metric = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("APD", "ED"
), class = "factor")), .Names = c("K", "q", "rarity", "metric"
), class = "data.frame", row.names = c(NA, -48L))
library(ggplot2)
ggplot(data=data, aes(x=q, y=rarity, colour=metric, shape=K))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
geom_point()+
geom_smooth(method=lm,se=FALSE)
Any advice on that?
You're getting two lines for each group because it's being split by both metric and K. You really want the shape aesthetic to only apply to the point layer, not the smooth layer. It's better just to move the aes() for that property there.
ggplot(data=data, aes(x=q, y=rarity, colour=metric))+
ggtitle("Relationship")+
xlab("rate of character change")+
ylab("Correlation coefficient to average rarity")+
geom_point(aes(shape=K))+
geom_smooth(method=lm,se=FALSE)

Passing a String Via Index in R

I'm trying to use a for loop to pull subsets of data out of a dataframe with R.
I have a little vector to hold all the possible occurences of the names in that column
meter_class<-c("one_s_120","nine_s_120", "nine_s_480","fortyfive_s_120", "fortyfive_s_480")
Whenever I try to address it by index reference, it fails. Either nothing in the data subset survives (NULLs everywhere), or R complains about not passing the right argument by using meter_class[1]
attach(meter_class[1])
Error in attach(meter_class[1]) : file 'one_s_120' not found
subset(cal, cal$Form==as.character(meter_class[1]))
[1] Test Amps Type Accuracy Voltage Form
<0 rows> (or 0-length row.names)
Also, here's the output of dput on the data frame cal:
structure(list(Test = structure(c(3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Burst", "ESD", "Inspection",
"Surge"), class = "factor"), Amps = c(15, 15, 1.5, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 15, 15,
1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 15, 15, 1.5, 2.5, 2.5, 0.25, 2.5, 2.5,
0.25, 2.5, 2.5, 0.25, 2.5, 2.5, 0.25), Type = structure(c(2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L,
3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L,
2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L,
1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 1L, 3L), .Label = c("Lag - 0.5",
"Unity - Full", "Unity - Light"), class = "factor"), Accuracy = c(-0.011,
0.012, 0.027, 0.033, 0.076, 0.006, 0.052, 0.046, -0.016, 0.021,
0.008, 0.023, 0.034, 0.036, 0.038, 0.002, 0.012, 0.097, 0.055,
0.093, 0.033, 0.068, 0.048, -0.016, 0.042, 0.03, 0.035, 0.041,
0.024, 0.027, 0.004, -0.012, 0.002, 0.038, 0.084, 0.015, 0.049,
0.045, -0.009, 0.025, 0.002, 0.029, 0.03, 0.032, 0.064, 0.011,
0.024, 0.033, 0.054, 0.085, 0.027, 0.071, 0.059, 0.01, 0.051,
0.012, 0.051, 0.048, 0.04, 0.051), Voltage = c(120, 120, 120,
120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480, 480, 120,
120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120, 480, 480,
480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120, 120, 120,
480, 480, 480, 120, 120, 120, 120, 120, 120, 480, 480, 480, 120,
120, 120, 480, 480, 480), Form = structure(c(3L, 3L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("45S", "9S", "i210plus"
), class = "factor")), .Names = c("Test", "Amps", "Type", "Accuracy",
"Voltage", "Form"), class = "data.frame", row.names = c(NA, -60L
))
I know this is a simple thing to do if you know how to do it...Can anyone light the way?
Thanks!
It seems that none of the values of "meter_class" are represented in "Form" in your data frame (named df in the code below; it is cal in the question).
unique(df$Form)
# [1] i210plus 9S 45S
meter_class %in% unique(df$Form)
# [1] FALSE FALSE FALSE FALSE FALSE
Just try two forms of subsetting, using values of "Form" actually present in the data:
subset(df, Form == "9S")
df[df$Form == "9S", ]
I also note that you wish to "pull subsets of data out of a dataframe". Not knowing the full story and your objectives of doing so, but please note that there are loads of functions that allow you to perform calculations, plotting, or whatever, on subsets of your data.
Update following comment
You can subset a data frame by combining logical conditions with logical operators (see e.g. ?Extract, ?&)
meter_class <- c("i210plus", "9S", "45S")
df[df$Form == "9S" & df$Voltage == 120, ]
# or
subset(df, Form == "9S" & Voltage == 120)

Resources