Error in R t_test , not enough "x" observations - r

I am trying to conduct a group-wise t-test, but the code I am using is returning an error. It has worked fine for me previously on other data frames, but for this data frame it gives this error:
Error in t.test.default(x = 0.0268, y = 0.0223, paired = FALSE,
var.equal = FALSE, : not enough 'x' observations
My Code is
stat.test.BACI5 <- Flaov %>%
group_by(`Treatment`) %>%
t_test(`Observed` ~ Control, detailed = TRUE) %>%
adjust_pvalue(method = "bonferroni") %>%
add_significance()
Here is the data structure
structure(list(Treatment = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Phase1", "Phase2"), class = "factor"), Group = structure(c(3L,
4L, 2L, 3L, 2L, 4L, 1L, 2L, 4L, 3L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 4L, 2L, 3L, 2L, 4L, 3L, 1L, 2L, 4L, 1L, 3L,
1L, 1L, 1L, 2L, 1L, 3L, 2L, 1L, 2L, 3L, 1L, 1L, 1L, 2L, 2L, 2L,
4L, 2L, 1L, 1L, 1L, 4L, 1L, 3L, 1L, 3L, 4L, 2L, 1L, 1L, 2L, 4L,
2L, 3L, 1L, 1L, 2L), .Label = c("Group A ", "Group B", "Group C ",
"Group D"), class = "factor"), Observed = c(0.1057, 0.151, 0.0576,
0.1267, 0.0941, 0.1554, 0.0247, 0.0832, 0.2807, 0.1137, 0.0325,
0.0777, 0.0362, 0.0637, 0.0303, 0.0223, 0.0932, 0.0363, 0.0641,
0.0453, 0.0359, 0.0334, 0.2006, 0.0538, 0.1114, 0.0661, 0.2452,
0.1043, 0.0489, 0.0663, 0.1967, 0.0321, 0.1042, 0.0268, 0.0313,
0.0255, 0.0787, 0.038, 0.1212, 0.0839, 0.0446, 0.0986, 0.1364,
0.0335, 0.0409, 0.0407, 0.0871, 0.0584, 0.0875, 0.1961, 0.0711,
0.0191, 0.0363, 0.0474, 0.1608, 0.0349, 0.1099, 0.0399, 0.1095,
0.2011, 0.057, 0.0418, 0.0394, 0.054, 0.2033, 0.0631, 0.1089,
0.0441, 0.0261, 0.0686), Control = c(0.1061, 0.154, 0.0585, 0.1289,
0.1076, 0.15856, 0.02997, 0.1022, 0.2849, 0.1193, 0.03292, 0.0888,
0.04628, 0.06454, 0.03341, 0.0239, 0.1013, 0.0364, 0.0883, 0.06363,
0.0566, 0.04036, 0.20641, 0.06206, 0.1158, 0.0687, 0.2457, 0.12643,
0.05126, 0.05705, 0.1987, 0.04719, 0.08199, 0.02312, 0.0317,
0.07045, 0.06395, 0.06043, 0.1251, 0.0912, 0.04575, 0.1018, 0.1379,
0.03834, 0.048, 0.04131, 0.0926, 0.06242, 0.0965, 0.1972, 0.0742,
0.0211, 0.04318, 0.05741, 0.1616, 0.06552, 0.1104, 0.04814, 0.11015,
0.2081, 0.06341, 0.04329, 0.04486, 0.06179, 0.2114, 0.05545,
0.1127, 0.04327, 0.03355, 0.07189), factors = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("Phase1", "Phase2"), class = "factor")), row.names = c(NA,
70L), class = "data.frame")

If you are doing a t-test between Observed and Control within each treatment group, the formula is wrong: the left-hand side of the formula should be the response variable and the right-hand side should be the grouping variable.
In your case, you need to pivot the data long to get something like this:
library(tidyr)
Flaov[,c("Treatment","Observed","Control")] %>%
pivot_longer(-c(Treatment)) %>% group_by(Treatment)
# A tibble: 140 x 3
# Groups: Treatment [2]
Treatment name value
<fct> <chr> <dbl>
1 Phase1 Observed 0.106
2 Phase1 Control 0.106
3 Phase1 Observed 0.151
4 Phase1 Control 0.154
5 Phase1 Observed 0.0576
6 Phase1 Control 0.0585
7 Phase1 Observed 0.127
8 Phase1 Control 0.129
9 Phase1 Observed 0.0941
10 Phase1 Control 0.108
# … with 130 more rows
Then we further pipe it to test:
# Keep the grouping column and the two measurement columns, stack Observed
# and Control into name/value pairs, then run one t-test of value by name
# within each Treatment level.
Flaov[,c("Treatment","Observed","Control")] %>%
pivot_longer(-c(Treatment)) %>%
group_by(Treatment) %>%
t_test(value ~ name)
# A tibble: 2 x 9
Treatment .y. group1 group2 n1 n2 statistic df p
* <fct> <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
1 Phase1 value Control Observed 46 46 0.482 90.0 0.631
2 Phase2 value Control Observed 24 24 0.323 46.0 0.748

Related

Multiple fixed effect levels missing in lmer from lme4

I am running linear mixed models using lmer from lme4. We are testing the effect of family, strain and temperature on several growth factors for brook trout. I have 4 families (variable FAMILLE) from which we sampled our individuals. 2 are from the selected strain and 2 are from the control strain (variable Lignee). For each strain, the 2 families were marked as either resistant (Res) or sensitive (Sens). So my fixed-effect variable (FAMILLE) is nested in my variable Lignee. The experiment was conducted at 3 different temperatures.
Here is what my dataframe looks like :
structure(list(BASSIN = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4"), class = "factor"), t.visee = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("15", "17", "19"), class = "factor"), FAMILLE = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L), .Label = c("RES", "SENS"), class = "factor"), Lignee = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("CTRL", "SEL"), class = "factor"), taux.croiss.sp.poids = c(0.8,
1.14285714285714, 1.42857142857143, 0.457142857142857, -0.228571428571429,
0.628571428571429, 0.971428571428571, 0.742857142857143, 1.08571428571429,
0.8, 0.571428571428571, 1.02857142857143, 0.8, 0.285714285714286,
0.285714285714286, 0.571428571428571, 0.742857142857143, 1.14285714285714,
0.628571428571429, 0.742857142857143, 1.02857142857143, 0.285714285714286,
0.628571428571429, 0.628571428571429, 0.857142857142857, 0.8,
1.08571428571429, 1.37142857142857, 0.742857142857143, 1.08571428571429,
0.0571428571428571, 0.571428571428571, 0.171428571428571, 0.8,
0.685714285714286, 0.285714285714286, 0.285714285714286, 0.8,
0.457142857142857, 1.02857142857143, 0.342857142857143, 0.742857142857143,
0.857142857142857, 0.457142857142857, 0.742857142857143, 1.25714285714286,
0.971428571428571, 0.857142857142857, 0.742857142857143, 0.514285714285714
)), row.names = c(NA, -50L), class = c("tbl_df", "tbl", "data.frame"
))
Lignee has 2 levels (Sel and Ctrl)
FAMILLE has 2 levels (Sens and Res)
So I have 4 distinct levels :
Lignee Sel and FAMILLE Sens
Lignee Sel and FAMILLE Res
Lignee Ctrl and FAMILLE Sens
Lignee Ctrl and FAMILLE Res
when I run for example this line to test the effect of the variables on the rate of weight gain:
model6 <- lmer((taux.croiss.sp.poids) ~ t.visee + Lignee/FAMILLE + (1 |BASSIN), data = mydata1, REML = FALSE)
and then
summary(model6)
<Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's method ['lmerModLmerTest']
Formula: (taux.croiss.sp.poids) ~ t.visee + Lignee/FAMILLE + (1 | BASSIN)
Data: mydata1
AIC BIC logLik deviance df.resid
115.2 139.5 -50.6 101.2 228
Scaled residuals:
Min 1Q Median 3Q Max
-3.11527 -0.59489 0.05557 0.69775 2.79920
Random effects:
Groups Name Variance Std.Dev.
BASSIN (Intercept) 0.01184 0.1088
Residual 0.08677 0.2946
Number of obs: 235, groups: BASSIN, 4
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 0.770942 0.209508 194.702337 3.680 0.000302 ***
t.visee -0.019077 0.011682 231.005933 -1.633 0.103809
LigneeSEL 0.214062 0.054471 231.007713 3.930 0.000112 ***
LigneeCTRL:FAMILLESENS -0.008695 0.054487 231.038877 -0.160 0.873358
LigneeSEL:FAMILLESENS -0.205001 0.054242 231.016973 -3.779 0.000200 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) t.vise LgnSEL LCTRL:
t.visee -0.948
LigneeSEL -0.131 0.000
LCTRL:FAMIL -0.124 -0.007 0.504
LSEL:FAMILL 0.000 0.000 -0.498 0.000>
From what I can understand, the model chooses 1 family as the reference group, which won't be in the output. But the problem here is that 2 groups are missing :
LigneeCTRL:FAMILLERES
AND
LigneeSEL:FAMILLERES
Does anybody know why my output is missing not ONE but TWO of the groups?
I'm french canadian so don't hesitate if some things are not clear, I will try to re-explain in other words!
Also, this is my 1st message on Stack, I tried to include everything needed but dont hesitate if I need to include some other things!
Thanks in advance

Store coefficients, confidence intervals and odds ratios in one dataframe after univariate logistic regression with multiple independent variables

The dataframe looks like this (dput at the end):
A B C D E F G H I J K
<fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
1 No ev~ fema~ >=60 <30 B 1 >=150 M 0 >=30 No
2 No ev~ fema~ <60 <30 A 0 <150 B 0 <30 No
3 No ev~ fema~ <60 >=30 A 1 >=150 M 0 <30 No
4 No ev~ fema~ >=60 <30 A 0 >=150 M 0 <30 No
5 No ev~ male <60 >=30 B 1 >=150 B 0 <30 No
6 No ev~ male >=60 <30 A 1 >=150 M 1 >=30 No
7 event fema~ >=60 >=30 A 1 >=150 B 0 <30 Yes
8 No ev~ fema~ <60 <30 A 0 >=150 M 0 >=30 No
9 No ev~ male >=60 <30 A 0 >=150 B 1 <30 No
10 No ev~ male >=60 <30 B 1 >=150 M 0 <30 No
# ... with 140 more rows
I perform univariate logistic regression where A is the dependent variable and the others B:K are the independent variables. All are factor.
This code works:
lapply(c("B","C","D","E","F","G","H","I","J", "K"),
function(var) {
formula <- as.formula(paste("A ~", var))
res.logist <- glm(formula, data = df_fake, family = binomial)
summary(res.logist)
})
However there is a bunch of information in the output(each of them useful) but I want to know if this kind of output is possible in R:
Desired output:
Estimate Std. Error z value Pr(>|z|) OR lowerlimit upperlimit
Bmale 0.2941 0.6917 0.425 0.671 ? ? ?
C>=60 0.5653 0.7269 0.778 0.437
D>=30 1.7579 0.7061 2.489 0.0128 *
EB 0.7302 0.6929 1.054 0.292
F1 1.4508 0.7298 1.988 0.0468 *
G>=150 0.1238 0.6917 0.179 0.858
HM 1.0223 0.7274 1.405 0.16
I1 0.5325 0.7351 0.724 0.469
J>=30 0.6581 0.7372 0.893 0.372
KYes 5.0814 0.9917 5.124 0.00000029909 ***
The data:
df <- structure(list(A = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No event",
"event"), class = "factor"), B = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 1L), .Label = c("female", "male"), class = "factor"), C = structure(c(2L,
1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 2L), .Label = c("<60", ">=60"), class = "factor"),
D = structure(c(1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("<30", ">=30"), class = "factor"),
E = structure(c(2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
F = structure(c(2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
G = structure(c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L), .Label = c("<150", ">=150"), class = "factor"),
H = structure(c(2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("B", "M"), class = "factor"),
I = structure(c(1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L), .Label = c("0", "1"), class = "factor"),
J = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("<30", ">=30"), class = "factor"),
K = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L), .Label = c("No", "Yes"), class = "factor")), row.names = c(NA,
-150L), class = c("tbl_df", "tbl", "data.frame"))
#TarJae. Maybe this might be helpful.
You can use map_df from purrr to go through all of your independent variables.
If you use tidy from broom you can get the output desired. If you add conf.int to TRUE you will get confidence intervals (default level is 0.95).
You can also obtain relative odds ratios with exp of your estimate. An exponentiate option for tidy would only show exponentiated coefficients.
library(tidyverse)
library(broom)
# Fit one univariate logistic regression per predictor (every column of df
# except the outcome A) and stack the tidied coefficient rows.
names(df)[names(df) != "A"] %>%
  set_names() %>%
  map_df(function(predictor) {
    model_formula <- formula(paste("A ~ ", predictor))
    univariate_fit <- glm(model_formula, data = df, family = binomial)
    # conf.int = TRUE adds the conf.low / conf.high columns.
    tidy(univariate_fit, conf.int = TRUE)
  }) %>%
  # Drop the intercept rows, then add the odds ratio as exp(log-odds).
  filter(term != "(Intercept)") %>%
  mutate(OR = exp(estimate))
Output
term estimate std.error statistic p.value conf.low conf.high OR
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Bmale 0.294 0.692 0.425 0.671 -1.07 1.73 1.34
2 C>=60 0.565 0.727 0.778 0.437 -0.808 2.15 1.76
3 D>=30 1.76 0.706 2.49 0.0128 0.364 3.22 5.80
4 EB 0.730 0.693 1.05 0.292 -0.640 2.16 2.08
5 F1 1.45 0.730 1.99 0.0468 0.0724 3.04 4.27
6 G>=150 0.124 0.692 0.179 0.858 -1.24 1.56 1.13
7 HM 1.02 0.727 1.41 0.160 -0.351 2.61 2.78
8 I1 0.532 0.735 0.724 0.469 -1.07 1.92 1.70
9 J>=30 0.658 0.737 0.893 0.372 -0.943 2.05 1.93
10 KYes 5.08 0.992 5.12 0.000000299 3.31 7.31 161.
I don't know of any function that does this automatically, but you can also do it yourself. Here is some code to get all the coefficients you want:
# Fit one univariate logistic model. Element 2 of the coefficient vector and
# row 2 of the tables below belong to the predictor (position 1 is the
# intercept).
res.logist <- glm(A ~ B, data = df, family = binomial)
# Compute the coefficient table and the confidence intervals once and reuse
# them, instead of re-running summary() and confint() for every column.
coef_table <- summary(res.logist)$coefficients
ci <- confint(res.logist)
res <- tibble(Independant = names(coef(res.logist))[2],
              Estimate = coef(res.logist)[2],
              'Std. Error' = coef_table[2, 2],
              'z value' = coef_table[2, 3],
              'Pr(>|z|)' = coef_table[2, 4],
              OR = exp(coef(res.logist))[2],
              # NOTE: these limits are on the log-odds scale (same scale as
              # Estimate); use exp(ci[2, ]) for limits on the OR scale.
              lowerlimit = ci[2, 1],
              upperlimit = ci[2, 2])
You can wrap this in a function and append the result of each iteration to a data frame outside the function.

Getting specific combination of interaction as variable in logistic regression with R

I have this dataset and want to perform a regression analysis on it. I have two predictor variables, urban_rural and religious. Now I want to have two specific interaction variables: 1.) Urban/not religious and 2.) Rural/religious. I know that an interaction is possible through the * operator, but this does not give me the desired combination of interactions. I guess one has to set the reference level manually?
structure(list(urban_rural = structure(c(1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Urban", "Rural", "Refugee camp"
), class = "factor"), religious = structure(c(2L, 1L, 2L, 2L,
3L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 2L,
2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 3L, 2L, 3L, 1L), .Label = c("Religious", "Somewhat religious",
"Not religious"), class = "factor"), family_role_recoded = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("Agree/strongly agree",
"Disagree/strongly disagree", "Don't know"), class = "factor")), row.names = c(NA,
250L), class = "data.frame")
I used these regression models:
model1 <- glm(family_role_recoded ~ urban_rural,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
model2 <- glm(family_role_recoded ~ religious,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
model3 <- glm(family_role_recoded ~ urban_rural + religious,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
Does anyone have an idea how to solve this problem?
First set the reference level of religious to "Somewhat religious"; then we can look at the results:
library(broom)
# Make "Somewhat religious" the baseline level, so the reported `religious`
# coefficients are contrasts against it.
dataset$religious <- relevel(dataset$religious, ref = "Somewhat religious")
# Main effects plus the urban_rural x religious interaction.
fit0 <- glm(family_role_recoded ~ urban_rural * religious,
            data = dataset, family = binomial())
# Print the coefficient table as a tibble (broom).
tidy(fit0)
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.902 0.181 -4.99 6.03e-7
2 urban_ruralRural -0.484 0.532 -0.910 3.63e-1
3 religiousReligious -0.0141 0.456 -0.0308 9.75e-1
4 religiousNot religious 1.47 0.391 3.76 1.67e-4
5 urban_ruralRural:religiousReligious 0.995 1.14 0.876 3.81e-1
6 urban_ruralRural:religiousNot religio… 0.201 0.993 0.203 8.39e-1
You have one of the terms rural/religious. Intuitively, the Urban/Not religious term would be the flip of urban_ruralRural:religiousNot religio. We can also manually define the interaction terms we need:
# 0/1 indicator: 1 only for rows that are both Rural and Religious.
dataset$Rural_religious = with(dataset,as.numeric(urban_rural=="Rural" & religious=="Religious"))
# 0/1 indicator: 1 only for rows that are both Urban and Not religious.
dataset$Urban_not_religious = with(dataset,as.numeric(urban_rural=="Urban" & religious=="Not religious"))
# `0+` removes the intercept; the two indicators enter as hand-built
# interaction terms alongside the two main effects.
fit = glm(family_role_recoded ~ 0+urban_rural+religious+Urban_not_religious+Rural_religious,data=dataset,family=binomial())
# Coefficient table as a tibble (broom).
tidy(fit)
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 urban_ruralUrban -0.902 0.181 -4.99 0.000000603
2 urban_ruralRural -1.39 0.500 -2.77 0.00556
3 religiousReligious -0.0141 0.456 -0.0308 0.975
4 religiousNot religious 1.67 0.913 1.83 0.0667
5 Urban_not_religious -0.201 0.993 -0.203 0.839
6 Rural_religious 0.995 1.14 0.876 0.381
You need to do a post hoc test. For that you can use the R package "emmeans"

ggplot2 geom_ribbon to colour-label time series

I want to colour-label a time series using ggplot2. I have two distinct states (classes) as given by the labels vector below and I want to super-impose this on top of a plot of IBM Close prices.
Below is my attempt, which unfortunately can not produce distinctively colour-coded regions by state. I am mainly having trouble with using geom_ribbon():
library(TTR)
library(ggplot2)
data <- getYahooData("IBM", start = 20130101, end = 20150101, freq = "daily")
df <- data.frame(data)
df$Date <-as.Date(row.names(df),"%Y-%m-%d")
# plot colour coded states on top of the original signal
dput(labels)
c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L)
# colour and class labels to be used by geom_ribbon
df_bg <- data.frame(x = c(0, rep(which(as.logical(diff(labels))), each=2), length(labels)),
ymin = min(df$Close, na.rm = TRUE),
ymax = 1.1*max(df$Close, na.rm = TRUE),
fill = factor(rep(labels[c(which(as.logical(diff(labels))), length(labels) )], each=2))
)
ggplot(data=df) +
geom_line(aes(x=1:nrow(df), y=diff_ma )) +
labs(title="IBM 2-State HMM") +
geom_ribbon(data = df_bg,
aes(x = x, ymin=ymin, ymax=ymax, fill=fill), alpha=.2) +
xlab("Date") +
ylab("Levels") +
theme(legend.justification = c(1, 0), legend.position = c(1, 0))
As you can see below, rather than distinct red and blue filled regions, this geom_ribbon code produces a mixture of the two.
What am I doing wrong here? I think it is the fill column in df_bg, but I am not entirely sure. Also, if I wanted to plot by Date in the x axis, can I still use geom_ribbon to colour label?
P.S: This question is related to my previous one here.
You need to add a group argument to your data and geom_ribbon call. Otherwise it groups by color and just uses the filling color from the minimum to the maximum value for each of your colors.
# colour and class labels to be used by geom_ribbon
# One pair of rows (start, end) per run of constant state.
# as.logical(diff(labels)) is TRUE wherever two consecutive labels differ;
# which() returns those positions and drops the NA comparisons.
# x: 0, every change position twice, then length(labels), so consecutive
# rows pair up as (0, c1), (c1, c2), ..., (ck, length(labels)).
df_bg <- data.frame(x = c(0, rep(which(as.logical(diff(labels))), each=2), length(labels)),
# ymin / ymax are scalars recycled to every row: the band runs from the
# lowest Close to 10% above the highest Close.
ymin = min(df$Close, na.rm = TRUE),
ymax = 1.1*max(df$Close, na.rm = TRUE),
# fill: the label at each change position (the state of the run ending
# there) plus the final label, repeated for both rows of the pair.
fill = factor(rep(labels[c(which(as.logical(diff(labels))), length(labels) )],
each=2)),
# grp: a distinct id per run (number of changes + 1), repeated for both
# rows of the pair, used as the group aesthetic.
grp = factor(rep(seq(sum(as.logical(diff(labels)), na.rm=TRUE)+1), each=2))
)
#
ggplot(data=df) +
geom_line(aes(x=1:nrow(df), y=Close)) +
labs(title="IBM 2-State HMM") +
geom_ribbon(data = df_bg,
aes(x = x, ymin=ymin, y=180, ymax=ymax, fill=fill, group=grp), alpha=.2) +
xlab("Date") +
ylab("Levels") +
theme(legend.justification = c(1, 0), legend.position = c(1, 0))
EDIT: In order to change the x-axis to Date format, you can use the Date in your geom_line command and change the x of your df_bg to a Date.
# colour and class labels to be used by geom_ribbon
# One pair of rows (start, end) per run of constant state, with x taken from
# df$Date.
# as.logical(diff(labels)) is TRUE wherever two consecutive labels differ;
# which() returns those positions and drops the NA comparisons.
# x: the Date at row 1, at every change position (twice), and at row
# length(labels).
# NOTE(review): this indexes df by positions in labels, so it presumably
# expects length(labels) == nrow(df) - confirm.
df_bg <- data.frame(x = df[c(1, rep(which(as.logical(diff(labels))), each=2), length(labels)), "Date"],
# ymin / ymax are scalars recycled to every row: the band runs from the
# lowest Close to 10% above the highest Close.
ymin = min(df$Close, na.rm = TRUE),
ymax = 1.1*max(df$Close, na.rm = TRUE),
# fill: the label at each change position (the state of the run ending
# there) plus the final label, repeated for both rows of the pair.
fill = factor(rep(labels[c(which(as.logical(diff(labels))), length(labels) )],
each=2)),
# grp: a distinct id per run (number of changes + 1), repeated for both
# rows of the pair, used as the group aesthetic.
grp = factor(rep(seq(sum(as.logical(diff(labels)), na.rm=TRUE)+1), each=2))
)
#
ggplot(data=df) +
geom_line(aes(x=Date, y=Close)) +
labs(title="IBM 2-State HMM") +
geom_ribbon(data = df_bg,
aes(x = x, ymin=ymin, y=180, ymax=ymax, fill=fill, group=grp), alpha=.2) +
xlab("Date") +
ylab("Levels") +
theme(legend.justification = c(1, 0), legend.position = c(1, 0))

clustering qualitative data in R

I have a data.frame (df) that looks like that:
ZN.N ZL.N
MMP2 (1.89,3.58] (2.13,4.1]
AEBP1 (1.89,3.58] (2.13,4.1]
A1AG1 (1.89,3.58] (2.13,4.1]
A1AT [0.364,1.89] [0.275,2.13]
A2MG [0.364,1.89] [0.275,2.13]
ENOA (1.89,3.58] (2.13,4.1]
And I would like to cluster the row.names (proteins) based on the two variables (ZN.N and ZL.N). Could I use a k.means approach or a hierarchical clustering for this kind of data?
I've tried
df.k2 <- k.means(df, 2)
but it doesn't work. I'm really new to clustering, so apologies if the question is really silly. Thanks a lot.
Here is the dput of my data.frame
structure(list(ZN.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("[0.364,1.89]", "(1.89,3.58]"), class = "factor"),
ZL.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 1L,
3L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 3L, 1L, 3L, 2L,
1L, 1L, 2L, 3L, 1L), .Label = c("[0.275,2.13]", "(2.13,4.1]",
"(4.1,6.78]"), class = "factor")), .Names = c("ZN.N", "ZL.N"), class = "data.frame", row.names = c("MMP2", "AEBP1", "A1AG1", "A1AT", "A2MG", "ENOA", "ANGI", "ANGL2", "ANT3", "APOA1", "APOA2", "APOD", "PGBM", "PGS1", "CAH3", "CRAC1", "CILP1", "CILP2", "COMP", "CH3L1", "CH3L2", "CSPG4", "CCD80", "CO1A1", "CO2A1", "CO3A1", "CO6A1", "COCA1", "COFA1", "COIA1", "CO1A2", "CO6A2", "COBA2", "CO6A3", "C1QB", "C1R", "C1S", "CO3", "CO4B", "CO8A", "CFAB", "CFAH", "CRP", "KCRM", "CLC3A", "ECM1", "FIBA", "FIBB", "FIBG", "FGFP2", "FMOD", "FINC", "FBLN1", "FSTL1", "G3P", "HPT", "HBA", "HBB", "H2B1L", "H32", "H4", "HPLN1", "IGHA1", "IGHG1", "IGKC", "LAC6", "IGHM", "INHBA", "IBP3", "ITIH1", "MMP1", "LDHA", "LYSC", "TIMP1", "TIMP2", "MIME", "MOES", "MYG", "NID2", "NUCB1", "OSTP", "PPIA", "PPIB", "POSTN", "PRDX2", "PGAM1", "PA2GA", "PLTP", "PEDF", "IPSP", "LMNA", "PCOC1", "PRELP", "AMBP", "PDIA3", "PDIA6", "S10AA", "S10A8", "PRG4", "KPYM", "RNAS1", "HTRA1", "TRFE", "ALBU", "SAMP", "SMOC2", "MMP3", "TARSH", "TENA", "TENX", "TETN", "TSP3", "TSP4", "BGH3", "TTHY", "TR11B", "RL40", "CSPG2", "VIME", "VTNC"))
The reason you are having trouble with clustering is that kmeans expects a numeric matrix, but you're providing the function a data frame with factor variables.
You could instead convert those factors to numbers and then run kmeans:
# Seed the RNG so the cluster assignment printed below is reproducible.
set.seed(144)
# Replace each factor column by its integer level code.
df$ZN.N <- as.numeric(df$ZN.N)
df$ZL.N <- as.numeric(df$ZL.N)
# Two-cluster k-means; keep only the per-row cluster assignment.
clusters <- kmeans(df, 2)$cluster
# Names of the rows assigned to cluster 1.
clusters1 <- names(clusters[clusters == 1])
clusters1
# [1] "MMP2" "AEBP1" "A1AG1" "ENOA" "APOA1" "PGS1" "CAH3" "CO1A1" "CO3A1"
# [10] "C1R" "CO8A" "CRP" "KCRM" "FIBB" "FIBG" "HPT" "HBA" "H32"
# [19] "H4" "IGHG1" "IGKC" "INHBA" "MYG" "NID2" "POSTN" "PLTP" "PEDF"
# [28] "LMNA" "PDIA3" "PDIA6" "S10AA" "S10A8" "TENA" "TETN" "TSP3" "BGH3"
# [37] "VIME"
clusters2 <- names(clusters[clusters == 2])
clusters2
# [1] "A1AT" "A2MG" "ANGI" "ANGL2" "ANT3" "APOA2" "APOD" "PGBM" "CRAC1"
# [10] "CILP1" "CILP2" "COMP" "CH3L1" "CH3L2" "CSPG4" "CCD80" "CO2A1" "CO6A1"
# [19] "COCA1" "COFA1" "COIA1" "CO1A2" "CO6A2" "COBA2" "CO6A3" "C1QB" "C1S"
# [28] "CO3" "CO4B" "CFAB" "CFAH" "CLC3A" "ECM1" "FIBA" "FGFP2" "FMOD"
# [37] "FINC" "FBLN1" "FSTL1" "G3P" "HBB" "H2B1L" "HPLN1" "IGHA1" "LAC6"
# [46] "IGHM" "IBP3" "ITIH1" "MMP1" "LDHA" "LYSC" "TIMP1" "TIMP2" "MIME"
# [55] "MOES" "NUCB1" "OSTP" "PPIA" "PPIB" "PRDX2" "PGAM1" "PA2GA" "IPSP"
# [64] "PCOC1" "PRELP" "AMBP" "PRG4" "KPYM" "RNAS1" "HTRA1" "TRFE" "ALBU"
# [73] "SAMP" "SMOC2" "MMP3" "TARSH" "TENX" "TSP4" "TTHY" "TR11B" "RL40"
# [82] "CSPG2" "VTNC"
In this code, ZN.N was converted into the numbers 1 and 2, and ZL.N was converted into the numbers 1, 2, and 3. kmeans then computes the euclidean distance between points for the clustering. You'll have to determine if this makes sense for your application.

Resources