Related
I have this dataset and want to perform a regression analysis on it. I have to predictive variables urban_rural and religious. Now I want to have two specific interaction variables: 1.) Urban/not religious and 2.) Rural/religious. I know that interaction is possible through the sign *, but this does not give me the desired combination of interaction. I guess one has to set the reference variable manually?
structure(list(urban_rural = structure(c(1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Urban", "Rural", "Refugee camp"
), class = "factor"), religious = structure(c(2L, 1L, 2L, 2L,
3L, 2L, 2L, 3L, 1L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 3L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 1L, 3L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 1L, 3L, 1L, 2L, 3L, 2L,
2L, 1L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 3L, 2L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 2L, 3L, 2L, 3L, 1L), .Label = c("Religious", "Somewhat religious",
"Not religious"), class = "factor"), family_role_recoded = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("Agree/strongly agree",
"Disagree/strongly disagree", "Don't know"), class = "factor")), row.names = c(NA,
250L), class = "data.frame")
I used these regression models:
model1 <- glm(family_role_recoded ~ urban_rural,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
model2 <- glm(family_role_recoded ~ religious,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
model3 <- glm(family_role_recoded ~ urban_rural + religious,
family=binomial(link='logit'),
subset = (family_role_recoded != "Don't know" & urban_rural != "Refugee camp"),
data=dataset)
Does anyone have an idea how to solve this problem?
If you set the reference for religious to be "Somewhat religious" first. We can look at the results first :
library(broom)
dataset$religious = relevel(dataset$religious,ref="Somewhat religious")
fit0 = glm(family_role_recoded ~ urban_rural*religious,data=dataset,family=binomial())
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) -0.902 0.181 -4.99 6.03e-7
2 urban_ruralRural -0.484 0.532 -0.910 3.63e-1
3 religiousReligious -0.0141 0.456 -0.0308 9.75e-1
4 religiousNot religious 1.47 0.391 3.76 1.67e-4
5 urban_ruralRural:religiousReligious 0.995 1.14 0.876 3.81e-1
6 urban_ruralRural:religiousNot religio… 0.201 0.993 0.203 8.39e-1
You have one of the terms rural/religious. Intuitively, the Urban/Not religious term would be the flip of urban_ruralRural:religiousNot religio. We can also manually define the interaction terms we need:
dataset$Rural_religious = with(dataset,as.numeric(urban_rural=="Rural" & religious=="Religious"))
dataset$Urban_not_religious = with(dataset,as.numeric(urban_rural=="Urban" & religious=="Not religious"))
fit = glm(family_role_recoded ~ 0+urban_rural+religious+Urban_not_religious+Rural_religious,data=dataset,family=binomial())
tidy(fit)
# A tibble: 6 x 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 urban_ruralUrban -0.902 0.181 -4.99 0.000000603
2 urban_ruralRural -1.39 0.500 -2.77 0.00556
3 religiousReligious -0.0141 0.456 -0.0308 0.975
4 religiousNot religious 1.67 0.913 1.83 0.0667
5 Urban_not_religious -0.201 0.993 -0.203 0.839
6 Rural_religious 0.995 1.14 0.876 0.381
You need to do a post hoc test. For that you can use the R package "emmeans"
I have very little experience with R and am trying to make a stacked barplot using ggplot2.
I have 2 groups - control and experimental, and 2 choices - red and green. I'm not sure how to organise my data.
There were 80 animals in my trial (control n=40, experimental n=40) and they were given the choice of red and green substrate, I noted which substrate they chose, and that's the data I'm trying to plot.
I would essentially want 'Experimental' and 'Control on the x-axis, and the number of choices on the y-axis (e.g. Control, Red n=20, Control, Green = 12 etc).
Any help would be appreciated!
Edited to add:
This is the graph it's outputting
This is the code I'm using (including suggested adjustments):
df <- data.frame(group = rep(c("control", "experimental"), each = 40),
substrate = sample (c("red","green"), 80, TRUE))
ggplot(df, aes(x = group, y = substrate, fill = substrate)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("red", "green"))
This is the output:
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("control", "experimental"
), class = "factor"), substrate = structure(c(1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L), .Label = c("green",
"red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
output from df(behaviour) - original dataframe
structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
Your data:
behaviour=structure(list(group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Control", "Experimental"
), class = "factor"), substrate = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Green",
"Red"), class = "factor")), class = "data.frame", row.names = c(NA,
-80L))
We can tabulate your data:
table(behaviour$group,behaviour$substrate)
Green Red
Control 10 30
Experimental 27 13
So you can only specify fill or y with geom_bar. In your case, you specify the fill, the geom_bar() function will do the counting for you:
ggplot(behaviour,aes(x=group,fill=substrate))+
geom_bar() + scale_fill_manual(values=c("#29c7ac","#c02739"))
You could have your data like this, with one row for each observation (i.e. each animal), with the group and the substrate recorded for each:
df <- data.frame(group = rep(c("control", "experimental"), each = 40),
substrate = rep(c("green", "red", "green", "red"), c(10, 30, 27, 13)))
Now define your plot using ggplot, specifying group as your x axis, and ..count.. as your y axis. Use geom_bar to get the stacked bars you are looking for, and finally use scale_fill_manual to set the colours:
library(ggplot2)
ggplot(df, aes(x = group, y = ..count.., fill = substrate)) +
geom_bar(colour = "black") +
scale_fill_manual(values = c("green", "red"))
I am trying to use standard evaluation with dplyr to calculate percents as a function of two grouping variables. The problem is in my mutate_ statement.
Here is a dataset:
structure(list(
var1 = structure(c(2L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L,
2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L
),
.Label = c("No", "Yes"), class = "factor"),
var2 = structure(c(2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L,
2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L
),
.Label = c("Female", "Male"), class = "factor")),
.Names = c("var1", "var2"), row.names = c(NA, -100L), class = "data.frame")
Here is the code I am working with:
for_plots = function(data, var1, var2){
grouped_data = data %>% group_by_(var1, var2) %>%
summarise_(n_in_group = ~n()) %>%
mutate_(.dots = setNames(list(
interp(quote(n_in_group / sum(n_in_group, na.rm = TRUE) * 100),
n_in_group = as.name(n_in_group)))
))
return(grouped_data)
}
When I run the code, I receive an error:
Error in setNames(list(interp(quote(n_in_group/sum(n_in_group, na.rm = TRUE) * :
argument "nm" is missing, with no default
Any thoughts?
Here is some code based on #Frank's response:
for_plots = function(data, var1, var2) {
grouped_data = data %>% group_by_(var1, var2) %>%
summarise_(n_in_group = ~n()) %>%
mutate(percent = (n_in_group / sum(n_in_group, na.rm = TRUE)) * 100)
return(grouped_data)
}
I'm currently trying to create a clustered bar chart using ggplot2. It's basically just mean response times for a 2x2x2 factorial design. The three factors are load, compatibility and salience. I'm having a hard time jamming the third factor (salience) in there though. It shouldn't be a stacked graph though
This is what I currently have
bar+stat_summary(fun.y = mean, geom = "bar", position = "dodge") +
+ stat_summary(fun.data = mean_cl_normal, geom = "errorbar", position = position_dodge(width = 0.90), width = 0.2)+
+ labs(x = "Compatibility", y = "Mean RT", fill = "Load")
Here's a small sample of the data I'm trying to graph:
ID load comp sal rt
1 1 High Incompatible Non_Salient 787
2 1 Low Compatible Salient 754
3 2 High Incompatible Salient 654
I've seen graphs like these numerous times before but I have no idea how to get ggplot2 to display three independent variables at the same time.
I've tried splitting the graphs by adding
+ facet_wrap( ~ sal)
but that doesn't work either. It just says "Invalid argument to unary operator"
Any help would be appreciated.
Is this the kind of plot you are looking for?
I used the Wii data from the book "Discovering Statistics Using R", which is in a similar format to yours.
structure(list(athlete = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("Athlete", "Non-Athlete"), class = "factor"),
stretch = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = c("No Stretching", "Stretching"
), class = "factor"), wii = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Playing Wii",
"Watching Wii"), class = "factor"), injury = c(2L, 2L, 1L,
2L, 0L, 1L, 2L, 0L, 2L, 2L, 2L, 1L, 4L, 2L, 2L, 0L, 0L, 3L,
3L, 3L, 2L, 1L, 0L, 2L, 2L, 3L, 2L, 2L, 3L, 1L, 2L, 4L, 1L,
2L, 2L, 2L, 1L, 4L, 4L, 1L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L,
2L, 2L, 1L, 0L, 3L, 3L, 2L, 1L, 2L, 4L, 1L, 2L, 5L, 5L, 3L,
6L, 4L, 3L, 4L, 5L, 5L, 2L, 6L, 4L, 4L, 4L, 3L, 4L, 3L, 2L,
1L, 4L, 3L, 2L, 2L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 7L, 8L, 6L,
9L, 4L, 7L, 5L, 9L, 6L, 4L, 8L, 5L, 4L, 7L, 10L, 1L, 3L,
2L, 1L, 3L, 3L, 2L, 3L, 4L, 2L, 0L, 1L, 3L, 2L, 0L)),
.Names = c("athlete", "stretch", "wii", "injury"),
class = "data.frame", row.names = c(NA, -120L))
Here is how to produce the plot.
library(ggplot2)
library(Hmisc)
ggplot(data=Wii, aes(x=stretch, y=injury, fill=wii)) +
facet_wrap(~athlete) +
stat_summary(fun.y = mean, geom = "bar", position = "dodge") +
stat_summary(fun.data = mean_cl_normal, geom = "errorbar", position = position_dodge(width = 0.90), width = 0.2)
I have a data.frame (df) that looks like that:
ZN.N ZL.N
MMP2 (1.89,3.58] (2.13,4.1]
AEBP1 (1.89,3.58] (2.13,4.1]
A1AG1 (1.89,3.58] (2.13,4.1]
A1AT [0.364,1.89] [0.275,2.13]
A2MG [0.364,1.89] [0.275,2.13]
ENOA (1.89,3.58] (2.13,4.1]
And I would like to cluster the row.names (proteins) based on the two variables (ZN.N and ZL.N). Could I use a k.means approach or a hierarchical clustering for this kind of data?
I've tried
df.k2 <- k.means(df, 2)
but it doesn't work. I'm really new on clustering so apologise whether the question is really silly, thanks a lot
Here is the dput of my data.frame
structure(list(ZN.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L,1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("[0.364,1.89]", "(1.89,3.58]"), class = "factor"),
ZL.N = structure(c(2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 2L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 1L,
3L, 3L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 3L, 1L, 1L, 1L, 2L, 2L, 3L, 3L, 1L, 1L,
1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 2L, 3L, 1L, 3L, 2L,
1L, 1L, 2L, 3L, 1L), .Label = c("[0.275,2.13]", "(2.13,4.1]",
"(4.1,6.78]"), class = "factor")), .Names = c("ZN.N", "ZL.N"), class = "data.frame", row.names = c("MMP2", "AEBP1", "A1AG1", "A1AT", "A2MG", "ENOA", "ANGI", "ANGL2", "ANT3", "APOA1", "APOA2", "APOD", "PGBM", "PGS1", "CAH3", "CRAC1", "CILP1", "CILP2", "COMP", "CH3L1", "CH3L2", "CSPG4", "CCD80", "CO1A1", "CO2A1", "CO3A1", "CO6A1", "COCA1", "COFA1", "COIA1", "CO1A2", "CO6A2", "COBA2", "CO6A3", "C1QB", "C1R", "C1S", "CO3", "CO4B", "CO8A", "CFAB", "CFAH", "CRP", "KCRM", "CLC3A", "ECM1", "FIBA", "FIBB", "FIBG", "FGFP2", "FMOD", "FINC", "FBLN1", "FSTL1", "G3P", "HPT", "HBA", "HBB", "H2B1L", "H32", "H4", "HPLN1", "IGHA1", "IGHG1", "IGKC", "LAC6", "IGHM", "INHBA", "IBP3", "ITIH1", "MMP1", "LDHA", "LYSC", "TIMP1", "TIMP2", "MIME", "MOES", "MYG", "NID2", "NUCB1", "OSTP", "PPIA", "PPIB", "POSTN", "PRDX2", "PGAM1", "PA2GA", "PLTP", "PEDF", "IPSP", "LMNA", "PCOC1", "PRELP", "AMBP", "PDIA3", "PDIA6", "S10AA", "S10A8", "PRG4", "KPYM", "RNAS1", "HTRA1", "TRFE", "ALBU", "SAMP", "SMOC2", "MMP3", "TARSH", "TENA", "TENX", "TETN", "TSP3", "TSP4", "BGH3", "TTHY", "TR11B", "RL40", "CSPG2", "VIME", "VTNC"))
The reason you are having trouble with clustering is that kmeans expects a numeric matrix, but you're providing the function a data frame with factor variables.
You could instead convert those factors to numbers and then run kmeans:
set.seed(144)
df$ZN.N <- as.numeric(df$ZN.N)
df$ZL.N <- as.numeric(df$ZL.N)
clusters <- kmeans(df, 2)$cluster
clusters1 <- names(clusters[clusters == 1])
clusters1
# [1] "MMP2" "AEBP1" "A1AG1" "ENOA" "APOA1" "PGS1" "CAH3" "CO1A1" "CO3A1"
# [10] "C1R" "CO8A" "CRP" "KCRM" "FIBB" "FIBG" "HPT" "HBA" "H32"
# [19] "H4" "IGHG1" "IGKC" "INHBA" "MYG" "NID2" "POSTN" "PLTP" "PEDF"
# [28] "LMNA" "PDIA3" "PDIA6" "S10AA" "S10A8" "TENA" "TETN" "TSP3" "BGH3"
# [37] "VIME"
clusters2 <- names(clusters[clusters == 2])
clusters2
# [1] "A1AT" "A2MG" "ANGI" "ANGL2" "ANT3" "APOA2" "APOD" "PGBM" "CRAC1"
# [10] "CILP1" "CILP2" "COMP" "CH3L1" "CH3L2" "CSPG4" "CCD80" "CO2A1" "CO6A1"
# [19] "COCA1" "COFA1" "COIA1" "CO1A2" "CO6A2" "COBA2" "CO6A3" "C1QB" "C1S"
# [28] "CO3" "CO4B" "CFAB" "CFAH" "CLC3A" "ECM1" "FIBA" "FGFP2" "FMOD"
# [37] "FINC" "FBLN1" "FSTL1" "G3P" "HBB" "H2B1L" "HPLN1" "IGHA1" "LAC6"
# [46] "IGHM" "IBP3" "ITIH1" "MMP1" "LDHA" "LYSC" "TIMP1" "TIMP2" "MIME"
# [55] "MOES" "NUCB1" "OSTP" "PPIA" "PPIB" "PRDX2" "PGAM1" "PA2GA" "IPSP"
# [64] "PCOC1" "PRELP" "AMBP" "PRG4" "KPYM" "RNAS1" "HTRA1" "TRFE" "ALBU"
# [73] "SAMP" "SMOC2" "MMP3" "TARSH" "TENX" "TSP4" "TTHY" "TR11B" "RL40"
# [82] "CSPG2" "VTNC"
In this code, ZN.N was converted into the numbers 1 and 2, and ZL.N was converted into the numbers 1, 2, and 3. kmeans then computes the euclidean distance between points for the clustering. You'll have to determine if this makes sense for your application.