I'm having trouble writing a for loop in which I ask glmulti to find the best model for each dependent variable. I have the following example data set:
dput(Data)
structure(list(Studbook.ID = structure(c(16L, 16L, 16L, 16L,
16L, 16L, 16L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 6L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 30L, 30L, 30L), .Label = c("230", "298",
"308", "329", "357", "358", "374", "382", "385", "394", "397",
"399", "404", "413", "414", "418", "432", "433", "434", "437",
"439", "444", "446", "455", "458", "460", "473", "475", "476",
"477", "478", "492", "495", "496", "499", "503"), class = "factor"),
Season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Breeding", "Nonbreeding"), class = "factor"),
Year = c(1999L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L, 2000L,
2000L, 2000L, 2000L, 2000L, 2000L, 2000L), Age.Class = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("Adult",
"Sub-Adult"), class = "factor"), Sex = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Female", "Male"
), class = "factor"), Captive_Wild = structure(c(2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Captive", "Wild"
), class = "factor"), C.SA.F = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), C.HA.F = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), W.MW.F = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L
), W.MW.DUR = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
C.CHEW.F = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L)), .Names =c("Studbook.ID",
"Season", "Year", "Age.Class", "Sex", "Captive_Wild", "C.SA.F",
"C.HA.F", "W.MW.F", "W.MW.DUR", "C.CHEW.F"), row.names = c(NA,
40L), class = "data.frame")
The code for my whole loop is:
# Fitting function handed to glmulti: glues the random-effects term onto the
# fixed-effects formula and fits the result with lmer.
#   formula - fixed-effects formula
#   data    - data frame passed on to lmer
#   family  - accepted (default binomial) but not used in the lmer call
#   random  - random-effects term as text, e.g. "+(1|Studbook.ID)"
#   ...     - further arguments forwarded to lmer
lmer.glmulti <- function(formula, data, family = binomial, random, ...) {
  model_text <- paste(deparse(formula), random)
  lmer(model_text, data = data, ...)
}
# Dependent variables for the loop: every column of Bamboo after the first six
dep_list <- colnames(Bamboo)
dep_list <- dep_list[-c(1:6)]
outglm <- c()
outdesc <- c()

# Run the glmulti model search once per dependent variable
for (depend in dep_list) {
  # Copy the current response column into a free-standing vector named y
  y <- Bamboo[, depend]

  # Full model handed to glmulti (y stands in for the behavioural variable).
  # NOTE(review): the traceback shows glmulti re-invoking itself with
  # y = "y", so naming the response `y` presumably collides with glmulti's
  # own `y` argument and triggers the "variable lengths differ" error -
  # confirm against the glmulti source.
  glmmod <- y ~ Captive_Wild + Sex + Age.Class + Season
  glm.glmulti <- glmulti(
    glmmod,
    random = "+(1|Studbook.ID)",
    data = Bamboo,
    fitfunc = lmer.glmulti,
    family = binomial,
    level = 2
  )

  # Make and print the table for the final best model. The fitted models are
  # read from the `objects` slot with `@`; a `#` here would comment out the
  # remainder of the call and leave the parenthesis unclosed.
  htmlreg(
    glm.glmulti@objects[[1]],
    file = paste0(depend, ".doc"),
    caption = depend,
    caption.above = TRUE
  )
}
It's hanging up on the glmulti code where it gives me this error:
Error in model.frame.default(as.formula(paste(y, "~", paste(x, sep = "", :
variable lengths differ (found for 'Captive_Wild')
And traceback looks like:
8 model.frame.default(as.formula(paste(y, "~", paste(x, sep = "",
collapse = "+"), sep = "")), data = data)
7 model.frame(as.formula(paste(y, "~", paste(x, sep = "", collapse = "+"),
sep = "")), data = data)
6 glmulti(y = "y", data = Bamboo, level = 2, fitfunction = lmer.glmulti,
random = "+(1|Studbook.ID)", xr = c("Sex", "Season"), exclude = 1)
5 glmulti(y = "y", data = Bamboo, level = 2, fitfunction = lmer.glmulti,
random = "+(1|Studbook.ID)", xr = c("Sex", "Season"), exclude = 1)
4 eval(expr, envir, enclos)
3 eval(call)
2 glmulti(y ~ Sex + Season, random = "+(1|Studbook.ID)", data = Bamboo,
fitfunc = lmer.glmulti, level = 2)
1 glmulti(y ~ Sex + Season, random = "+(1|Studbook.ID)", data = Bamboo,
fitfunc = lmer.glmulti, level = 2)
When I run the variables through by hand one by one, glmulti works just fine. When I remove Captive_Wild (which of course I don't want to do), it gives me the same error with Sex, and likewise with Season. I've checked all the variable lengths and they are the same.
This implies to me that glmulti is having a problem with the for loop somewhere but I'm not sure where. Can anyone suggest fixes? This is my first attempt at for loops so any and all help would be much appreciated!
After a little more fooling around I found two problems in the code:
1) The lmer wrapper is outdated, so the wrapper needs to call glmer instead:
# Fitting function for glmulti that fits a generalized linear mixed model.
#   formula - fixed-effects formula
#   data    - data frame passed on to glmer
#   family  - error family, forwarded to glmer (default binomial)
#   random  - random-effects term as text, e.g. "+(1|Studbook.ID)"
#   ...     - further arguments forwarded to glmer
# Returns whatever glmer returns.
glmer.glmulti <- function(formula, data, family = binomial, random, ...) {
  # deparse() splits long formulas into several strings; collapse them first
  # so that exactly one model string is built.
  fixed_part <- paste(deparse(formula), collapse = " ")
  # `family` is a named argument of this wrapper, so it is not part of `...`
  # and has to be handed to glmer explicitly.
  glmer(paste(fixed_part, random), data = data, family = family, ...)
}
2) The for loop needs to use the alternate form of calling glmulti (the response name plus a vector of predictor names), like so:
# For each dependent variable, search the candidate models and export a table
# of the best one.
for (depend in dep_list) {
  # Alternate calling form: response name as a string plus a character vector
  # of predictor names. The fitting function is the glmer-based wrapper
  # defined above.
  glm.glmulti <- glmulti(
    depend,
    c("Captive_Wild", "Sex", "Age.Class", "Season"),
    random = "+(1|Studbook.ID)",
    data = Bamboo,
    fitfunc = glmer.glmulti,
    family = binomial,
    level = 2
  )

  # Make and print the table for the final best model; the fitted models are
  # read from the `objects` slot with `@`.
  htmlreg(
    glm.glmulti@objects[[1]],
    file = paste0(depend, ".doc"),
    caption = depend,
    caption.above = TRUE
  )
}
Related
I am using distributed lag non-linear models (DLNM). I ran a glm model with a cross-basis matrix from the dlnm package. When I tried to get the predictions, I got this error:
Error in crosspred(cbpm1, Tp1, by = 1, bylag = 1, at = speimin:speimax) :
coef/vcov not consistent with basis matrix. See help(crosspred).
This happened when I tried lag 1,2, and 3, but there was no error when I tried lag 0, 4, and 5. I read about a similar question from this link. But still, I cannot figure it out with my own code. Your help is really meaningful for me. Thanks.
The code is:
# Response series
Dis <- ss$dis1

# Knots for the temperature (T) cross-basis
vkt <- equalknots(ss$T, nk = 2)
lkt <- logknots(1, nk = 2)

# Knots for the SPEI cross-basis, plus the observed SPEI range
vkpm <- equalknots(ss$spei3, nk = 2)
lkpm <- logknots(1, nk = 2)
speimin <- min(ss$spei3, na.rm = TRUE)
speimax <- max(ss$spei3, na.rm = TRUE)

# Cross-basis matrices with lag 1 and quadratic B-splines in the variable
# dimension
cbt1 <- crossbasis(
  ss$T,
  lag = 1,
  argvar = list(fun = "bs", degree = 2, knots = vkt),
  arglag = list(knots = lkt)
)
cbpm1 <- crossbasis(
  ss$spei3,
  lag = 1,
  argvar = list(fun = "bs", degree = 2, knots = vkpm),
  arglag = list(knots = lkpm)
)

# Poisson regression on both cross-bases plus spline terms for RH and time
Tp1 <- glm(
  Dis ~ cbt1 + cbpm1 + ns(RH, 3) + ns(timeseries, 2 * 5),
  family = poisson(link = log),
  data = ss
)

# Predictions for the SPEI cross-basis over the observed SPEI range
at <- speimin:speimax
predsltp1 <- crosspred(cbpm1, Tp1, by = 1, bylag = 1, at = speimin:speimax)
Here are the libraries used:
library(splines);library(class);library(stats);library(mda)
library(akima);library(gam);library(mgcv);library(foreign);library(som)
library(dlnm) #equalknots logknots crossbasis
library(splines) #ns
library(magrittr)
Here is the reproducible sample of my dataset:
a<-structure(list(job = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "all", class = "factor"),
age3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "00_05", class = "factor"),
sexA = structure(c(1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 2L, 1L), .Label = c("F", "M"), class = "factor"),
All = c(65L, 53L, 92L, 68L, 81L, 103L, 144L, 92L, 44L, 40L,
54L, 19L, 55L, 61L, 72L, 89L, 77L, 68L, 71L, 27L, 15L, 18L,
39L, 52L, 52L, 58L, 27L, 44L, 32L, 37L), dis1 = c(6L, 0L,
9L, 0L, 0L, 0L, 9L, 0L, 3L, 6L, 3L, 0L, 0L, 3L, 6L, 0L, 9L,
3L, 0L, 3L, 6L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L), dis2 = c(3L,
6L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 3L, 0L, 0L, 6L, 6L,
0L, 0L, 0L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
T = c(20.39032258, 20.39032258, 19.78387097, 19.78387097,
19.64193548, 19.64193548, 18.78709677, 18.78709677, 19.17419355,
19.17419355, 20.46774194, 21.63870968, 21.85806452, 21.85806452,
19.73448276, 19.73448276, 20.55357143, 20.55357143, 19.925,
29.12580645, 29.12580645, 29.39354839, 29.39354839, 28.96129032,
28.96129032, 27.36666667, 27.40333333, 27.40333333, 27.82333333,
27.82333333), RH = c(70.09677419, 70.09677419, 70.03225806,
70.03225806, 70.35483871, 70.35483871, 72.32258065, 72.32258065,
69.80645161, 69.80645161, 74.58064516, 77.58064516, 71.32258065,
71.32258065, 75.82758621, 75.82758621, 62.28571429, 62.28571429,
72.60714286, 77.61290323, 77.61290323, 75.06451613, 75.06451613,
75.61290323, 75.61290323, 76.03333333, 76.23333333, 76.23333333,
75.03333333, 75.03333333), PP = c(11.5, 11.5, 44.5, 44.5,
25.9, 25.9, 14, 14, 5, 5, 35.7, 34.1, 30.8, 30.8, 44.4, 44.4,
15.6, 15.6, 40.7, 184, 184, 137.1, 137.1, 377, 377, 110.5,
129.8, 129.8, 292, 292), spei3 = c(0.447495072, 0.447495072,
1.537295165, 1.537295165, 1.285067571, 1.285067571, 0.441010834,
0.441010834, 1.505630159, 1.505630159, 1.725831329, 1.075029338,
-1.227673724, -1.227673724, 0.329690702, 0.329690702, 0.724314874,
0.724314874, 1.228544608, 0.60782059, 0.60782059, 0.191804009,
0.191804009, 1.752145476, 1.752145476, 1.94554333, 1.139058482,
1.139058482, -0.554472376, -0.554472376), timeseries = 1:30), class = "data.frame", row.names = c(NA,
-30L))
When I run this code:
# Read a delimited text file chosen interactively through file.choose()
a<- read.delim(file.choose("data.txt"))
# Sort the values of column d
# NOTE(review): if the file has no column named d, a$d is NULL and the sorted
# result is empty, which would explain the ecdf() error reported below -
# confirm the column name in the file
d<-sort(a$d)
# Step plot (dashed, red) of the empirical CDF evaluated at the sorted values
plot(d, sort(ecdf(d)(d)),type="s", lty=2,col="red", ylab= "P(X<=x)",ylim= 0:1)
it gives me this error:
Error in ecdf(d) : 'x' must have 1 or more non-missing values
help?
I ran your code and it seems to be all right. I only changed the second line, because the only column in the data you provided is named x, not d.
Check it out:
# load data
a = structure(list(x = c(4L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 4L, 1L, 2L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 3L, 0L, 5L, 2L, 2L, 1L, 0L, 0L, 2L, 0L, 0L,
0L, 1L, 3L, 3L, 0L, 0L, 0L, 2L, 0L, 2L, 1L, 1L, 4L, 4L,
0L, 1L, 3L, 1L, 0L, 2L, 1L, 2L, 0L, 0L, 0L, 1L, 0L, 1L,
6L, 0L, 2L, 2L, 0L, 1L, 1L, 2L, 1L, 0L, 1L, 0L, 3L, 0L,
3L, 0L, 4L, 3L, 2L, 2L, 2L, 1L, 3L, 0L, 3L, 2L, 0L, 1L,
2L, 1L)), class = "data.frame", row.names = c(NA, -100L))
# order the values of x, the only column of a
d <- sort(a[["x"]])
# draw the empirical CDF at the ordered values as a dashed red step line
plot(
  d, sort(ecdf(d)(d)),
  type = "s",
  lty = 2,
  col = "red",
  ylab = "P(X<=x)",
  ylim = 0:1
)
Output:
I have a data frame like this
structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
I want to use ggplot2 to express the relation between these three variables and tried the simple point plot
# Scatterplot of vcs_exp against web_exp, with points coloured by cli_exp
ggplot(data, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
  geom_point()
But apparently, there are many overlapping data points, which are not suitable for point display. Are there any better ways?
I would use ggpairs from GGally package
tmp_df <- structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
library(GGally)
# Pairwise plot matrix of the columns of tmp_df: the upper panels use the
# "cor" display at text size 10, the lower panels use the "smooth" display
ggpairs(tmp_df,
upper = list(continuous = wrap("cor", size = 10)),
lower = list(continuous = "smooth"))
Edit: use pairs from base R
# Base-graphics scatterplot matrix of every pair of columns in tmp_df
pairs(tmp_df)
Use pairs.panels from psych package
library(psych)
# Scatterplot matrix from psych: Pearson correlations, with density curves and
# correlation ellipses switched on
pairs.panels(tmp_df,
method = "pearson",
density = TRUE,
ellipses = TRUE
)
As you mentioned, the points overlap, so some points aren't visible when using geom_point.
# Plain point plot: identical (web_exp, vcs_exp) pairs are drawn on top of
# each other
# NOTE(review): `df` is presumably the data frame from the question - confirm
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_point()
This can be solved by adding a small amount of jitter. Also, making the points slightly transparent will make any overlaps more clear.
# Same mapping, but each point is jittered (width and height arguments of
# 0.05) and drawn with alpha = 0.8 so overlapping points stay distinguishable
# NOTE(review): `df` is presumably the data frame from the question - confirm
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_jitter(width = 0.05, height = 0.05, alpha = 0.8)
Below I have data with 3 columns: a group field, an open/closed field for the store, and the rolling 3-month sum of opens for the store. I have also included the desired solution output.
My dataset can be thought of as an employee's availability. You can assume each row is a different time period (hour, day, month, year, whatever). The open/closed column records whether or not the employee was present. The 3-month rolling column is a sum over the previous rows.
What I want to identify is the non-zero values in this rolling sum column following a gap of at least 3 zero rows for that particular group. While not present in this dataset, you can assume that there might be more than one 'gap' of zeros present.
structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), X0_closed_1_open = c(0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), X3month_roll_open = c(0L,
0L, 1L, 2L, 2L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), desired_solution = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("no", "yes"), class ="factor")), .Names = c("Group", "X0_closed_1_open", "X3month_roll_open", "desired_solution"), class = "data.frame", row.names = c(NA,
-26L))
One option is:
# For each Group, add a logical column Flag that is TRUE for non-zero rolling
# sums located after a run of at least three consecutive zero rows. The
# per-group pieces are reassembled in the original row order by unsplit().
res <- unsplit(
  lapply(split(df1, df1$Group), function(x) {
    # Rolling sum, addressed by name rather than by column position
    roll <- x[["X3month_roll_open"]]
    # Runs of consecutive zero (TRUE) and non-zero (FALSE) rolling sums
    runs <- rle(roll == 0)
    # Keep TRUE only for zero-runs that span three or more rows
    long_gap <- runs$values & runs$lengths >= 3
    # Expand back to one value per row: is this row inside a long gap?
    in_gap <- rep(long_gap, runs$lengths)
    # A step from TRUE to FALSE marks the first row after a long gap, so the
    # running count is positive from that row to the end of the group.
    # Indexing by seq_along(roll) keeps the length right for zero-row groups.
    gaps_ended <- cumsum(c(0, diff(in_gap) < 0))[seq_along(roll)]
    x$Flag <- gaps_ended != 0 & roll != 0
    x
  }),
  df1$Group
)
NOTE: Instead of 'yes/no', it may be better to have 'TRUE/FALSE' for easing subsetting.
# Translate Flag (FALSE/TRUE) into 'no'/'yes' by indexing with Flag + 1 and
# compare the result with the desired_solution column
identical(c('no', 'yes')[res$Flag+1L], as.character(res$desired_solution))
#[1] TRUE
I am trying to calculate the proportion of correct responses for each participant as a function of three factors (group, sound and language). My data frame looks like this:
participant group sound lang resp
advf03 adv a in 1
advf03 adv a sp 0
advf03 adv a in 1
advf03 adv a sp 0
advf03 adv a in 0
advf03 adv a sp 1
advf03 adv a sp 0
advf03 adv a in 1
advf03 adv a in 0
advf03 adv a in 1
begf03 beg a in 1
begf03 beg a in 1
begf03 beg a sp 0
"Group" has 3 levels: adv, int, and beg. "Sound" has 3 levels: a, e, i. "Lang" has 2 levels: in, sp. A "1" implies a correct response and a "0" implies an incorrect response. I would like to have a proportion (i.e. percent correct) of the "1"'s for each participant as a new column in a new data frame. An example of the type of information I would like to have: Participant advf03 got 53% correct for "a" in "sp".
Here are 50 observations from my data:
structure(list(sound = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("a",
"e", "i"), class = "factor"), resp = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), participant = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("2advf03", "2advf05", "2advm04", "2advm06", "2begf01",
"2begf02", "2begf04", "2begf05", "2begm03", "2advf01", "2intf01",
"2intf03", "2intf04", "2intf06", "2advm05"), class = "factor"),
group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("adv",
"beg", "int"), class = "factor"), lang = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("in", "sp"), class = "factor")), .Names = c("sound",
"resp", "participant", "group", "lang"), row.names = c(10L, 31L,
36L, 43L, 47L, 49L, 52L, 59L, 61L, 65L, 66L, 68L, 71L, 79L, 97L,
99L, 106L, 125L, 133L, 138L, 147L, 149L, 162L, 165L, 174L, 175L,
33L, 37L, 112L, 136L, 154L, 186L, 11L, 50L, 89L, 92L, 104L, 105L,
123L, 126L, 129L, 143L, 153L, 173L, 177L, 187L, 188L, 191L, 7L,
12L), class = "data.frame")
This is what I have so far:
# get counts of subsets of factors
# (table() is applied to the whole data frame and the result is converted
# back to a data frame; the counts are then read from its Freq column)
df <- as.data.frame(table(df))
# new column that gives the proportion of responses
# NOTE(review): 32 is a fixed denominator - presumably the number of trials
# per cell; confirm that every combination really has 32 observations
df$prop <- df$Freq / 32
But this does not seem to give me the correct proportions. I know that I need to reduce the data so that I don't have so many observations (i.e. 1 value for each sound for each language for each participant), but I don't know the correct steps to do that.
If I understand your question correctly, you would like to know the proportion of 1s by participant, sound, and language.
Because the proportion of 1s in a vector with only 0s and 1s is just the mean, this should work:
# Mean of resp (the proportion of 1s) for every combination of participant,
# group, sound and language; `sound` is part of the grouping so that each
# sound gets its own proportion.
aggregate(resp ~ participant + group + sound + lang, data = df, FUN = mean)
# The output of that with your 50 observations is:
# participant group sound lang resp
# 1 2advf03 adv a in 0.1875000
# 2 2advf03 adv a sp 0.1111111