Related
I'm trying to calculate mean and SD and then perform t.tests on three different measurements (height, weight, speed) between multiple subgroups.
I started with a simple dataset that only contains two groups (control vs drug) and I have it all working well enough.
simple.df<-
structure(list(trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), levels = c("control", "drug"), class = "factor"), height = c(15,
17, 25, 21, 11, 29, 18, 20), weight = c(80, 90, 81, 79, 200,
230, 215, 210), speed = c(50, 45, 60, 51, 52, 80, 41, 19)), class = "data.frame", row.names = c(NA,
-8L))
library(rstatix)
simple.df %>% group_by(trial) %>% get_summary_stats(type = "mean_sd")
testing<- data.frame(lapply(simple.df[-1], function(x) t.test(x~simple.df$trial)$p.value))
testing
Where I'm running into trouble is with the t.testing on a larger experiment similar to the dataframe below. I still have control vs drug and height, weight & speed, but now all the measurements were done at two timepoints in both males and females. I'm only concerned with comparing control versus drug for the same sex/age. I'm still ok calculating the mean and SD for each group, but have gotten stuck with figuring out the t-testing.
Specifically, I just want the t-test on each of the three measurements for drug vs control in young males, drug vs control in old males, drug vs control in young females and drug vs control in old females, so 12 p-values total with some identification for what comparison each value represents.
Thanks for your help and expertise!
big.df<- structure(list(age = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("old", "young"
), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("f", "m"), class = "factor"),
trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("control", "drug"
), class = "factor"), height = c(15L, 17L, 25L, 21L, 11L,
29L, 18L, 20L, 300L, 320L, 316L, 325L, 170L, 175L, 172L,
180L, 28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L, 250L, 260L,
240L, 248L, 11L, 19L, 16L, 4L), weight = c(80L, 90L, 81L,
79L, 200L, 230L, 215L, 210L, 152L, 150L, 148L, 155L, 160L,
158L, 157L, 140L, 176L, 164L, 135L, 196L, 175L, 178L, 120L,
147L, 160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L), speed = c(50L,
45L, 60L, 51L, 52L, 80L, 41L, 19L, 55L, 56L, 61L, 67L, 85L,
90L, 100L, 77L, 90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L, 50L,
60L, 55L, 59L, 71L, 65L, 66L, 62L)), row.names = c(NA, -32L
), class = "data.frame")
big.df %>% group_by (sex, age, trial) %>%
get_summary_stats (type = "mean_sd") %>%
arrange (variable, sex, age, trial)
RYann had a good idea by defining a function to pull out subgroups and then doing all the t-tests on each subgroup. That approach was helpful.
I ended up building on his strategy and simplifing things a bit more by vectorizing the t-tests inside the function using lapply. I then stored each of the age/sex combinations in a dataframe and used mapply to pass those combinations to the t-testing function.
group<-big.df %>% filter(age == a_age & sex == a_sex)
data.frame(lapply(group[4:6], function(x) t.test(x~group$trial)$p.value))
}
combos <- data.frame(age = c("young","young","old","old"),
sex = c("m","f","m","f"))
t.test.df <- data.frame(mapply(t.script, a_age = combos$age, a_sex = combos$sex))
colnames(t.test.df) <- paste(combos$age, combos$sex, sep = " ")
young m
young f
old m
old f
height
1
1.939896e-05
0.01175771
1.630232e-08
weight
4.435875e-05
0.6368126
0.5196617
0.1299121
speed
0.80433
0.004320253
0.1526353
0.01539331
I hope this code will work out for you
big.df<- structure(list(age = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("old", "young"
), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("f", "m"), class = "factor"),
trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("control", "drug"
), class = "factor"), height = c(15L, 17L, 25L, 21L, 11L,
29L, 18L, 20L, 300L, 320L, 316L, 325L, 170L, 175L, 172L,
180L, 28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L, 250L, 260L,
240L, 248L, 11L, 19L, 16L, 4L), weight = c(80L, 90L, 81L,
79L, 200L, 230L, 215L, 210L, 152L, 150L, 148L, 155L, 160L,
158L, 157L, 140L, 176L, 164L, 135L, 196L, 175L, 178L, 120L,
147L, 160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L), speed = c(50L,
45L, 60L, 51L, 52L, 80L, 41L, 19L, 55L, 56L, 61L, 67L, 85L,
90L, 100L, 77L, 90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L, 50L,
60L, 55L, 59L, 71L, 65L, 66L, 62L)), row.names = c(NA, -32L
), class = "data.frame")
# A function to extract the 3 comparrisons
multi_t <- function(a_sex,a_age){
df_func <- big.df %>% filter(sex==a_sex,age==a_age)
h <- t.test(height~trial,df_func)$p.value
w <- t.test(weight~trial,df_func)$p.value
s <- t.test(speed~trial,df_func)$p.value
# cat(
# "sex =",a_sex,"\nage =",a_age,"\n\n"
# )
return(cbind(height=h,weight=w,speed=s))
}
# Table in a long version
ptable <- data.frame(
multi_t("m","young"),
multi_t("m","old"),
multi_t("f","young"),
multi_t("f","old")
) %>% pivot_longer(cols=everything(),
names_to = "value",
values_to = "p.values") %>%
mutate(comparison = rep(c("young males","old males",
"young females","old females"),each=3),
value=str_remove_all(value,"\\.\\d"))
ptable
# Table in a wider version
ptable %>% group_by(value) %>% mutate(id=row_number()) %>%
pivot_wider(names_from = value,values_from = p.values) %>%
select(-id)
ptable %>%
mutate(sig=p.values<0.05) %>%
ggplot(aes(x=value,y=p.values,color=sig))+
geom_point(show.legend = T)+facet_wrap(~comparison,scales="free")+
theme(legend.position = "bottom")+
labs(title="P values of 3 different measurements",
subtitle = "For 4 different populations")
I've got the table of 55 observations with 5 variables (F,H,R,T,U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into the training set (70%) and test set (30%). Then I run adaboosting and check how it works.
I want to get the adaboost error distribution for 100 samplings. But the distribution occurs to be discrete, outputting only five value variants: 0, 0.0588235294117647, 0.117647058823529 0.176470588235294 and 0.235294117647059.It doesn't change with mfinal argument. I guess there should be more! How it works?
I use the folowing code:
predictions<-list()
for (i in 1:100){
train.ind<-sample(nrow(df), nrow(df) * 0.7)
assign(paste0("ada",i), do.call(boosting,
c(formula=Group~F + H + R + T + U,
data=substitute(df[train.ind,]), mfinal=50, boos=FALSE,
coeflearn='Breiman'),envir = parent.frame()))
assign(paste0("pred",i), predict(ada,df[-train.ind,]))
predictions[[i]]<-get(paste0("pred",i))$error
}
hist(100*unlist(predictions),breaks=10,
main="Error probability [%] ntrees=10. 100 sampling operations", xlab="AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")
I am generating multinom models using nnet, with a model fitted for each city in the dataset. When I attempt to use tidy with these models, I get the following error:
Error in probs[i, -1, drop = FALSE] : subscript out of bounds
However, if I produce a model for each City separately, and then use tidy I do not receive an error for any of the models. I am also able to use glace without an error.
What might be causing this error?
library(broom)
library(dplyr)
library(nnet)
dfstack <- structure(list(Var1 = c(73L, 71L, 66L, 75L, 96L, 98L, 98L, 65L,
75L, 74L, 71L, 98L, 100L, 87L, 78L, 50L, 73L, 82L, 70L, 70L,
31L, 34L, 32L, 100L, 100L, 100L, 54L, 51L, 36L, 48L, 66L, 60L,
59L, 72L, 76L, 90L, 85L, 76L, 55L, 53L, 42L, 54L, 54L, 10L, 34L,
18L, 6L, 16L, 63L, 41L, 68L, 55L, 52L, 57L, 64L, 61L, 68L, 44L,
33L, 19L, 38L, 54L, 44L, 87L, 100L, 100L, 63L, 75L, 76L, 100L,
100L, 64L, 95L, 90L, 99L, 98L, 87L, 62L, 62L, 88L, 79L, 85L),
Status = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), City = structure(c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Denver", "Miami", "NYC"), class = "factor"),
ID = structure(c(52L, 63L, 74L, 77L, 78L, 79L, 80L, 81L,
82L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 64L,
31L, 42L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 32L, 1L, 12L,
23L, 25L, 26L, 27L, 28L, 29L, 30L, 2L, 3L, 4L, 5L, 65L, 66L,
67L, 68L, 69L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 6L,
7L, 8L, 9L, 10L, 11L, 13L, 70L, 71L, 72L, 73L, 75L, 76L,
41L, 43L, 44L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
24L), .Label = c("Denver1", "Denver10", "Denver11", "Denver12",
"Denver13", "Denver14", "Denver15", "Denver16", "Denver17",
"Denver18", "Denver19", "Denver2", "Denver20", "Denver21",
"Denver22", "Denver23", "Denver24", "Denver25", "Denver26",
"Denver27", "Denver28", "Denver29", "Denver3", "Denver30",
"Denver4", "Denver5", "Denver6", "Denver7", "Denver8", "Denver9",
"Miami1", "Miami10", "Miami11", "Miami12", "Miami13", "Miami14",
"Miami15", "Miami16", "Miami17", "Miami18", "Miami19", "Miami2",
"Miami20", "Miami21", "Miami3", "Miami4", "Miami5", "Miami6",
"Miami7", "Miami8", "Miami9", "NYC1", "NYC10", "NYC11", "NYC12",
"NYC13", "NYC14", "NYC15", "NYC16", "NYC17", "NYC18", "NYC19",
"NYC2", "NYC20", "NYC21", "NYC22", "NYC23", "NYC24", "NYC25",
"NYC26", "NYC27", "NYC28", "NYC29", "NYC3", "NYC30", "NYC31",
"NYC4", "NYC5", "NYC6", "NYC7", "NYC8", "NYC9"), class = "factor")), class = "data.frame", row.names = c(NA, -82L), .Names = c("Var1", "Status", "City", "ID"))
Model.List <- dfstack %>% group_by(City) %>% do(modfits = multinom(Status~Var1, data=.))
tidy(Model.List, modfits) # produces error
glance(Model.List, modfits) # no error
# no error when each city on its own
df1 <- dfstack %>% filter(City == "NYC") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df1, modfit1)
df2 <- dfstack %>% filter(City == "Miami") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df2, modfit1)
df3 <- dfstack %>% filter(City == "Denver") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df3, modfit1)
Don't ask me to explain why, but I figured it out.
tidy.multinom calls summary.multinom which calls vcov.multinom which calls multinomHess. The error was being generated down in multinomHess, which is only run when the Hessian matrix is not generated in the original call to multinom. That is to say, you don't necessarily need to spend the time calculating the Hessian matrix if you don't intend to use the summary object.
For some reason, when the multinom objects are formed within the do call, summary.multinom is unable to calculate the Hessian matrix. This can be circumvented by calling multinom with Hess = TRUE. See below:
Model.List <-
dfstack %>%
group_by(City) %>%
do(modfits = multinom(Status~Var1,
data=.,
Hess = TRUE))
tidy(Model.List, modfits)
glance(Model.List, modfits)
In your original code, glance did not cast an error because glance.multinom does not rely on summary.multinom.
I have a data frame such as this:
structure(list(x = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L,
13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L,
19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L,
26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 32L,
32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L, 37L, 37L, 38L, 38L,
39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 45L,
45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L, 51L, 51L,
52L, 52L, 53L, 53L, 54L, 54L, 55L, 55L, 56L, 56L, 57L, 57L, 58L,
58L, 59L, 59L, 60L, 60L, 61L, 61L, 62L, 62L, 63L, 63L, 64L, 64L,
65L, 65L, 66L, 66L, 67L, 67L, 68L, 68L, 69L, 69L, 70L, 70L, 71L,
71L, 72L, 72L, 73L, 73L, 74L, 74L, 75L, 75L, 76L, 76L, 77L, 77L,
78L, 78L, 79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L,
84L, 85L, 85L, 86L, 86L, 87L, 87L, 88L, 88L, 89L, 89L, 90L, 90L,
91L, 91L, 92L, 92L, 93L, 93L, 94L, 94L, 95L, 95L, 96L, 96L, 97L,
97L, 98L, 98L, 99L, 99L, 100L, 100L), y = structure(c(1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L), .Label = c("one", "two"), class = "factor")), class = "data.frame", row.names = c(NA,
-200L), .Names = c("x", "y"))
I am trying to filter groups of x that have two distinct y values using:
library(dplyr)
df %>% group_by(x) %>% filter(n_distinct(y) > 1)
On a large data set, this almost never finishes.
Changing to this works reasonably fast for the full data set:
library(dplyr)
df %>% group_by(x) %>% filter(length(unique(y)) > 1)
Any idea why n_distinct() is super slow to never finishing?
I am trying to run a multistate model using the package msm and I am encountering the following error:
*** caught segfault ***
address 0x607c00032c60, cause 'memory not mapped'
The data
dat.long <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L,
4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L,
9L, 10L, 10L, 10L, 11L, 11L, 11L, 12L, 12L, 12L, 13L, 13L, 13L,
14L, 14L, 14L, 15L, 15L, 15L, 16L, 16L, 16L, 17L, 17L, 17L, 18L,
18L, 18L, 19L, 19L, 19L, 20L, 20L, 20L, 21L, 21L, 21L, 22L, 22L,
22L, 23L, 23L, 23L, 24L, 24L, 24L, 25L, 25L, 25L, 26L, 26L, 26L,
27L, 27L, 27L, 28L, 28L, 28L, 29L, 29L, 29L, 30L, 30L, 30L, 31L,
31L, 31L, 32L, 32L, 32L, 33L, 33L, 33L, 34L, 34L, 34L, 35L, 35L,
35L, 36L, 36L, 36L, 37L, 37L, 37L, 38L, 38L, 38L, 39L, 39L, 39L,
40L, 40L, 40L, 41L, 41L, 41L, 42L, 42L, 42L, 43L, 43L, 43L, 44L,
44L, 44L, 45L, 45L, 45L, 46L, 46L, 46L, 47L, 47L, 47L, 48L, 48L,
48L, 49L, 49L, 49L, 50L, 50L, 50L, 51L, 51L, 51L, 52L, 52L, 52L,
53L, 53L, 53L, 54L, 54L, 54L, 55L, 55L, 55L, 56L, 56L, 56L, 57L,
57L, 57L, 58L, 58L, 58L, 59L, 59L, 59L, 60L, 60L, 60L, 61L, 61L,
61L, 62L, 62L, 62L, 63L, 63L, 63L, 64L, 64L, 64L, 65L, 65L, 65L,
66L, 66L, 66L, 67L, 67L, 67L, 68L, 68L, 68L, 69L, 69L, 69L, 70L,
70L, 70L, 71L, 71L, 71L, 72L, 72L, 72L, 73L, 73L, 73L, 74L, 74L,
74L, 75L, 75L, 75L, 76L, 76L, 76L, 77L, 77L, 77L, 78L, 78L, 78L,
79L, 79L, 79L, 80L, 80L, 80L, 81L, 81L, 81L, 82L, 82L, 82L, 83L,
83L, 83L, 84L, 84L, 84L, 85L, 85L, 85L, 86L, 86L, 86L, 87L, 87L,
87L, 88L, 88L, 88L, 89L, 89L, 89L, 90L, 90L, 90L, 91L, 91L, 91L,
92L, 92L, 92L, 93L, 93L, 93L, 94L, 94L, 94L, 95L, 95L, 95L, 96L,
96L, 96L, 97L, 97L, 97L, 98L, 98L, 98L, 99L, 99L, 99L), time = c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), age = c(63L, 67L, 71L, 65L,
69L, 73L, 60L, 64L, 69L, 62L, 65L, 69L, 64L, 68L, 72L, 64L, 68L,
72L, 64L, 68L, 72L, 64L, 68L, 72L, 64L, 68L, 73L, 65L, 69L, 73L,
61L, 65L, 68L, 63L, 67L, 72L, 64L, 69L, 73L, 61L, 65L, 69L, 61L,
65L, 69L, 64L, 68L, 72L, 63L, 67L, 71L, 61L, 65L, 69L, 64L, 68L,
72L, 65L, 69L, 73L, 63L, 67L, 71L, 61L, 64L, 68L, 63L, 67L, 71L,
63L, 68L, 72L, 62L, 66L, 70L, 64L, 68L, 72L, 62L, 66L, 70L, 65L,
69L, 73L, 63L, 66L, 70L, 62L, 66L, 70L, 62L, 65L, 70L, 62L, 66L,
70L, 63L, 67L, 71L, 62L, 66L, 71L, 62L, 66L, 70L, 63L, 67L, 71L,
64L, 67L, 72L, 61L, 65L, 69L, 64L, 67L, 71L, 64L, 69L, 72L, 62L,
66L, 70L, 62L, 66L, 70L, 63L, 67L, 71L, 64L, 68L, 72L, 62L, 66L,
70L, 60L, 64L, 68L, 63L, 67L, 71L, 64L, 68L, 73L, 64L, 68L, 72L,
64L, 68L, 72L, 64L, 68L, 71L, 62L, 65L, 69L, 61L, 65L, 69L, 64L,
68L, 72L, 60L, 65L, 69L, 62L, 66L, 70L, 60L, 64L, 68L, 63L, 67L,
71L, 63L, 67L, 71L, 64L, 68L, 72L, 65L, 69L, 73L, 61L, 65L, 69L,
63L, 67L, 71L, 63L, 67L, 71L, 62L, 67L, 71L, 64L, 68L, 72L, 64L,
68L, 72L, 63L, 67L, 71L, 64L, 68L, 72L, 64L, 68L, 72L, 62L, 66L,
71L, 61L, 65L, 69L, 63L, 68L, 72L, 60L, 65L, 69L, 61L, 65L, 69L,
63L, 68L, 72L, 62L, 67L, 70L, 64L, 68L, 73L, 61L, 65L, 69L, 62L,
66L, 70L, 62L, 65L, 70L, 61L, 65L, 69L, 64L, 68L, 72L, 62L, 66L,
70L, 61L, 65L, 69L, 61L, 66L, 70L, 63L, 67L, 71L, 61L, 65L, 70L,
62L, 67L, 71L, 60L, 64L, 69L, 61L, 66L, 70L, 61L, 65L, 69L, 62L,
67L, 72L, 63L, 67L, 71L, 60L, 64L, 69L, 61L, 65L, 69L, 65L, 69L,
73L, 62L, 66L, 70L, 60L, 64L, 68L), mci = structure(c(2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("1", "2"), class = "factor")), .Names = c("id",
"time", "age", "mci"), reshapeLong = structure(list(varying = structure(list(
age = c("age_R2", "b_age_R2", "c_age"), mci = c("mci_w1",
"mci_w2", "mci_w3")), .Names = c("age", "mci")), v.names = c("age",
"mci"), idvar = "id", timevar = "time"), .Names = c("varying",
"v.names", "idvar", "timevar")), row.names = c("1.1", "1.2",
"1.3", "2.1", "2.2", "2.3", "3.1", "3.2", "3.3", "4.1", "4.2",
"4.3", "5.1", "5.2", "5.3", "6.1", "6.2", "6.3", "7.1", "7.2",
"7.3", "8.1", "8.2", "8.3", "9.1", "9.2", "9.3", "10.1", "10.2",
"10.3", "11.1", "11.2", "11.3", "12.1", "12.2", "12.3", "13.1",
"13.2", "13.3", "14.1", "14.2", "14.3", "15.1", "15.2", "15.3",
"16.1", "16.2", "16.3", "17.1", "17.2", "17.3", "18.1", "18.2",
"18.3", "19.1", "19.2", "19.3", "20.1", "20.2", "20.3", "21.1",
"21.2", "21.3", "22.1", "22.2", "22.3", "23.1", "23.2", "23.3",
"24.1", "24.2", "24.3", "25.1", "25.2", "25.3", "26.1", "26.2",
"26.3", "27.1", "27.2", "27.3", "28.1", "28.2", "28.3", "29.1",
"29.2", "29.3", "30.1", "30.2", "30.3", "31.1", "31.2", "31.3",
"32.1", "32.2", "32.3", "33.1", "33.2", "33.3", "34.1", "34.2",
"34.3", "35.1", "35.2", "35.3", "36.1", "36.2", "36.3", "37.1",
"37.2", "37.3", "38.1", "38.2", "38.3", "39.1", "39.2", "39.3",
"40.1", "40.2", "40.3", "41.1", "41.2", "41.3", "42.1", "42.2",
"42.3", "43.1", "43.2", "43.3", "44.1", "44.2", "44.3", "45.1",
"45.2", "45.3", "46.1", "46.2", "46.3", "47.1", "47.2", "47.3",
"48.1", "48.2", "48.3", "49.1", "49.2", "49.3", "50.1", "50.2",
"50.3", "51.1", "51.2", "51.3", "52.1", "52.2", "52.3", "53.1",
"53.2", "53.3", "54.1", "54.2", "54.3", "55.1", "55.2", "55.3",
"56.1", "56.2", "56.3", "57.1", "57.2", "57.3", "58.1", "58.2",
"58.3", "59.1", "59.2", "59.3", "60.1", "60.2", "60.3", "61.1",
"61.2", "61.3", "62.1", "62.2", "62.3", "63.1", "63.2", "63.3",
"64.1", "64.2", "64.3", "65.1", "65.2", "65.3", "66.1", "66.2",
"66.3", "67.1", "67.2", "67.3", "68.1", "68.2", "68.3", "69.1",
"69.2", "69.3", "70.1", "70.2", "70.3", "71.1", "71.2", "71.3",
"72.1", "72.2", "72.3", "73.1", "73.2", "73.3", "74.1", "74.2",
"74.3", "75.1", "75.2", "75.3", "76.1", "76.2", "76.3", "77.1",
"77.2", "77.3", "78.1", "78.2", "78.3", "79.1", "79.2", "79.3",
"80.1", "80.2", "80.3", "81.1", "81.2", "81.3", "82.1", "82.2",
"82.3", "83.1", "83.2", "83.3", "84.1", "84.2", "84.3", "85.1",
"85.2", "85.3", "86.1", "86.2", "86.3", "87.1", "87.2", "87.3",
"88.1", "88.2", "88.3", "89.1", "89.2", "89.3", "90.1", "90.2",
"90.3", "91.1", "91.2", "91.3", "92.1", "92.2", "92.3", "93.1",
"93.2", "93.3", "94.1", "94.2", "94.3", "95.1", "95.2", "95.3",
"96.1", "96.2", "96.3", "97.1", "97.2", "97.3", "98.1", "98.2",
"98.3", "99.1", "99.2", "99.3"), class = "data.frame")
I then run the multistate model as follows.
library(msm)
#construct the qmatrix(all transitions are allowed.)
Q <- matrix(c(1,1,1,1),
nrow = 2, ncol = 2, byrow=TRUE,
dimnames=list(from=1:2,to=1:2))
#specify the initial values
crudeinits <- crudeinits.msm(mci ~ age, subject=id, data=dat.long, qmatrix=Q)
#the model
mci.msm <- msm(mci ~ age, subject = id, qmatrix = crudeinits, data = dat.long)
This than results in the R session been terminated with the above error. I am currently unsure how to resolve this issue so any help would be appreciated.
Thanks
After contacting the maintainer of the msm package the issue was resolved by redefining the state variable numeric rather than a factor. The full reply is below.
Thanks for this report. This might be related to the state variable
being a factor rather than numeric. I can make it work for me by redefining
dat.long$mci <- as.numeric(dat.long$mci).
I couldn't reproduce the crash, but it didn't converge with the state as
a factor.
I don't think it's documented explicitly, but factor states were
supposed to work as long as their levels are named 1,2,.... So it's a
bug, which seems to have been introduced in 1.4, but I'll fix it for the
next release.