T-tests on specific subgroups in R

I'm trying to calculate the mean and SD and then perform t-tests on three different measurements (height, weight, speed) between multiple subgroups.
I started with a simple dataset that only contains two groups (control vs drug), and I have it all working well enough.
simple.df<-
structure(list(trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), levels = c("control", "drug"), class = "factor"), height = c(15,
17, 25, 21, 11, 29, 18, 20), weight = c(80, 90, 81, 79, 200,
230, 215, 210), speed = c(50, 45, 60, 51, 52, 80, 41, 19)), class = "data.frame", row.names = c(NA,
-8L))
library(dplyr)
library(rstatix)
simple.df %>% group_by(trial) %>% get_summary_stats(type = "mean_sd")
testing<- data.frame(lapply(simple.df[-1], function(x) t.test(x~simple.df$trial)$p.value))
testing
Where I'm running into trouble is with the t-testing on a larger experiment similar to the data frame below. I still have control vs drug and height, weight & speed, but now all the measurements were done at two timepoints in both males and females. I'm only concerned with comparing control versus drug within the same sex/age. I'm still OK calculating the mean and SD for each group, but I've gotten stuck figuring out the t-testing.
Specifically, I just want the t-test on each of the three measurements for drug vs control in young males, drug vs control in old males, drug vs control in young females and drug vs control in old females, so 12 p-values total, with some identification of which comparison each value represents.
Thanks for your help and expertise!
big.df<- structure(list(age = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("old", "young"
), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("f", "m"), class = "factor"),
trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("control", "drug"
), class = "factor"), height = c(15L, 17L, 25L, 21L, 11L,
29L, 18L, 20L, 300L, 320L, 316L, 325L, 170L, 175L, 172L,
180L, 28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L, 250L, 260L,
240L, 248L, 11L, 19L, 16L, 4L), weight = c(80L, 90L, 81L,
79L, 200L, 230L, 215L, 210L, 152L, 150L, 148L, 155L, 160L,
158L, 157L, 140L, 176L, 164L, 135L, 196L, 175L, 178L, 120L,
147L, 160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L), speed = c(50L,
45L, 60L, 51L, 52L, 80L, 41L, 19L, 55L, 56L, 61L, 67L, 85L,
90L, 100L, 77L, 90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L, 50L,
60L, 55L, 59L, 71L, 65L, 66L, 62L)), row.names = c(NA, -32L
), class = "data.frame")
big.df %>% group_by(sex, age, trial) %>%
  get_summary_stats(type = "mean_sd") %>%
  arrange(variable, sex, age, trial)
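As an aside, the twelve p-values can also be produced in a single pipeline by reshaping to long format and letting rstatix::t_test run once per sex/age/measurement combination; this is only a sketch of that idea, and it assumes tidyr is available for pivot_longer:
library(tidyr)
big.df %>%
  pivot_longer(c(height, weight, speed), names_to = "variable", values_to = "value") %>%
  group_by(sex, age, variable) %>%
  t_test(value ~ trial)   # one row per comparison, with the p-value in the p column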

RYann had a good idea: define a function that pulls out a subgroup and then run all the t-tests on that subgroup. That approach was helpful.
I ended up building on his strategy and simplifying things a bit more by vectorizing the t-tests inside the function with lapply. I then stored the age/sex combinations in a data frame and used mapply to pass those combinations to the t-testing function.
t.script <- function(a_age, a_sex){
  group <- big.df %>% filter(age == a_age & sex == a_sex)
  data.frame(lapply(group[4:6], function(x) t.test(x ~ group$trial)$p.value))
}
combos <- data.frame(age = c("young","young","old","old"),
sex = c("m","f","m","f"))
t.test.df <- data.frame(mapply(t.script, a_age = combos$age, a_sex = combos$sex))
colnames(t.test.df) <- paste(combos$age, combos$sex, sep = " ")
        young m      young f      old m        old f
height  1            1.939896e-05 0.01175771   1.630232e-08
weight  4.435875e-05 0.6368126    0.5196617    0.1299121
speed   0.80433      0.004320253  0.1526353    0.01539331

I hope this code will work out for you
# big.df as provided in the question (same dput as above)
library(tidyverse)   # dplyr, tidyr, stringr and ggplot2 are used below
# A function to extract the 3 comparisons
multi_t <- function(a_sex,a_age){
df_func <- big.df %>% filter(sex==a_sex,age==a_age)
h <- t.test(height~trial,df_func)$p.value
w <- t.test(weight~trial,df_func)$p.value
s <- t.test(speed~trial,df_func)$p.value
# cat(
# "sex =",a_sex,"\nage =",a_age,"\n\n"
# )
return(cbind(height=h,weight=w,speed=s))
}
# Table in a long version
ptable <- data.frame(
multi_t("m","young"),
multi_t("m","old"),
multi_t("f","young"),
multi_t("f","old")
) %>% pivot_longer(cols=everything(),
names_to = "value",
values_to = "p.values") %>%
mutate(comparison = rep(c("young males","old males",
"young females","old females"),each=3),
value=str_remove_all(value,"\\.\\d"))
ptable
# Table in a wider version
ptable %>% group_by(value) %>% mutate(id=row_number()) %>%
pivot_wider(names_from = value,values_from = p.values) %>%
select(-id)
ptable %>%
mutate(sig=p.values<0.05) %>%
ggplot(aes(x=value,y=p.values,color=sig))+
geom_point(show.legend = T)+facet_wrap(~comparison,scales="free")+
theme(legend.position = "bottom")+
labs(title="P values of 3 different measurements",
subtitle = "For 4 different populations")

Splitting one column into two with kable / kableExtra in R

This is my first question, so please be patient with me.
I want to split one column of a tibble into two columns depending on the value of a third column.
My table looks like this so far:
Wertetabelle <- tibble(DAT$Tag, DAT$Lauf, DAT$Replikate, DAT$Wert) %>% group_by(DAT$Lauf)
Wertetabelle %>%
mutate_all(linebreak) %>%
kable(booktabs = T, digits = 2,
caption = "Rohdaten der PCR Messungen",
col.names = linebreak(c("Tag", " Lauf", "Replikat", "Wert"), align = "r")) %>%
kable_styling(latex_options = c("striped", "hold_position"))
This, unfortunately, gives me a very long table. The column "Wert" has at least 80 values.
Depending on the "Replikat" column, which has two values (1:2), I could split "Wert" into two columns with 40 values each.
Unfortunately, the group_by doesn't seem to work.
Do you have any idea?
Tag has 20 values 1:20
Lauf has 2 values 1:2
Replikat has 2 values 1:2
Wert is numeric
Best
Werek
As requested, please find the results of dput(.) below:
structure(list(`DAT$Tag` = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20"), class = "factor"), `DAT$Lauf` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), `DAT$Replikate` = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), `DAT$Wert` = c(242L, 243L, 247L, 249L,
246L, 244L, 241L, 245L, 243L, 244L, 252L, 249L, 242L, 246L, 247L,
240L, 241L, 244L, 241L, 247L, 246L, 242L, 239L, 241L, 242L, 245L,
246L, 245L, 239L, 246L, 251L, 248L, 240L, 249L, 248L, 238L, 244L,
244L, 239L, 240L, 245L, 238L, 241L, 250L, 243L, 251L, 245L, 243L,
244L, 247L, 247L, 251L, 251L, 248L, 245L, 239L, 245L, 237L, 247L,
245L, 246L, 238L, 240L, 245L, 240L, 247L, 247L, 245L, 245L, 239L,
241L, 246L, 245L, 240L, 246L, 242L, 248L, 242L, 245L, 242L)), row.names = c(NA,
-80L), groups = structure(list(`DAT$Lauf` = structure(1:2, .Label = c("1",
"2"), class = "factor"), .rows = structure(list(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L,
51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L), c(21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L,
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = 1:2, class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

Why is prediction error discrete in adabag?

I've got a table of 55 observations with 5 variables (F, H, R, T, U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into a training set (70%) and a test set (30%). Then I run adaboosting and check how it works.
I want to get the adaboost error distribution for 100 samplings. But the distribution turns out to be discrete, outputting only five values: 0, 0.0588235294117647, 0.117647058823529, 0.176470588235294 and 0.235294117647059. It doesn't change with the mfinal argument. I guess there should be more! How does it work?
I use the following code:
library(adabag)
predictions <- list()
for (i in 1:100){
  train.ind <- sample(nrow(df), nrow(df) * 0.7)
  assign(paste0("ada", i), do.call(boosting,
         c(formula = Group ~ F + H + R + T + U,
           data = substitute(df[train.ind, ]), mfinal = 50, boos = FALSE,
           coeflearn = 'Breiman'), envir = parent.frame()))
  # predict with the model fitted in this iteration
  assign(paste0("pred", i), predict(get(paste0("ada", i)), df[-train.ind, ]))
  predictions[[i]] <- get(paste0("pred", i))$error
}
hist(100*unlist(predictions),breaks=10,
main="Error probability [%] ntrees=10. 100 sampling operations", xlab="AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")
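One likely explanation, added here as an observation on the data above rather than as part of the original post: with 55 rows, the 70 % training split takes 38 rows, leaving a test set of only 17 observations, and an error rate computed on 17 cases can only be a multiple of 1/17, no matter how large mfinal is. The five reported values match this exactly:
# possible error rates with a 17-row test set
(0:4) / 17
# 0.00000000 0.05882353 0.11764706 0.17647059 0.23529412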

Frequency distribution of a categorical variable in R

I am trying to prepare a frequency distribution table of a categorical variable in my data, and I am using the code below. The output looks OK when I view it, but it is not printing properly in the report.
# These lines are not needed because the data below is already
# in that format
# STI<-STI_IPD1%>% select(Q18_1,Q54)
# STI$Q54<-as.factor(STI$Q54)
STI = structure(list(Q18_1 = c(101L, 120L, 29L, 101L, 94L, 16L, 47L,
141L, 154L, 47L, 141L, 154L, 154L, 29L, 58L, 154L, 101L, 154L,
47L, 141L, 75L, 1L, 120L, 16L, 154L, 141L, 141L, 154L, 154L,
154L, 29L, 141L, 38L, 47L, 101L, 16L, 154L, 154L, 101L, 192L,
58L, 154L, 16L, 120L, 101L, 1L, 38L, 1L, 154L, 1L, 16L, 58L,
75L, 154L, 47L, 58L, 120L, 141L, 1L, 141L, 16L, 141L, 58L, 29L,
101L, 58L, 154L, 75L, 75L, 141L, 29L, 101L, 101L, 154L, 16L,
101L, 101L, 47L, 47L, 181L, 16L, 154L, 47L, 154L, 47L, 120L,
75L, 47L, 192L, 1L, 154L, 154L, 120L, 141L, 58L, 47L, 154L, 101L,
75L, 141L, 75L, 16L, 47L, 1L, 58L, 141L), Q54 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", "Discretionary if earnings per share goals are met.",
"initial funding by targets and as year goes on begin to include financial results",
"Non-represented are targets focused and budgeted and union plans are self funded based on operating margin achievements."
), class = "factor")), class = c("data.table", "data.frame"), row.names = c(NA,
-106L), .Names = c("Q18_1",
"Q54"))
as.data.frame(table(STI$Q54))
Is there any other way to prepare such output?
I want the output as a table of counts of each factor level: the factor levels in one column and the counts in another column.
I am producing the output in a Word file using R Markdown. In the output window, the output is also not printing as a two-column table.
To print a data frame as a table in Markdown, one can use the kable() function in knitr.
library(knitr)
kable(aDataFrame)
For example...
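Applied to the question's data, a minimal sketch might look like this (the "Response" and "Count" labels are purely illustrative):
library(knitr)
kable(as.data.frame(table(STI$Q54)),
      col.names = c("Response", "Count"))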
Using data.frame() with the kable() function is a really useful technique for communicating tabular information in R Markdown. For a couple of more complicated examples using this technique, please read my article Commentary on ToothGrowth Factorial ANOVA, where I compare Robert Kabacoff's analysis to the requirements of the Johns Hopkins University Statistical Inference course on Coursera.
regards,
Len
(11/22/2017) UPDATE: Responding to a comment from @sandhya-ghildiyal, here is how to exclude the blank row from the table output. If we save the result of table() into an object, we can then use the extract operator [ within the kable() function to exclude the row where the factor level is the blank string (level 1).
theTable <- as.data.frame(table(STI$Q54))
kable(theTable[as.numeric(theTable$Var1) != 1,])

Broom::tidy error with dataframe of nnet::multinom models

I am generating multinom models using nnet, with a model fitted for each city in the dataset. When I attempt to use tidy() with these models, I get the following error:
Error in probs[i, -1, drop = FALSE] : subscript out of bounds
However, if I produce a model for each City separately and then use tidy(), I do not receive an error for any of the models. I am also able to use glance() without an error.
What might be causing this error?
library(broom)
library(dplyr)
library(nnet)
dfstack <- structure(list(Var1 = c(73L, 71L, 66L, 75L, 96L, 98L, 98L, 65L,
75L, 74L, 71L, 98L, 100L, 87L, 78L, 50L, 73L, 82L, 70L, 70L,
31L, 34L, 32L, 100L, 100L, 100L, 54L, 51L, 36L, 48L, 66L, 60L,
59L, 72L, 76L, 90L, 85L, 76L, 55L, 53L, 42L, 54L, 54L, 10L, 34L,
18L, 6L, 16L, 63L, 41L, 68L, 55L, 52L, 57L, 64L, 61L, 68L, 44L,
33L, 19L, 38L, 54L, 44L, 87L, 100L, 100L, 63L, 75L, 76L, 100L,
100L, 64L, 95L, 90L, 99L, 98L, 87L, 62L, 62L, 88L, 79L, 85L),
Status = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), City = structure(c(3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = c("Denver", "Miami", "NYC"), class = "factor"),
ID = structure(c(52L, 63L, 74L, 77L, 78L, 79L, 80L, 81L,
82L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 64L,
31L, 42L, 45L, 46L, 47L, 48L, 49L, 50L, 51L, 32L, 1L, 12L,
23L, 25L, 26L, 27L, 28L, 29L, 30L, 2L, 3L, 4L, 5L, 65L, 66L,
67L, 68L, 69L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 6L,
7L, 8L, 9L, 10L, 11L, 13L, 70L, 71L, 72L, 73L, 75L, 76L,
41L, 43L, 44L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
24L), .Label = c("Denver1", "Denver10", "Denver11", "Denver12",
"Denver13", "Denver14", "Denver15", "Denver16", "Denver17",
"Denver18", "Denver19", "Denver2", "Denver20", "Denver21",
"Denver22", "Denver23", "Denver24", "Denver25", "Denver26",
"Denver27", "Denver28", "Denver29", "Denver3", "Denver30",
"Denver4", "Denver5", "Denver6", "Denver7", "Denver8", "Denver9",
"Miami1", "Miami10", "Miami11", "Miami12", "Miami13", "Miami14",
"Miami15", "Miami16", "Miami17", "Miami18", "Miami19", "Miami2",
"Miami20", "Miami21", "Miami3", "Miami4", "Miami5", "Miami6",
"Miami7", "Miami8", "Miami9", "NYC1", "NYC10", "NYC11", "NYC12",
"NYC13", "NYC14", "NYC15", "NYC16", "NYC17", "NYC18", "NYC19",
"NYC2", "NYC20", "NYC21", "NYC22", "NYC23", "NYC24", "NYC25",
"NYC26", "NYC27", "NYC28", "NYC29", "NYC3", "NYC30", "NYC31",
"NYC4", "NYC5", "NYC6", "NYC7", "NYC8", "NYC9"), class = "factor")), class = "data.frame", row.names = c(NA, -82L), .Names = c("Var1", "Status", "City", "ID"))
Model.List <- dfstack %>% group_by(City) %>% do(modfits = multinom(Status~Var1, data=.))
tidy(Model.List, modfits) # produces error
glance(Model.List, modfits) # no error
# no error when each city on its own
df1 <- dfstack %>% filter(City == "NYC") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df1, modfit1)
df2 <- dfstack %>% filter(City == "Miami") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df2, modfit1)
df3 <- dfstack %>% filter(City == "Denver") %>% do(modfit1 = multinom(Status~Var1, data=.))
tidy(df3, modfit1)
Don't ask me to explain why, but I figured it out.
tidy.multinom calls summary.multinom, which calls vcov.multinom, which calls multinomHess. The error was being generated down in multinomHess, which is only run when the Hessian matrix was not generated in the original call to multinom. That is to say, you don't necessarily need to spend the time calculating the Hessian matrix if you don't intend to use the summary object.
For some reason, when the multinom objects are formed within the do() call, summary.multinom is unable to calculate the Hessian matrix. This can be circumvented by calling multinom with Hess = TRUE. See below:
Model.List <-
dfstack %>%
group_by(City) %>%
do(modfits = multinom(Status~Var1,
data=.,
Hess = TRUE))
tidy(Model.List, modfits)
glance(Model.List, modfits)
In your original code, glance did not throw an error because glance.multinom does not rely on summary.multinom.
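For what it's worth, here is a sketch of the same fit-per-city workflow using the nest()/map() idiom instead of do(); it is not from the original answer, and Hess = TRUE is kept for the reason described above:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
library(nnet)
dfstack %>%
  nest(data = -City) %>%                # one row per city, data as a list-column
  mutate(fit    = map(data, ~ multinom(Status ~ Var1, data = .x, Hess = TRUE)),
         tidied = map(fit, tidy)) %>%
  select(City, tidied) %>%
  unnest(tidied)                        # stacked coefficient tables, one per city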

dplyr n_distinct() in filter takes forever whereas base length(unique()) works like a charm

I have a data frame such as this:
structure(list(x = c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L, 5L, 5L,
6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L,
13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L, 17L, 17L, 18L, 18L, 19L,
19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L, 23L, 24L, 24L, 25L, 25L,
26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L, 30L, 30L, 31L, 31L, 32L,
32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L, 36L, 37L, 37L, 38L, 38L,
39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L, 43L, 43L, 44L, 44L, 45L,
45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L, 49L, 50L, 50L, 51L, 51L,
52L, 52L, 53L, 53L, 54L, 54L, 55L, 55L, 56L, 56L, 57L, 57L, 58L,
58L, 59L, 59L, 60L, 60L, 61L, 61L, 62L, 62L, 63L, 63L, 64L, 64L,
65L, 65L, 66L, 66L, 67L, 67L, 68L, 68L, 69L, 69L, 70L, 70L, 71L,
71L, 72L, 72L, 73L, 73L, 74L, 74L, 75L, 75L, 76L, 76L, 77L, 77L,
78L, 78L, 79L, 79L, 80L, 80L, 81L, 81L, 82L, 82L, 83L, 83L, 84L,
84L, 85L, 85L, 86L, 86L, 87L, 87L, 88L, 88L, 89L, 89L, 90L, 90L,
91L, 91L, 92L, 92L, 93L, 93L, 94L, 94L, 95L, 95L, 96L, 96L, 97L,
97L, 98L, 98L, 99L, 99L, 100L, 100L), y = structure(c(1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L), .Label = c("one", "two"), class = "factor")), class = "data.frame", row.names = c(NA,
-200L), .Names = c("x", "y"))
I am trying to filter groups of x that have two distinct y values using:
library(dplyr)
df %>% group_by(x) %>% filter(n_distinct(y) > 1)
On a large data set, this almost never finishes.
Changing to this works reasonably fast for the full data set:
library(dplyr)
df %>% group_by(x) %>% filter(length(unique(y)) > 1)
Any idea why n_distinct() is so slow that it sometimes never finishes?
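A rough way to compare the two filters, sketched on a larger synthetic version of the data (the one-million-group size is arbitrary, chosen only to make the timing difference visible):
library(dplyr)
set.seed(1)
big <- data.frame(x = rep(seq_len(1e6), each = 2),
                  y = sample(c("one", "two"), 2e6, replace = TRUE))
system.time(big %>% group_by(x) %>% filter(n_distinct(y) > 1))
system.time(big %>% group_by(x) %>% filter(length(unique(y)) > 1))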
