Related
I'm trying to calculate mean and SD and then perform t.tests on three different measurements (height, weight, speed) between multiple subgroups.
I started with a simple dataset that only contains two groups (control vs drug) and I have it all working well enough.
# Toy data set: four control and four drug observations, with three numeric
# measurements (height, weight, speed) recorded for each.
simple.df <- data.frame(
  trial = factor(rep(c("control", "drug"), each = 4L)),
  height = c(15, 17, 25, 21, 11, 29, 18, 20),
  weight = c(80, 90, 81, 79, 200, 230, 215, 210),
  speed = c(50, 45, 60, 51, 52, 80, 41, 19)
)
library(rstatix)

# Mean and SD of every measurement, computed separately for each trial arm.
get_summary_stats(group_by(simple.df, trial), type = "mean_sd")

# t.test() of each measurement column (everything except the first column,
# `trial`) against the trial arm; only the p-values are kept, one column per
# measurement.
testing <- data.frame(
  lapply(
    simple.df[-1],
    function(measurement) t.test(measurement ~ simple.df$trial)$p.value
  )
)
testing
Where I'm running into trouble is with the t.testing on a larger experiment similar to the dataframe below. I still have control vs drug and height, weight & speed, but now all the measurements were done at two timepoints in both males and females. I'm only concerned with comparing control versus drug for the same sex/age. I'm still ok calculating the mean and SD for each group, but have gotten stuck with figuring out the t-testing.
Specifically, I just want the t-test on each of the three measurements for drug vs control in young males, drug vs control in old males, drug vs control in young females and drug vs control in old females, so 12 p-values total with some identification for what comparison each value represents.
Thanks for your help and expertise!
# Larger experiment: every age/sex subgroup holds four control rows followed
# by four drug rows. Row order: young m, young f, old m, old f.
big.df <- data.frame(
  age = factor(rep(c("young", "old"), each = 16L), levels = c("old", "young")),
  sex = factor(
    rep(rep(c("m", "f"), each = 8L), times = 2L),
    levels = c("f", "m")
  ),
  trial = factor(
    rep(rep(c("control", "drug"), each = 4L), times = 4L),
    levels = c("control", "drug")
  ),
  height = c(
    15L, 17L, 25L, 21L, 11L, 29L, 18L, 20L,
    300L, 320L, 316L, 325L, 170L, 175L, 172L, 180L,
    28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L,
    250L, 260L, 240L, 248L, 11L, 19L, 16L, 4L
  ),
  weight = c(
    80L, 90L, 81L, 79L, 200L, 230L, 215L, 210L,
    152L, 150L, 148L, 155L, 160L, 158L, 157L, 140L,
    176L, 164L, 135L, 196L, 175L, 178L, 120L, 147L,
    160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L
  ),
  speed = c(
    50L, 45L, 60L, 51L, 52L, 80L, 41L, 19L,
    55L, 56L, 61L, 67L, 85L, 90L, 100L, 77L,
    90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L,
    50L, 60L, 55L, 59L, 71L, 65L, 66L, 62L
  )
)
# Mean and SD of each measurement within every sex/age/trial cell. Sorting by
# trial last keeps the control and drug rows of one comparison adjacent.
arrange(
  get_summary_stats(group_by(big.df, sex, age, trial), type = "mean_sd"),
  variable, sex, age, trial
)
RYann had a good idea by defining a function to pull out subgroups and then doing all the t-tests on each subgroup. That approach was helpful.
I ended up building on his strategy and simplifying things a bit more by vectorizing the t-tests inside the function using lapply. I then stored each of the age/sex combinations in a dataframe and used mapply to pass those combinations to the t-testing function.
# Control-vs-drug t-tests for a single age/sex subgroup of `big.df`.
#
# Arguments:
#   a_age  value of the `age` column selecting the subgroup (e.g. "young").
#   a_sex  value of the `sex` column selecting the subgroup (e.g. "m").
#
# Returns a one-row data frame with one p-value column per measurement.
t.script <- function(a_age, a_sex) {
  # Keep only the rows of the requested age and sex (both trial arms).
  group <- big.df %>% filter(age == a_age & sex == a_sex)
  # Columns 4:6 of big.df are height, weight and speed; each is tested
  # against the trial arm and only the p-value is kept.
  data.frame(lapply(group[4:6], function(x) t.test(x ~ group$trial)$p.value))
}

# Every age/sex combination to compare, one row per comparison.
combos <- data.frame(
  age = c("young", "young", "old", "old"),
  sex = c("m", "f", "m", "f")
)

# Call t.script() once per row of `combos`; each call contributes one column.
t.test.df <- data.frame(
  mapply(t.script, a_age = combos$age, a_sex = combos$sex)
)
# Label each column with the comparison it belongs to, e.g. "young m".
colnames(t.test.df) <- paste(combos$age, combos$sex, sep = " ")
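|        | young m      | young f      | old m      | old f        |
|--------|--------------|--------------|------------|--------------|
| height | 1            | 1.939896e-05 | 0.01175771 | 1.630232e-08 |
| weight | 4.435875e-05 | 0.6368126    | 0.5196617  | 0.1299121    |
| speed  | 0.80433      | 0.004320253  | 0.1526353  | 0.01539331   |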
I hope this code will work out for you
# Larger experiment: every age/sex subgroup holds four control rows followed
# by four drug rows. Row order: young m, young f, old m, old f.
big.df <- data.frame(
  age = factor(rep(c("young", "old"), each = 16L), levels = c("old", "young")),
  sex = factor(
    rep(rep(c("m", "f"), each = 8L), times = 2L),
    levels = c("f", "m")
  ),
  trial = factor(
    rep(rep(c("control", "drug"), each = 4L), times = 4L),
    levels = c("control", "drug")
  ),
  height = c(
    15L, 17L, 25L, 21L, 11L, 29L, 18L, 20L,
    300L, 320L, 316L, 325L, 170L, 175L, 172L, 180L,
    28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L,
    250L, 260L, 240L, 248L, 11L, 19L, 16L, 4L
  ),
  weight = c(
    80L, 90L, 81L, 79L, 200L, 230L, 215L, 210L,
    152L, 150L, 148L, 155L, 160L, 158L, 157L, 140L,
    176L, 164L, 135L, 196L, 175L, 178L, 120L, 147L,
    160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L
  ),
  speed = c(
    50L, 45L, 60L, 51L, 52L, 80L, 41L, 19L,
    55L, 56L, 61L, 67L, 85L, 90L, 100L, 77L,
    90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L,
    50L, 60L, 55L, 59L, 71L, 65L, 66L, 62L
  )
)
# A function to extract the control-vs-drug comparisons for one subgroup.
#
# Arguments:
#   a_sex     value of the `sex` column selecting the subgroup (e.g. "m").
#   a_age     value of the `age` column selecting the subgroup (e.g. "young").
#   data      data frame with `sex`, `age`, `trial` and the measurement
#             columns; defaults to the global `big.df`, so existing two-argument
#             calls keep working.
#   measures  names of the measurement columns to test.
#
# Returns a 1-row numeric matrix with one named column per measurement, holding
# the p-value of t.test(<measurement> ~ trial) within the subgroup.
multi_t <- function(a_sex, a_age, data = big.df,
                    measures = c("height", "weight", "speed")) {
  # which() drops rows whose comparison is NA instead of producing NA rows.
  keep <- which(data$sex == a_sex & data$age == a_age)
  df_func <- data[keep, , drop = FALSE]
  if (nrow(df_func) == 0L) {
    stop(
      "no rows with sex = '", a_sex, "' and age = '", a_age, "'",
      call. = FALSE
    )
  }

  # One t-test per measurement; vapply() guarantees a numeric result.
  p_values <- vapply(
    measures,
    function(m) {
      t.test(stats::reformulate("trial", response = m), data = df_func)$p.value
    },
    numeric(1)
  )

  # Same shape as cbind(height = h, weight = w, speed = s): a 1 x k matrix
  # with column names only.
  matrix(p_values, nrow = 1L, dimnames = list(NULL, measures))
}
# Long-format table: one row per (comparison, measurement) pair.
# data.frame() makes the repeated column names unique by appending ".1",
# ".2", ...; that suffix is stripped again after reshaping.
ptable <- mutate(
  pivot_longer(
    data.frame(
      multi_t("m", "young"),
      multi_t("m", "old"),
      multi_t("f", "young"),
      multi_t("f", "old")
    ),
    cols = everything(),
    names_to = "value",
    values_to = "p.values"
  ),
  # Labels follow the order of the multi_t() calls, three measurements each.
  comparison = rep(
    c("young males", "old males", "young females", "old females"),
    each = 3
  ),
  value = str_remove_all(value, "\\.\\d")
)
ptable
# Wide-format table: measurements become columns again.
# `id` numbers the rows within each measurement so that pivot_wider() can line
# the values up; it is dropped once the reshape is done.
select(
  pivot_wider(
    mutate(group_by(ptable, value), id = row_number()),
    names_from = value,
    values_from = p.values
  ),
  -id
)
# Plot every p-value, one panel per comparison, coloured by whether it falls
# below 0.05.
ptable %>%
  mutate(sig = p.values < 0.05) %>%
  ggplot(aes(x = value, y = p.values, color = sig)) +
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  geom_point(show.legend = TRUE) +
  facet_wrap(~comparison, scales = "free") +
  theme(legend.position = "bottom") +
  labs(
    title = "P values of 3 different measurements",
    subtitle = "For 4 different populations"
  )
I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
# Attempt: apply quantile() to every object in the global environment.
# NOTE: eapply() hands each object to quantile() as a whole, so a data frame
# is passed in its entirety rather than just its `cpm` column.
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
# One data frame per geo/os combination, including combinations that never
# occur (those come out as zero-row data frames).
X <- split(data, interaction(data$geo, data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
# Copy every element of X into the global environment under its list name.
list2env(X, envir = globalenv())
## predicate: TRUE only for a data frame that has no rows
isEmpty <- function(x) {
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  nrow(x) == 0L
}
## flag every object in the global environment that is a zero-row data frame
empty <- unlist(eapply(globalenv(), isEmpty))
## delete the flagged objects by name
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Collect every data frame in the current environment into one named list.
# vapply() always returns a logical vector (sapply() returns a list for empty
# input), so the subsetting also works when there are no objects at all.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
df_list <- mget(df_names)

# Add the new column to each data frame. The function has to return the
# modified data frame `d`; an assignment on its own evaluates to the assigned
# value, so lapply() would collect the column instead of the data frame.
result_list <- lapply(df_list, function(d) {
  d$new_col <- NA # placeholder: replace NA with the code for the new column
  d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
# 15% and 80% quantiles of `cpm` for every data frame in df_list; sapply()
# binds the per-frame results together, one column per data frame.
sapply(df_list, function(df) quantile(df$cpm, probs = c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)

# One row per geo/os combination holding the 15% and 80% quantiles of cpm.
summarize(
  group_by(data, geo, os),
  quantile_15 = quantile(cpm, probs = .15),
  quantile_80 = quantile(cpm, probs = 0.8)
)
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)

# Turn `data` into a data.table in place.
setDT(data)
# Per geo/os group, convert the named quantile vector to a list so that each
# quantile becomes its own column ("15%" and "80%").
data[, as.list(quantile(cpm, probs = c(0.15, 0.8))), by = list(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01
I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
# Attempt: apply quantile() to every object in the global environment.
# NOTE: eapply() hands each object to quantile() as a whole, so a data frame
# is passed in its entirety rather than just its `cpm` column.
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
# One data frame per geo/os combination, including combinations that never
# occur (those come out as zero-row data frames).
X <- split(data, interaction(data$geo, data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
# Copy every element of X into the global environment under its list name.
list2env(X, envir = globalenv())
## predicate: TRUE only for a data frame that has no rows
isEmpty <- function(x) {
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  nrow(x) == 0L
}
## flag every object in the global environment that is a zero-row data frame
empty <- unlist(eapply(globalenv(), isEmpty))
## delete the flagged objects by name
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Collect every data frame in the current environment into one named list.
# vapply() always returns a logical vector (sapply() returns a list for empty
# input), so the subsetting also works when there are no objects at all.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
df_list <- mget(df_names)

# Add the new column to each data frame. The function has to return the
# modified data frame `d`; an assignment on its own evaluates to the assigned
# value, so lapply() would collect the column instead of the data frame.
result_list <- lapply(df_list, function(d) {
  d$new_col <- NA # placeholder: replace NA with the code for the new column
  d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
# 15% and 80% quantiles of `cpm` for every data frame in df_list; sapply()
# binds the per-frame results together, one column per data frame.
sapply(df_list, function(df) quantile(df$cpm, probs = c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)

# One row per geo/os combination holding the 15% and 80% quantiles of cpm.
summarize(
  group_by(data, geo, os),
  quantile_15 = quantile(cpm, probs = .15),
  quantile_80 = quantile(cpm, probs = 0.8)
)
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)

# Turn `data` into a data.table in place.
setDT(data)
# Per geo/os group, convert the named quantile vector to a list so that each
# quantile becomes its own column ("15%" and "80%").
data[, as.list(quantile(cpm, probs = c(0.15, 0.8))), by = list(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01
I have a ggplot related question, which should be easy but I could not find the answer yet. I am trying to plot a faceted plot with the code below and this dataset (11 kB).
# Point estimates with confidence-interval segments, one panel per `facet`.
# The y axis is ordered by reorder(countryyear, estimate), which uses all rows
# of a countryyear rather than the rows of one particular facet.
ggplot(plot.dat, aes(x = estimate, y = reorder(countryyear, estimate))) +
  geom_point() +
  geom_segment(aes(x = conf.low, xend = conf.high, yend = countryyear)) +
  facet_grid(. ~ facet) +
  scale_x_continuous(breaks = seq(0, 5, 1), limits = c(0, 5)) +
  labs(
    title = "Random Slopes in Country*Year Groups from Northwestern Europe",
    x = "Random Effect Estimate",
    y = ""
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
I would like countryyear to be organized by the values of estimate in the Extreme Right facet. Not quite sure how to order by values of a specific facet. Any ideas are welcome! Thanks.
Update: Here is the dput structure of a random subset of the dataset. It has some missing values, but it should work for the sake of the example. I also updated the download link above, that has the full version.
structure(list(estimate = c(1.41056902925372, 0.854859208455895,
1.16012834593894, 0.871339033194504, 0.803272289946221, 1.17540386134493,
0.996313357490551, 1.49940694539732, 1.33773365908762, 2.7318703090905,
1.19131935418045, 1.12765907711738, 0.746741192261761, 0.985847015192172,
0.912357310925342, 1.11582763712164, 1.21854572824977, 0.675712547978394,
0.566955524699616, 1.32611743759365, 0.519648352294682, 0.591013596394243,
1.30944973684044, 0.613722269599125, 1.13293279727271, 0.950788678552604,
1.1599446923567, 1.11493952112913, 0.95336321045095, 1.39002327097034,
0.794207546872633, 0.788545101449259, 1.01096883872495, 0.897407203907834,
1.38391605229103, 1.35754760293107, 1.0718508539761, 0.542191158958878,
0.757132752456427, 1.44172863221312, 1.04842251986171, 0.77260404885379,
0.879288027642055, 1.09372353598088, 0.745484830381145, 1.21211217249353,
0.628009608902132, 1.34864488674734), countryyear = structure(c(1L,
2L, 4L, 5L, 7L, 9L, 10L, 12L, 13L, 26L, 28L, 29L, 31L, 32L, 34L,
36L, 37L, 39L, 40L, 57L, 59L, 60L, 62L, 63L, 65L, 67L, 68L, 70L,
71L, 73L, 75L, 76L, 89L, 90L, 92L, 94L, 95L, 103L, 104L, 106L,
108L, 109L, 111L, 128L, 130L, 132L, 133L, 135L), .Label = c("AT02",
"AT04", "AT06", "AT14", "AT16", "BE02", "BE04", "BE06", "BE08",
"BE10", "BE12", "BE14", "BE16", "BG06", "BG08", "BG10", "BG12",
"CH14", "CZ02", "CZ04", "CZ08", "CZ10", "CZ12", "CZ14", "CZ16",
"DE02", "DE04", "DE06", "DE08", "DE10", "DE12", "DE14", "DE16",
"DK02", "DK04", "DK06", "DK08", "DK10", "DK12", "DK14", "EE04",
"EE06", "EE08", "EE10", "EE12", "EE14", "EE16", "ES02", "ES04",
"ES06", "ES08", "ES10", "ES12", "ES14", "ES16", "FI02", "FI04",
"FI06", "FI08", "FI10", "FI12", "FI14", "FI16", "FR06", "FR08",
"FR10", "FR12", "FR14", "FR16", "GB02", "GB04", "GB06", "GB08",
"GB10", "GB12", "GB14", "GB16", "GR02", "GR04", "GR08", "GR10",
"HU02", "HU06", "HU08", "HU10", "HU12", "HU14", "HU16", "IE02",
"IE04", "IE06", "IE08", "IE10", "IE12", "IE14", "IE16", "IT04",
"IT12", "IT16", "LT10", "LT12", "LT14", "NL02", "NL04", "NL06",
"NL08", "NL10", "NL12", "NL14", "NL16", "NO14", "PL02", "PL04",
"PL06", "PL08", "PL10", "PL12", "PL14", "PL16", "PT02", "PT04",
"PT06", "PT08", "PT10", "PT12", "PT14", "PT16", "SE02", "SE04",
"SE06", "SE08", "SE10", "SE12", "SE14", "SE16", "SI02", "SI04",
"SI06", "SI08", "SI10", "SI12", "SI14", "SI16", "SK04", "SK06",
"SK08", "SK10", "SK12"), class = "factor"), facet = structure(c(1L,
3L, 1L, 4L, 5L, 3L, 4L, 1L, 1L, 1L, 5L, 5L, 4L, 5L, 3L, 1L, 2L,
4L, 5L, 2L, 1L, 4L, 2L, 5L, 2L, 3L, 4L, 3L, 2L, 5L, 5L, 4L, 2L,
5L, 4L, 5L, 3L, 1L, 4L, 5L, 3L, 5L, 4L, 1L, 5L, 2L, 4L, 1L), .Label = c("Intercept",
"Extreme Left", "Center", "Right", "Extreme Right"), class = "factor"),
conf.low = c(1.16824810706745, 0.686215051613965, 0.910277310292764,
0.591705078386698, 0.37357342399703, 0.947951001435781, 0.663296044193037,
1.18794112232166, 1.06645119085865, 2.33578182814618, 0.580210898576738,
0.564235690522211, 0.530859530342114, 0.516191258265551,
0.730992343373883, 0.862424540370486, 0.827891784352444,
0.427638276259852, 0.275692447335368, 0.829763907986328,
0.370078643492081, 0.321852705445509, 0.83550621863293, 0.289836810427436,
0.847226120408727, 0.780056160572728, 0.873143885861924,
0.869757467125519, 0.615741777890997, 0.649483531741787,
0.349657606457465, 0.523294407847395, 0.670109418373736,
0.36656743494149, 0.952201390937053, 0.777207016700884, 0.888128473009524,
0.397085597526946, 0.479828726362257, 0.614533313431094,
0.813336887981082, 0.3129232351085, 0.61435321820328, 0.854801028643867,
0.346698059397102, 0.805414039007076, 0.434676644041643,
1.07780736338027), conf.high = c(1.70315275860739, 1.06494933995261,
1.47855797769819, 1.28312522319126, 1.7272277157504, 1.45743211956315,
1.49652679976667, 1.8925358720741, 1.67802460909168, 3.19512520208851,
2.44607918797515, 2.25369471581694, 1.05041423643869, 1.8828182806291,
1.13872035780431, 1.44368725318228, 1.79353596677755, 1.06769546329854,
1.16593171156554, 2.11938292490653, 0.729667639003753, 1.08526995489865,
2.05223919950836, 1.29954170985538, 1.51498719434776, 1.15888977865399,
1.54095070825389, 1.4292376699955, 1.47610807594453, 2.97492484321718,
1.80395225460704, 1.18824770090216, 1.52521060717706, 2.19697554354282,
2.01136404338166, 2.37122858469145, 1.29357889999432, 0.740322123703373,
1.19469713534712, 3.38237391450413, 1.35145693795059, 1.90755095606211,
1.25847381058047, 1.39942645489832, 1.60297301142912, 1.82417470710871,
0.907332092210651, 1.68753999308876)), row.names = c(1L,
9L, 17L, 25L, 33L, 41L, 49L, 57L, 65L, 128L, 136L, 144L, 152L,
160L, 168L, 176L, 184L, 192L, 200L, 283L, 291L, 299L, 307L, 315L,
323L, 331L, 339L, 347L, 355L, 363L, 371L, 379L, 442L, 450L, 458L,
466L, 474L, 512L, 520L, 528L, 536L, 544L, 552L, 640L, 648L, 656L,
664L, 672L), class = "data.frame")
I have a data frame like this:
structure(list(x = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L,
50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L,
63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L,
76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L, 109L, 110L, 112L,
113L, 114L, 115L, 116L, 117L, 118L, 119L, 120L, 121L, 123L, 124L,
125L, 127L, 128L, 129L, 130L, 132L, 133L, 134L, 135L, 136L, 137L,
138L, 139L, 140L, 141L, 142L, 143L, 145L, 146L, 147L, 148L, 149L,
150L, 151L, 152L, 153L, 154L, 155L, 158L, 160L, 163L, 164L, 166L,
167L, 169L, 170L, 173L, 174L, 178L, 179L, 181L, 182L, 183L, 186L,
187L, 191L, 192L, 193L, 194L, 197L, 198L, 200L, 205L, 208L, 209L,
213L, 214L, 216L, 217L, 220L, 222L, 223L, 225L, 229L, 233L, 235L,
237L, 242L, 243L, 244L, 251L, 253L, 254L, 255L, 261L, 262L, 263L,
264L, 267L, 268L, 269L, 270L, 276L, 281L, 282L, 284L, 285L, 287L,
289L, 293L, 295L, 297L, 299L, 301L, 306L, 308L, 315L, 317L, 318L,
320L, 327L, 330L, 336L, 337L, 345L, 346L, 355L, 359L, 376L, 377L,
379L, 384L, 387L, 388L, 402L, 405L, 408L, 415L, 420L, 421L, 427L,
428L, 429L, 430L, 437L, 438L, 439L, 440L, 446L, 448L, 453L, 456L,
469L, 472L, 476L, 478L, 481L, 483L, 486L, 487L, 488L, 497L, 500L,
502L, 504L, 507L, 512L, 525L, 530L, 531L, 543L, 546L, 550L, 578L,
581L, 598L, 601L, 680L, 689L, 693L, 712L, 728L, 746L, 768L, 790L,
794L, 840L, 851L, 861L, 928L, 969L, 1010L, 1180L, 1698L), freq = c(29186L,
12276L, 5851L, 3938L, 3133L, 1894L, 1157L, 820L, 597L, 481L,
398L, 297L, 269L, 251L, 175L, 176L, 153L, 130L, 117L, 108L, 93L,
83L, 58L, 84L, 60L, 43L, 59L, 51L, 57L, 53L, 38L, 38L, 32L, 35L,
28L, 27L, 29L, 22L, 24L, 29L, 30L, 23L, 26L, 19L, 19L, 25L, 14L,
22L, 16L, 12L, 15L, 14L, 11L, 13L, 18L, 10L, 17L, 20L, 7L, 9L,
2L, 8L, 12L, 8L, 7L, 10L, 10L, 9L, 6L, 6L, 9L, 5L, 11L, 4L, 5L,
5L, 10L, 4L, 6L, 1L, 4L, 7L, 3L, 4L, 3L, 2L, 3L, 5L, 7L, 2L,
2L, 3L, 2L, 4L, 7L, 1L, 3L, 5L, 5L, 3L, 5L, 2L, 2L, 2L, 3L, 2L,
5L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 3L, 2L, 2L, 1L,
3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 4L, 3L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 3L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 4L, 4L, 1L, 2L,
2L, 4L, 2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L,
3L, 2L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("x",
"freq"), row.names = c(NA, -296L), class = "data.frame")
After the x value of 130, there are missing values. Is there a way I can make this a continuous data frame in increments of 1, i.e. from 1 to 1698, populating the entire list and setting the elements that do not have a value here to 0? What I mean is:
1,2
4,5
5,7
should be converted to:
1,2
2,0
3,0
4,5
5,7
Any suggestions?
You can also use merge (assuming your data is stored in l):
# Fill the gaps in `x` so that it runs 1, 2, ..., max(x) without holes. The
# upper bound is taken from the data instead of being hard-coded as 1698.
l <- merge(l, data.frame(x = seq_len(max(l$x))), all = TRUE, by = "x")
# Rows introduced by the merge have no frequency yet; count them as zero.
# 0L (not 0) keeps an integer `freq` column integer.
l$freq[is.na(l$freq)] <- 0L
I'd create a data set of values that aren't covered by column x and then create a dataframe of those values and assign 0 to the freq of all of these x values. Then rbind and order by x.
# I called your data dat
# Every x value from 1 up to the largest one observed.
y <- seq_len(max(dat$x))
# The x values that never occur, each with a frequency of zero. `freq` is
# sized from the number of rows, so this also works when nothing is missing
# (data.frame(x = integer(0), freq = 0) would fail with differing row counts).
dat2 <- data.frame(x = setdiff(y, dat$x))
dat2$freq <- rep(0L, nrow(dat2))
dat3 <- rbind(dat, dat2)
dat4 <- dat3[order(dat3$x), ] # could stop here
rownames(dat4) <- NULL # but I hate non sequential row names
dat4