Widening Data and Changing Columns - r

I have managed to delete a little bit of code that did the below task and can't for the life of me figure out how I did it before.
I want to widen the data that has two factors spread over 8 different 'waves'. There are four 'Paper' factors, each with the same four internal factors 'Response'. The output from a previously required function gives the following dataframe:
[
And I would like to make it look like this:
The single column of the first tibble has become the single row of the second tibble.
As you can see, the second tibble has extra factors of Paper but these can just be joined row wise.
I really wasn't sure how to attack this, but thought it would be done using the pivot_wider function. When I tried
times_correct <- times_19 %>%
pivot_wider( id_cols = c('Stay/remain in the EU`', 'Leave the EU', 'I would/will not vote', 'Don\'t know'), names_from = eurrefcolnames)
I got the error that I can't subset columns that don't exist which makes sense: I need to manually add the correct 'Waves'. I think this is relatively simple, but can't for the life of me figure out how I did it!
Here is the dput of the various tibbles:
structure(list(resp = structure(c(3L, 2L, 4L, 1L, NA, NA, NA,
NA), .Label = c("Don't Know", "Leave", "Remain", "Will Not Vote"
), class = "factor"), `Stay/remain in the EU` = c(316L, 290L,
313L, 324L, 338L, 320L, 325L, 335L), `Leave the EU` = c(157L,
123L, 159L, 154L, 134L, 189L, 187L, 181L), `I would/will not vote` = c(2L,
3L, 3L, 3L, 2L, 2L, 2L, 0L), `Don't know` = c(56L, 51L, 55L,
50L, 57L, 20L, 17L, 0L), Paper = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "Times", class = "factor")), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
structure(list(resp = structure(c(3L, 2L, 4L, 1L, 3L, 2L, 4L,
1L, 3L, 2L, 4L, 1L, 3L, 2L, 4L, 1L, 3L, 2L, 4L, 1L), .Label = c("Don't Know",
"Leave", "Remain", "Will Not Vote"), class = "factor"), euRefVoteW1 = c(316L,
157L, 2L, 56L, 190L, 339L, 4L, 70L, 819L, 79L, 9L, 71L, 1294L,
1311L, 150L, 523L, 1715L, 2587L, 133L, 630L), euRefVoteW2 = c(290L,
123L, 3L, 51L, 175L, 282L, 3L, 62L, 777L, 74L, 5L, 62L, 1091L,
925L, 80L, 371L, 1528L, 2044L, 83L, 517L), euRefVoteW3 = c(313L,
159L, 3L, 55L, 199L, 334L, 4L, 69L, 835L, 81L, 10L, 57L, 1348L,
1289L, 139L, 508L, 1766L, 2563L, 156L, 586L), euRefVoteW4 = c(324L,
154L, 3L, 50L, 215L, 328L, 2L, 61L, 848L, 70L, 10L, 55L, 1397L,
1267L, 128L, 492L, 1853L, 2494L, 143L, 583L), euRefVoteW6 = c(338L,
134L, 2L, 57L, 241L, 286L, 2L, 77L, 853L, 68L, 5L, 57L, 1519L,
1133L, 112L, 520L, 2017L, 2284L, 106L, 667L), euRefVoteW7 = c(320L,
189L, 2L, 20L, 186L, 384L, 2L, 34L, 832L, 109L, 8L, 34L, 1449L,
1456L, 87L, 292L, 1906L, 2785L, 55L, 328L), euRefVoteW8 = c(325L,
187L, 2L, 17L, 187L, 384L, 1L, 34L, 836L, 118L, 5L, 24L, 1462L,
1522L, 72L, 228L, 1898L, 2852L, 56L, 268L), euRefVoteW9 = c(335L,
181L, 0L, 0L, 206L, 385L, 0L, 6L, 844L, 102L, 0L, 4L, 1572L,
1462L, 0L, 21L, 2018L, 2827L, 0L, 20L), Paper = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L), .Label = c("Times", "Telegraph", "Control", "No_Paper",
"Rest"), class = "factor")), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
# Names of the wave columns in the combined tibble above. Each entry must match
# a column name exactly: a stray leading space (as in the earlier
# ' euRefVoteW7') makes tidyselect fail with "can't subset columns that don't
# exist".
eurrefcolnames <- c(
  "euRefVoteW1", "euRefVoteW2", "euRefVoteW3", "euRefVoteW4",
  "euRefVoteW6", "euRefVoteW7", "euRefVoteW8", "euRefVoteW9"
)
EDIT:
Here is the function that creates the initial dataframes. Is there an edit I could make here, perhaps?
# Tally the responses in every column of `input_dataframe` for one newspaper.
#
# Arguments:
#   input_dataframe  data frame with a `Paper` column; every other column is
#                    tabulated (presumably one column per wave - confirm with
#                    the caller).
#   newspaper_name   value of `Paper` whose rows are kept.
#
# Returns a data frame with `resp` first, the tabulated columns next and
# `Paper` last; `resp` and `Paper` are both factors.
tally_reader_number <- function(input_dataframe, newspaper_name) {
  # Keep only this paper's rows. The grouping is dropped explicitly because the
  # steps below do not work on grouped input.
  paper_rows <- input_dataframe %>%
    filter(Paper == newspaper_name) %>%
    ungroup() %>%
    select(-Paper)

  # table() each remaining column via purrr::map_df, then expose the row names
  # as a `response` column so they can be recoded.
  wave_counts <- paper_rows %>%
    map_df(table) %>%
    rownames_to_column("response")

  # Recode the row codes to labels; rows with any other code get NA.
  labelled_counts <- wave_counts %>%
    mutate(
      resp = case_when(
        response == 1 ~ "Remain",
        response == 2 ~ "Leave",
        response == 3 ~ "Will Not Vote",
        response == 4 ~ "Don't Know"
      )
    ) %>%
    select(resp, everything(), -response) %>%
    mutate(Paper = newspaper_name)

  labelled_counts$Paper <- as.factor(labelled_counts$Paper)
  labelled_counts$resp <- as.factor(labelled_counts$resp)
  labelled_counts
}

Related

T-tests on specific subgroups

I'm trying to calculate mean and SD and then perform t.tests on three different measurements (height, weight, speed) between multiple subgroups.
I started with a simple dataset that only contains two groups (control vs drug) and I have it all working well enough.
simple.df<-
structure(list(trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), levels = c("control", "drug"), class = "factor"), height = c(15,
17, 25, 21, 11, 29, 18, 20), weight = c(80, 90, 81, 79, 200,
230, 215, 210), speed = c(50, 45, 60, 51, 52, 80, 41, 19)), class = "data.frame", row.names = c(NA,
-8L))
library(rstatix)
simple.df %>% group_by(trial) %>% get_summary_stats(type = "mean_sd")
testing<- data.frame(lapply(simple.df[-1], function(x) t.test(x~simple.df$trial)$p.value))
testing
Where I'm running into trouble is with the t.testing on a larger experiment similar to the dataframe below. I still have control vs drug and height, weight & speed, but now all the measurements were done at two timepoints in both males and females. I'm only concerned with comparing control versus drug for the same sex/age. I'm still ok calculating the mean and SD for each group, but have gotten stuck with figuring out the t-testing.
Specifically, I just want the t-test on each of the three measurements for drug vs control in young males, drug vs control in old males, drug vs control in young females and drug vs control in old females, so 12 p-values total with some identification for what comparison each value represents.
Thanks for your help and expertise!
big.df<- structure(list(age = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("old", "young"
), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("f", "m"), class = "factor"),
trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("control", "drug"
), class = "factor"), height = c(15L, 17L, 25L, 21L, 11L,
29L, 18L, 20L, 300L, 320L, 316L, 325L, 170L, 175L, 172L,
180L, 28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L, 250L, 260L,
240L, 248L, 11L, 19L, 16L, 4L), weight = c(80L, 90L, 81L,
79L, 200L, 230L, 215L, 210L, 152L, 150L, 148L, 155L, 160L,
158L, 157L, 140L, 176L, 164L, 135L, 196L, 175L, 178L, 120L,
147L, 160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L), speed = c(50L,
45L, 60L, 51L, 52L, 80L, 41L, 19L, 55L, 56L, 61L, 67L, 85L,
90L, 100L, 77L, 90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L, 50L,
60L, 55L, 59L, 71L, 65L, 66L, 62L)), row.names = c(NA, -32L
), class = "data.frame")
big.df %>% group_by (sex, age, trial) %>%
get_summary_stats (type = "mean_sd") %>%
arrange (variable, sex, age, trial)
RYann had a good idea by defining a function to pull out subgroups and then doing all the t-tests on each subgroup. That approach was helpful.
I ended up building on his strategy and simplifying things a bit more by vectorizing the t-tests inside the function using lapply. I then stored each of the age/sex combinations in a dataframe and used mapply to pass those combinations to the t-testing function.
# t-test (split by trial) on each measurement column for one age/sex subgroup.
#
# Arguments:
#   a_age  age level to keep (compared against big.df$age).
#   a_sex  sex level to keep (compared against big.df$sex).
#
# Returns a one-row data frame with one p-value per column 4:6 of big.df
# (height, weight, speed).
t.script <- function(a_age, a_sex) {
  group <- big.df %>% filter(age == a_age & sex == a_sex)
  # group$trial is the two-level grouping factor, so each call compares the
  # trial groups within this subgroup only.
  data.frame(lapply(group[4:6], function(x) t.test(x ~ group$trial)$p.value))
}
combos <- data.frame(age = c("young","young","old","old"),
sex = c("m","f","m","f"))
t.test.df <- data.frame(mapply(t.script, a_age = combos$age, a_sex = combos$sex))
colnames(t.test.df) <- paste(combos$age, combos$sex, sep = " ")
young m
young f
old m
old f
height
1
1.939896e-05
0.01175771
1.630232e-08
weight
4.435875e-05
0.6368126
0.5196617
0.1299121
speed
0.80433
0.004320253
0.1526353
0.01539331
I hope this code will work out for you
big.df<- structure(list(age = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("old", "young"
), class = "factor"), sex = structure(c(2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), levels = c("f", "m"), class = "factor"),
trial = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L), levels = c("control", "drug"
), class = "factor"), height = c(15L, 17L, 25L, 21L, 11L,
29L, 18L, 20L, 300L, 320L, 316L, 325L, 170L, 175L, 172L,
180L, 28L, 40L, 33L, 35L, 60L, 45L, 67L, 52L, 250L, 260L,
240L, 248L, 11L, 19L, 16L, 4L), weight = c(80L, 90L, 81L,
79L, 200L, 230L, 215L, 210L, 152L, 150L, 148L, 155L, 160L,
158L, 157L, 140L, 176L, 164L, 135L, 196L, 175L, 178L, 120L,
147L, 160L, 155L, 175L, 142L, 139L, 142L, 150L, 145L), speed = c(50L,
45L, 60L, 51L, 52L, 80L, 41L, 19L, 55L, 56L, 61L, 67L, 85L,
90L, 100L, 77L, 90L, 80L, 77L, 80L, 81L, 95L, 87L, 91L, 50L,
60L, 55L, 59L, 71L, 65L, 66L, 62L)), row.names = c(NA, -32L
), class = "data.frame")
# Compute the three trial comparisons (height, weight, speed) for one
# sex/age subgroup of big.df.
#
# Arguments:
#   a_sex  sex level to keep.
#   a_age  age level to keep.
#
# Returns a 1 x 3 numeric matrix of t-test p-values with columns
# height, weight and speed.
multi_t <- function(a_sex, a_age) {
  subgroup <- big.df %>% filter(sex == a_sex, age == a_age)
  measures <- c("height", "weight", "speed")

  # One t-test of <measure> ~ trial per measurement, keeping only the p-value.
  p_vals <- vapply(
    measures,
    function(m) {
      t.test(reformulate("trial", response = m), data = subgroup)$p.value
    },
    numeric(1)
  )

  matrix(p_vals, nrow = 1, dimnames = list(NULL, measures))
}
# Table in a long version
ptable <- data.frame(
multi_t("m","young"),
multi_t("m","old"),
multi_t("f","young"),
multi_t("f","old")
) %>% pivot_longer(cols=everything(),
names_to = "value",
values_to = "p.values") %>%
mutate(comparison = rep(c("young males","old males",
"young females","old females"),each=3),
value=str_remove_all(value,"\\.\\d"))
ptable
# Table in a wider version
ptable %>% group_by(value) %>% mutate(id=row_number()) %>%
pivot_wider(names_from = value,values_from = p.values) %>%
select(-id)
# Plot the p-values per measurement, one facet per comparison, coloured by
# whether they fall below 0.05.
ptable %>%
  mutate(sig = p.values < 0.05) %>%
  ggplot(aes(x = value, y = p.values, color = sig)) +
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
  geom_point(show.legend = TRUE) +
  facet_wrap(~comparison, scales = "free") +
  theme(legend.position = "bottom") +
  labs(title = "P values of 3 different measurements",
       subtitle = "For 4 different populations")

Apply a function on all the element of the environment [duplicate]

I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
X <- split(data, list(data$geo,data$os))
Then I pulled data frames out from that list into the environment and deleted data frames with zero rows
list2env(X, envir = .GlobalEnv)
## create a function that returns a logical value
# TRUE only for a data frame that has zero rows; FALSE for any other object.
isEmpty <- function(x) {
  # Anything that is not a data frame is never counted as empty.
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  nrow(x) == 0L
}
## apply it over the environment
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the empties
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
df_names = ls()[sapply(mget(ls()), is.data.frame)]
df_list = mget(df_names)
result_list = lapply(df_list, function(d) d$new_col = <code for new column>)
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
sapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
data %>%
group_by(geo, os) %>%
summarize(quantile_15 = quantile(cpm, .15),
quantile_80 = quantile(cpm, 0.8))
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
setDT(data)
data[, as.list(quantile(cpm, c(0.15, 0.8))), by = .(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01

How to apply a function to a certain column for all the data frames in environment in R

I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
X <- split(data, list(data$geo,data$os))
Then I pulled data frames out from that list into the environment and deleted data frames with zero rows
list2env(X, envir = .GlobalEnv)
## create a function that returns a logical value
# TRUE only for a data frame that has zero rows; FALSE for any other object.
isEmpty <- function(x) {
  # Anything that is not a data frame is never counted as empty.
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  nrow(x) == 0L
}
## apply it over the environment
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the empties
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
df_names = ls()[sapply(mget(ls()), is.data.frame)]
df_list = mget(df_names)
result_list = lapply(df_list, function(d) d$new_col = <code for new column>)
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
sapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
data %>%
group_by(geo, os) %>%
summarize(quantile_15 = quantile(cpm, .15),
quantile_80 = quantile(cpm, 0.8))
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
setDT(data)
data[, as.list(quantile(cpm, c(0.15, 0.8))), by = .(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01

order geom_point by specific facet

I have a ggplot related question, which should be easy but I could not find the answer yet. I am trying to plot a faceted plot with the code below and this dataset (11 kB).
ggplot(plot.dat, aes(x = estimate, y = reorder(countryyear, estimate))) +
geom_point() +
geom_segment(aes(x=conf.low, xend=conf.high, yend=countryyear)) +
facet_grid(. ~ facet) +
xlab("Random Effect Estimate") +
ylab("") + scale_x_continuous(breaks=c(seq(0, 5, 1)), limits=c(0, 5)) +
ggtitle("Random Slopes in Country*Year Groups from Northwestern Europe") +
theme_minimal() + theme(plot.title = element_text(hjust = 0.5))
I would like countryyear to be organized by the values of estimate in the Extreme Right facet. Not quite sure how to order by values of a specific facet. Any ideas are welcome! Thanks.
Update: Here is the dput structure of a random subset of the dataset. It has some missing values, but it should work for the sake of the example. I also updated the download link above, that has the full version.
structure(list(estimate = c(1.41056902925372, 0.854859208455895,
1.16012834593894, 0.871339033194504, 0.803272289946221, 1.17540386134493,
0.996313357490551, 1.49940694539732, 1.33773365908762, 2.7318703090905,
1.19131935418045, 1.12765907711738, 0.746741192261761, 0.985847015192172,
0.912357310925342, 1.11582763712164, 1.21854572824977, 0.675712547978394,
0.566955524699616, 1.32611743759365, 0.519648352294682, 0.591013596394243,
1.30944973684044, 0.613722269599125, 1.13293279727271, 0.950788678552604,
1.1599446923567, 1.11493952112913, 0.95336321045095, 1.39002327097034,
0.794207546872633, 0.788545101449259, 1.01096883872495, 0.897407203907834,
1.38391605229103, 1.35754760293107, 1.0718508539761, 0.542191158958878,
0.757132752456427, 1.44172863221312, 1.04842251986171, 0.77260404885379,
0.879288027642055, 1.09372353598088, 0.745484830381145, 1.21211217249353,
0.628009608902132, 1.34864488674734), countryyear = structure(c(1L,
2L, 4L, 5L, 7L, 9L, 10L, 12L, 13L, 26L, 28L, 29L, 31L, 32L, 34L,
36L, 37L, 39L, 40L, 57L, 59L, 60L, 62L, 63L, 65L, 67L, 68L, 70L,
71L, 73L, 75L, 76L, 89L, 90L, 92L, 94L, 95L, 103L, 104L, 106L,
108L, 109L, 111L, 128L, 130L, 132L, 133L, 135L), .Label = c("AT02",
"AT04", "AT06", "AT14", "AT16", "BE02", "BE04", "BE06", "BE08",
"BE10", "BE12", "BE14", "BE16", "BG06", "BG08", "BG10", "BG12",
"CH14", "CZ02", "CZ04", "CZ08", "CZ10", "CZ12", "CZ14", "CZ16",
"DE02", "DE04", "DE06", "DE08", "DE10", "DE12", "DE14", "DE16",
"DK02", "DK04", "DK06", "DK08", "DK10", "DK12", "DK14", "EE04",
"EE06", "EE08", "EE10", "EE12", "EE14", "EE16", "ES02", "ES04",
"ES06", "ES08", "ES10", "ES12", "ES14", "ES16", "FI02", "FI04",
"FI06", "FI08", "FI10", "FI12", "FI14", "FI16", "FR06", "FR08",
"FR10", "FR12", "FR14", "FR16", "GB02", "GB04", "GB06", "GB08",
"GB10", "GB12", "GB14", "GB16", "GR02", "GR04", "GR08", "GR10",
"HU02", "HU06", "HU08", "HU10", "HU12", "HU14", "HU16", "IE02",
"IE04", "IE06", "IE08", "IE10", "IE12", "IE14", "IE16", "IT04",
"IT12", "IT16", "LT10", "LT12", "LT14", "NL02", "NL04", "NL06",
"NL08", "NL10", "NL12", "NL14", "NL16", "NO14", "PL02", "PL04",
"PL06", "PL08", "PL10", "PL12", "PL14", "PL16", "PT02", "PT04",
"PT06", "PT08", "PT10", "PT12", "PT14", "PT16", "SE02", "SE04",
"SE06", "SE08", "SE10", "SE12", "SE14", "SE16", "SI02", "SI04",
"SI06", "SI08", "SI10", "SI12", "SI14", "SI16", "SK04", "SK06",
"SK08", "SK10", "SK12"), class = "factor"), facet = structure(c(1L,
3L, 1L, 4L, 5L, 3L, 4L, 1L, 1L, 1L, 5L, 5L, 4L, 5L, 3L, 1L, 2L,
4L, 5L, 2L, 1L, 4L, 2L, 5L, 2L, 3L, 4L, 3L, 2L, 5L, 5L, 4L, 2L,
5L, 4L, 5L, 3L, 1L, 4L, 5L, 3L, 5L, 4L, 1L, 5L, 2L, 4L, 1L), .Label = c("Intercept",
"Extreme Left", "Center", "Right", "Extreme Right"), class = "factor"),
conf.low = c(1.16824810706745, 0.686215051613965, 0.910277310292764,
0.591705078386698, 0.37357342399703, 0.947951001435781, 0.663296044193037,
1.18794112232166, 1.06645119085865, 2.33578182814618, 0.580210898576738,
0.564235690522211, 0.530859530342114, 0.516191258265551,
0.730992343373883, 0.862424540370486, 0.827891784352444,
0.427638276259852, 0.275692447335368, 0.829763907986328,
0.370078643492081, 0.321852705445509, 0.83550621863293, 0.289836810427436,
0.847226120408727, 0.780056160572728, 0.873143885861924,
0.869757467125519, 0.615741777890997, 0.649483531741787,
0.349657606457465, 0.523294407847395, 0.670109418373736,
0.36656743494149, 0.952201390937053, 0.777207016700884, 0.888128473009524,
0.397085597526946, 0.479828726362257, 0.614533313431094,
0.813336887981082, 0.3129232351085, 0.61435321820328, 0.854801028643867,
0.346698059397102, 0.805414039007076, 0.434676644041643,
1.07780736338027), conf.high = c(1.70315275860739, 1.06494933995261,
1.47855797769819, 1.28312522319126, 1.7272277157504, 1.45743211956315,
1.49652679976667, 1.8925358720741, 1.67802460909168, 3.19512520208851,
2.44607918797515, 2.25369471581694, 1.05041423643869, 1.8828182806291,
1.13872035780431, 1.44368725318228, 1.79353596677755, 1.06769546329854,
1.16593171156554, 2.11938292490653, 0.729667639003753, 1.08526995489865,
2.05223919950836, 1.29954170985538, 1.51498719434776, 1.15888977865399,
1.54095070825389, 1.4292376699955, 1.47610807594453, 2.97492484321718,
1.80395225460704, 1.18824770090216, 1.52521060717706, 2.19697554354282,
2.01136404338166, 2.37122858469145, 1.29357889999432, 0.740322123703373,
1.19469713534712, 3.38237391450413, 1.35145693795059, 1.90755095606211,
1.25847381058047, 1.39942645489832, 1.60297301142912, 1.82417470710871,
0.907332092210651, 1.68753999308876)), row.names = c(1L,
9L, 17L, 25L, 33L, 41L, 49L, 57L, 65L, 128L, 136L, 144L, 152L,
160L, 168L, 176L, 184L, 192L, 200L, 283L, 291L, 299L, 307L, 315L,
323L, 331L, 339L, 347L, 355L, 363L, 371L, 379L, 442L, 450L, 458L,
466L, 474L, 512L, 520L, 528L, 536L, 544L, 552L, 640L, 648L, 656L,
664L, 672L), class = "data.frame")

How can I add missing sequence values?

I have a data frame like this:
structure(list(x = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L,
50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L,
63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L, 74L, 75L,
76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L, 104L, 105L, 106L, 107L, 108L, 109L, 110L, 112L,
113L, 114L, 115L, 116L, 117L, 118L, 119L, 120L, 121L, 123L, 124L,
125L, 127L, 128L, 129L, 130L, 132L, 133L, 134L, 135L, 136L, 137L,
138L, 139L, 140L, 141L, 142L, 143L, 145L, 146L, 147L, 148L, 149L,
150L, 151L, 152L, 153L, 154L, 155L, 158L, 160L, 163L, 164L, 166L,
167L, 169L, 170L, 173L, 174L, 178L, 179L, 181L, 182L, 183L, 186L,
187L, 191L, 192L, 193L, 194L, 197L, 198L, 200L, 205L, 208L, 209L,
213L, 214L, 216L, 217L, 220L, 222L, 223L, 225L, 229L, 233L, 235L,
237L, 242L, 243L, 244L, 251L, 253L, 254L, 255L, 261L, 262L, 263L,
264L, 267L, 268L, 269L, 270L, 276L, 281L, 282L, 284L, 285L, 287L,
289L, 293L, 295L, 297L, 299L, 301L, 306L, 308L, 315L, 317L, 318L,
320L, 327L, 330L, 336L, 337L, 345L, 346L, 355L, 359L, 376L, 377L,
379L, 384L, 387L, 388L, 402L, 405L, 408L, 415L, 420L, 421L, 427L,
428L, 429L, 430L, 437L, 438L, 439L, 440L, 446L, 448L, 453L, 456L,
469L, 472L, 476L, 478L, 481L, 483L, 486L, 487L, 488L, 497L, 500L,
502L, 504L, 507L, 512L, 525L, 530L, 531L, 543L, 546L, 550L, 578L,
581L, 598L, 601L, 680L, 689L, 693L, 712L, 728L, 746L, 768L, 790L,
794L, 840L, 851L, 861L, 928L, 969L, 1010L, 1180L, 1698L), freq = c(29186L,
12276L, 5851L, 3938L, 3133L, 1894L, 1157L, 820L, 597L, 481L,
398L, 297L, 269L, 251L, 175L, 176L, 153L, 130L, 117L, 108L, 93L,
83L, 58L, 84L, 60L, 43L, 59L, 51L, 57L, 53L, 38L, 38L, 32L, 35L,
28L, 27L, 29L, 22L, 24L, 29L, 30L, 23L, 26L, 19L, 19L, 25L, 14L,
22L, 16L, 12L, 15L, 14L, 11L, 13L, 18L, 10L, 17L, 20L, 7L, 9L,
2L, 8L, 12L, 8L, 7L, 10L, 10L, 9L, 6L, 6L, 9L, 5L, 11L, 4L, 5L,
5L, 10L, 4L, 6L, 1L, 4L, 7L, 3L, 4L, 3L, 2L, 3L, 5L, 7L, 2L,
2L, 3L, 2L, 4L, 7L, 1L, 3L, 5L, 5L, 3L, 5L, 2L, 2L, 2L, 3L, 2L,
5L, 7L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 3L, 2L, 2L, 1L,
3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 4L, 3L, 1L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 3L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 4L, 4L, 1L, 2L,
2L, 4L, 2L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 2L, 3L, 1L, 1L, 1L, 1L,
3L, 2L, 1L, 3L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 4L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("x",
"freq"), row.names = c(NA, -296L), class = "data.frame")
After the x value of 130, there are missing values. Is there a way to make this a continuous data frame in increments of 1, i.e. covering every x from 1 to 1698, with freq set to 0 for the x values that do not appear here? What I mean is:
1,2
4,5
5,7
should be converted to:
1,2
2,0
3,0
4,5
5,7
Any suggestions?
You can also use merge (assuming your data is stored in l):
# Outer-join the data against the full sequence 1..max(x); the upper bound is
# taken from the data itself instead of being hard-coded.
l <- merge(l, data.frame(x = seq_len(max(l$x))), all = TRUE, by = "x")
# x values that were absent from the original data come back with NA freq;
# fill them with 0L so an integer freq column stays integer.
l$freq[is.na(l$freq)] <- 0L
I'd find the values that are missing from column x, build a data frame of those values with freq set to 0, then rbind it to the original data and order the result by x.
# I called your data dat (a data frame with columns x and freq)
# Every x from 1 up to the largest observed x.
y <- seq_len(max(dat$x))
# x values that do not occur in dat.
missing_x <- setdiff(y, dat$x)
# One zero-frequency row per missing x. The filler is sized explicitly so that
# data.frame() does not error when there are no gaps at all, and 0L keeps an
# integer freq column integer after rbind().
dat2 <- data.frame(x = missing_x, freq = rep(0L, length(missing_x)))
dat3 <- rbind(dat, dat2)
dat4 <- dat3[order(dat3$x), ] # could stop here
rownames(dat4) <- NULL # but I hate non sequential row names
dat4

Resources