I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
X <- split(data, list(data$geo,data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
list2env(X, envir = .GlobalEnv)
## create a function that returns a logical value
isEmpty <- function(x) {
  # Anything that is not a data frame can never count as "empty" here.
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  # A data frame is empty exactly when it has zero rows.
  nrow(x) == 0L
}
## apply it over the environment
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the empties
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Names of all objects in the current environment that are data frames.
# vapply() always returns a logical vector (also when ls() is empty), whereas
# sapply() would return list() there and break the subsetting.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
# Named list holding those data frames.
df_list <- mget(df_names)
# Add the new column and return the whole data frame; a body consisting only
# of the assignment would return the assigned value instead of `d`.
result_list <- lapply(df_list, function(d) {
  d$new_col <- <code for new column>
  d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
sapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
data %>%
group_by(geo, os) %>%
summarize(quantile_15 = quantile(cpm, .15),
quantile_80 = quantile(cpm, 0.8))
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
setDT(data)
data[, as.list(quantile(cpm, c(0.15, 0.8))), by = .(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01
Related
I have this script, I want to know how I can replace summarise_each() with the across() function?
common_bw_elements = df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise_each(funs(sum), sum_of_instances = frequent)
I am asking this, as I get the following message:
Warning message: summarise_each() is deprecated as of dplyr 0.7.0. Please use across() instead.
My code is very similar to the following post: summarize groups into intervals using dplyr
Any leads on this would be greatly appreciated.
For reference, you can use the following dput()
dput(df)
structure(list(common_IDs = c(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 17L, 18L, 25L, 26L, 27L, 37L, 51L, 55L, 56L, 63L, 68L, 69L, 70L, 71L, 74L, 76L, 81L, 84L, 86L, 87L, 89L, 90L, 91L, 92L, 101L,
103L, 108L, 109L, 110L, 113L, 114L, 115L, 116L, 129L, 130L, 131L, 133L, 135L, 136L, 137L, 138L, 139L, 141L, 152L, 153L, 154L, 177L, 178L, 190L, 191L, 196L, 199L, 202L, 203L, 208L, 209L, 210L, 211L, 213L, 214L, 215L, 216L, 218L, 219L, 222L, 223L, 229L, 230L, 231L,
232L, 239L, 251L, 252L, 254L, 257L, 264L, 265L, 271L, 272L, 273L, 275L, 276L, 277L, 280L, 293L, 294L, 297L, 298L, 299L, 300L, 301L, 304L, 317L, 320L, 337L, 346L, 347L, 364L, 371L, 373L, 386L, 387L, 389L, 412L, 417L, 419L, 420L, 432L, 440L, 441L, 442L, 443L, 451L,
452L, 453L, 455L, 456L, 457L, 458L, 462L, 463L, 464L, 469L, 470L, 474L, 476L, 477L, 478L, 487L, 488L, 492L, 1484L, 1534L, 1546L, 1561L, 1629L, 1642L, 1670L, 1672L, 1681L, 1698L, 1723L, 1725L,
1736L, 1738L, 1745L, 1753L, 1759L, 1764L, 1766L, 1767L, 1770L, 1772L, 1775L, 1776L, 1781L, 1784L, 1787L, 1791L, 1802L, 1807L, 1813L, 1815L, 1817L, 1821L, 1823L, 1825L, 1846L, 1850L, 1852L,
1853L, 1854L, 1857L, 1858L, 1859L, 1868L, 1899L, 1904L, 1911L, 1913L, 1977L, 1997L, 1999L, 2023L, 2079L),
frequent = c(81L, 75L, 10L, 17L, 4L, 4L, 33L, 13L, 31L, 3L, 19L, 22L, 6L, 1L, 11L, 2L,
1L, 1L, 3L, 14L, 1L, 2L, 1L, 14L, 1L, 9L, 6L, 9L, 2L, 5L, 13L, 4L, 4L, 1L, 4L, 1L, 3L, 1L, 6L, 2L, 1L, 3L, 2L, 5L, 2L, 1L, 17L, 5L, 4L, 4L, 1L, 4L, 7L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 6L,
16L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 5L, 13L, 6L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 4L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 3L, 5L, 1L, 3L, 1L, 3L, 4L, 1L, 1L, 2L, 3L, 4L, 3L, 3L, 1L, 3L, 2L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -193L))
You can use summarise since you are only summing one variable by group.
library(tidyverse)
common_bw_elements = df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise(sum_of_instances = sum(frequent))
Output
range_of_commons sum_of_instances
<fct> <int>
1 (-Inf,0] 81
2 (0,5] 110
3 (5,10] 46
4 (10,20] 34
5 (20,30] 47
6 (30,60] 15
7 (60,100] 85
8 (100,200] 87
9 (200,300] 92
10 (300,600] 75
11 (1.2e+03,1.8e+03] 29
12 (1.8e+03, Inf] 28
If you had multiple columns to sum, then we would use across (or, if you only had a few columns, then instead of everything() you can provide a vector of column names, e.g., c(common_IDs, frequent)):
df %>%
group_by(range_of_commons = cut(common_IDs,
breaks= c(-Inf,0, 5, 10, 20, 30, 60, 100, 200, 300, 600, 1200, 1800, Inf))) %>%
summarise(across(everything(), ~ sum(.x))) %>%
rename(sum_of_instances = frequent)
Output
range_of_commons common_IDs sum_of_instances
<fct> <int> <int>
1 (-Inf,0] 0 81
2 (0,5] 15 110
3 (5,10] 13 46
4 (10,20] 35 34
5 (20,30] 78 47
6 (30,60] 199 15
7 (60,100] 1191 85
8 (100,200] 3928 87
9 (200,300] 9392 92
10 (300,600] 17290 75
11 (1.2e+03,1.8e+03] 47829 29
12 (1.8e+03, Inf] 48922 28
This is my first question, so please be patient with me.
I want to split one column of a tibble into two columns depending on the value of a third column.
My table looks like this so far
Wertetabelle <- tibble(DAT$Tag, DAT$Lauf, DAT$Replikate, DAT$Wert) %>% group_by(DAT$Lauf)
Wertetabelle %>%
mutate_all(linebreak) %>%
kable(booktabs = T, digits = 2,
caption = "Rohdaten der PCR Messungen",
col.names = linebreak(c("Tag", " Lauf", "Replikat", "Wert"), align = "r")) %>%
kable_styling(latex_options = c("striped", "hold_position"))
This, unfortunately, gives me a very long table. The column "Wert" has at least 80 values.
So depending on the "Replikat" column which has two values (1:2) I could split up "Wert" into two columns with 40 values each.
Unfortunately, the group_by doesn't work, it seems.
Do you have any idea?
Tag has 20 values 1:20
Lauf has 2 values 1:2
Replikat has 2 values 1:2
Wert is numeric
Best
Werek
as requested please find the results of dput(.)
structure(list(`DAT$Tag` = structure(c(1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L,
15L, 16L, 17L, 18L, 19L, 20L), .Label = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20"), class = "factor"), `DAT$Lauf` = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), `DAT$Replikate` = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("1",
"2"), class = "factor"), `DAT$Wert` = c(242L, 243L, 247L, 249L,
246L, 244L, 241L, 245L, 243L, 244L, 252L, 249L, 242L, 246L, 247L,
240L, 241L, 244L, 241L, 247L, 246L, 242L, 239L, 241L, 242L, 245L,
246L, 245L, 239L, 246L, 251L, 248L, 240L, 249L, 248L, 238L, 244L,
244L, 239L, 240L, 245L, 238L, 241L, 250L, 243L, 251L, 245L, 243L,
244L, 247L, 247L, 251L, 251L, 248L, 245L, 239L, 245L, 237L, 247L,
245L, 246L, 238L, 240L, 245L, 240L, 247L, 247L, 245L, 245L, 239L,
241L, 246L, 245L, 240L, 246L, 242L, 248L, 242L, 245L, 242L)), row.names = c(NA,
-80L), groups = structure(list(`DAT$Lauf` = structure(1:2, .Label = c("1",
"2"), class = "factor"), .rows = structure(list(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 49L, 50L,
51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L), c(21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L,
69L, 70L, 71L, 72L, 73L, 74L, 75L, 76L, 77L, 78L, 79L, 80L)), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = 1:2, class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I've got the table of 55 observations with 5 variables (F,H,R,T,U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into the training set (70%) and test set (30%). Then I run adaboosting and check how it works.
I want to get the AdaBoost error distribution for 100 samplings. But the distribution turns out to be discrete, taking only five distinct values: 0, 0.0588235294117647, 0.117647058823529, 0.176470588235294 and 0.235294117647059. It doesn't change with the mfinal argument. I guess there should be more! How does this work?
I use the following code:
# Repeat a random 70/30 train/test split 100 times, fit a boosting model on
# each training part and collect the error reported for the held-out rows.
n_runs <- 100L
predictions <- vector("list", n_runs)  # preallocated, one slot per run
for (i in seq_len(n_runs)) {
  # Row indices of the training part; all remaining rows form the test set.
  train.ind <- sample(nrow(df), nrow(df) * 0.7)
  # The fitted model of run i is stored under the name ada<i>.
  assign(paste0("ada", i), do.call(boosting,
    c(formula = Group ~ F + H + R + T + U,
      data = substitute(df[train.ind, ]), mfinal = 50, boos = FALSE,
      coeflearn = 'Breiman'), envir = parent.frame()))
  # Predict with the model fitted in THIS run. The bare name `ada` is never
  # assigned by this loop, so it must be looked up as ada<i>.
  assign(paste0("pred", i),
    predict(get(paste0("ada", i)), df[-train.ind, ]))
  # NOTE(review): presumably $error is the misclassified share of the test
  # rows, so it can only be a multiple of 1 / (number of test rows) - confirm.
  predictions[[i]] <- get(paste0("pred", i))$error
}
hist(100*unlist(predictions),breaks=10,
main="Error probability [%] ntrees=10. 100 sampling operations", xlab="AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")
I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
X <- split(data, list(data$geo,data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
list2env(X, envir = .GlobalEnv)
## create a function that returns a logical value
isEmpty <- function(x) {
  # Anything that is not a data frame can never count as "empty" here.
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  # A data frame is empty exactly when it has zero rows.
  nrow(x) == 0L
}
## apply it over the environment
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the empties
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Names of all objects in the current environment that are data frames.
# vapply() always returns a logical vector (also when ls() is empty), whereas
# sapply() would return list() there and break the subsetting.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
# Named list holding those data frames.
df_list <- mget(df_names)
# Add the new column and return the whole data frame; a body consisting only
# of the assignment would return the assigned value instead of `d`.
result_list <- lapply(df_list, function(d) {
  d$new_col <- <code for new column>
  d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
sapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
# One row per geo/os pair with its 15% and 80% cpm quantiles.
# summarize() only peels off the last grouping level, so the result would
# still be grouped by geo; ungroup() returns a plain, ungrouped tibble.
data %>%
  group_by(geo, os) %>%
  summarize(
    quantile_15 = quantile(cpm, 0.15),
    quantile_80 = quantile(cpm, 0.8)
  ) %>%
  ungroup()
# # A tibble: 81 x 4
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
# Turn `data` into a data.table in place.
setDT(data)
# One row per geo/os pair; as.list() spreads the two quantiles into columns.
data[, as.list(quantile(cpm, probs = c(0.15, 0.8))), by = list(geo, os)]
# geo os 15% 80%
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01
My task is to extract all values in column "2" after sorting by factor level in another column, "3" (for the interested, I am sorting fasta sequences by organism). I am using this very simple code to get what I need.
# the original output file includes many empty cells, hence fill = TRUE
df <- read.table("outfile.txt", fill = TRUE)
# df is available at the bottom of this post
# splitting by factors
list1 <- split(df, df$V3)
# extract all values in column 2
list2 <- lapply(list1, function(x) as.data.frame(x$V2))
# writing results to file, one "<V3 value>.txt" per group; paste0() is used
# because paste() would insert a space before ".txt" in every file name
for (x in names(list2)) {
  write.table(
    list2[[x]],
    file = paste0(x, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  )
}
This works well on a small df. However, the actual output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8 GB RAM), but the second command is extremely slow (or R just hangs).
So I am asking the community whether there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))
Use the data.table package combined with write.table:
order by V3, then write the V2 values to a separate file for each group in V3.
library(data.table)
# For each V3 group (rows ordered by V3), write its V2 values to "<V3>.txt".
# quote, row.names and col.names are switched off so each file holds only the
# bare V2 values, as in the question's own write.table() call.
setDT(df)[
  order(V3),
  write.table(
    V2,
    file = paste0(V3, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  ),
  by = V3
]
This worked for me but I cannot speak for how fast it would be on your machine.
# For every distinct V3 value, write the matching V2 entries to "<value>.txt"
# with no quotes, row names or header.
lapply(unique(df$V3), function(key) {
  write.table(
    df$V2[which(df$V3 == key)],
    file = paste0(key, ".txt"),
    quote = FALSE, row.names = FALSE, col.names = FALSE
  )
})