Speeding up a loop (extracting specific values from a data frame) - r

My task is to extract all values in column "2" after grouping by the factor levels in another column, "3" (for the interested, I am sorting FASTA sequences by organism). I am using this very simple code to get what I need.
# Read the raw table; fill = TRUE pads short rows with blanks because
# the original output file includes many empty cells.
df <- read.table("outfile.txt", fill = TRUE)
# df is available at the bottom of this post
# Split only column V2 by the values of V3. Splitting the whole data frame and
# then wrapping each piece in as.data.frame() builds many intermediate data
# frames, which is what makes the original approach slow on large inputs.
list2 <- split(df$V2, df$V3)
# Write one file per V3 value, named "<value>.txt". paste0() is used because
# paste() would insert a space before ".txt".
for (x in names(list2)) {
  write.table(list2[[x]], file = paste0(x, ".txt"), quote = FALSE,
              row.names = FALSE, col.names = FALSE)
}
This works well on a small df. However, the output file contains several gigabytes of data. I tried a subset (500,000 rows on my local machine with 8GB RAM), but the second command is extremely slow (or R just hangs).
So I am asking the community whether there is a better way to solve this. Thank you.
Here is df:
dput(df)
structure(list(V1 = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L), .Label = c("C", "U"), class = "factor"),
V2 = structure(c(10L, 2L, 27L, 29L, 25L, 32L, 28L, 39L, 40L,
22L, 8L, 7L, 19L, 38L, 15L, 3L, 16L, 26L, 34L, 13L, 17L,
18L, 14L, 41L, 44L, 12L, 45L, 46L, 5L, 1L, 31L, 4L, 37L,
11L, 43L, 20L, 21L, 30L, 23L, 35L, 24L, 42L, 9L, 33L, 36L,
6L), .Label = c("M02978:20:000000000-B8C4P:1:1101:11008:4137",
"M02978:20:000000000-B8C4P:1:1101:14389:3444", "M02978:20:000000000-B8C4P:1:1101:14986:3769",
"M02978:20:000000000-B8C4P:1:1101:15333:4161", "M02978:20:000000000-B8C4P:1:1101:15438:4092",
"M02978:20:000000000-B8C4P:1:1101:15516:4514", "M02978:20:000000000-B8C4P:1:1101:16313:3660",
"M02978:20:000000000-B8C4P:1:1101:16433:3650", "M02978:20:000000000-B8C4P:1:1101:16663:4462",
"M02978:20:000000000-B8C4P:1:1101:17179:3407", "M02978:20:000000000-B8C4P:1:1101:17779:4225",
"M02978:20:000000000-B8C4P:1:1101:18008:3981", "M02978:20:000000000-B8C4P:1:1101:18047:3851",
"M02978:20:000000000-B8C4P:1:1101:18920:3936", "M02978:20:000000000-B8C4P:1:1101:19086:3737",
"M02978:20:000000000-B8C4P:1:1101:19203:3783", "M02978:20:000000000-B8C4P:1:1101:19335:3908",
"M02978:20:000000000-B8C4P:1:1101:19520:3921", "M02978:20:000000000-B8C4P:1:1101:19612:3701",
"M02978:20:000000000-B8C4P:1:1101:19655:4289", "M02978:20:000000000-B8C4P:1:1101:19918:4313",
"M02978:20:000000000-B8C4P:1:1101:20321:3602", "M02978:20:000000000-B8C4P:1:1101:21089:4350",
"M02978:20:000000000-B8C4P:1:1101:22293:4406", "M02978:20:000000000-B8C4P:1:1101:22453:3490",
"M02978:20:000000000-B8C4P:1:1101:23026:3811", "M02978:20:000000000-B8C4P:1:1101:23065:3472",
"M02978:20:000000000-B8C4P:1:1101:23770:3507", "M02978:20:000000000-B8C4P:1:1101:23991:3472",
"M02978:20:000000000-B8C4P:1:1101:24290:4332", "M02978:20:000000000-B8C4P:1:1101:24415:4142",
"M02978:20:000000000-B8C4P:1:1101:25066:3498", "M02978:20:000000000-B8C4P:1:1101:25678:4466",
"M02978:20:000000000-B8C4P:1:1101:25992:3830", "M02978:20:000000000-B8C4P:1:1101:26431:4388",
"M02978:20:000000000-B8C4P:1:1101:26573:4479", "M02978:20:000000000-B8C4P:1:1101:5567:4179",
"M02978:20:000000000-B8C4P:1:1101:6323:3723", "M02978:20:000000000-B8C4P:1:1101:6675:3536",
"M02978:20:000000000-B8C4P:1:1101:6868:3559", "M02978:20:000000000-B8C4P:1:1101:7078:3965",
"M02978:20:000000000-B8C4P:1:1101:8145:4431", "M02978:20:000000000-B8C4P:1:1101:8449:4257",
"M02978:20:000000000-B8C4P:1:1101:8592:3966", "M02978:20:000000000-B8C4P:1:1101:9468:4026",
"M02978:20:000000000-B8C4P:1:1101:9970:4051"), class = "factor"),
V3 = c(926550L, 0L, 1121396L, 406818L, 1265505L, 1167006L,
1121399L, 0L, 177437L, 0L, 1536652L, 0L, 1196029L, 0L, 1178540L,
138119L, 0L, 1536652L, 186802L, 0L, 1322246L, 1232437L, 1196029L,
1121396L, 452637L, 0L, 0L, 0L, 1541959L, 1121403L, 96561L,
1167006L, 767528L, 0L, 0L, 653733L, 1423815L, 857293L, 0L,
0L, 0L, 468059L, 1167006L, 1232437L, 880073L, 761193L), V4 = c(171L,
NA, 264L, 88L, 356L, 257L, 128L, NA, 97L, NA, 243L, NA, 96L,
NA, 80L, 93L, NA, 138L, 155L, NA, 243L, 262L, 77L, 470L,
135L, NA, NA, NA, 124L, 161L, 211L, 202L, 91L, NA, NA, 146L,
98L, 93L, NA, NA, NA, 107L, 382L, 247L, 130L, 157L), V5 = structure(c(25L,
1L, 2L, 17L, 9L, 5L, 3L, 1L, 16L, 1L, 14L, 1L, 7L, 1L, 6L,
11L, 1L, 14L, 24L, 1L, 10L, 8L, 7L, 2L, 18L, 1L, 1L, 1L,
15L, 4L, 26L, 5L, 13L, 1L, 1L, 20L, 12L, 22L, 1L, 1L, 1L,
19L, 5L, 8L, 23L, 21L), .Label = c("", "1121396,", "1121399,",
"1121403,", "1167006,", "1178540,", "1196029,", "1232437,",
"1265505,", "1322246,", "138119,", "1423815,", "1460634,1460635,",
"1536652,", "1541959,", "177437,", "406818,", "452637,",
"468059,", "653733,", "761193,", "857293,", "880073,", "883109,888727,1161902,1230734,1392487,",
"926550,", "96561,"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5"), class = "data.frame", row.names = c(NA,
-46L))

Use the data.table package combined with write.table:
order by V3, then write the V2 column separately for each group in V3.
library('data.table')
# For each V3 group, write its V2 values to "<V3>.txt". Within a grouped j the
# by-column V3 is a single value, so paste0() yields one file name per group.
# quote/row.names/col.names are switched off so each file holds only the V2
# values, one per line, matching the output format asked for in the question.
setDT(df)[order(V3),
          write.table(V2, file = paste0(V3, ".txt"), quote = FALSE,
                      row.names = FALSE, col.names = FALSE),
          by = V3]

This worked for me but I cannot speak for how fast it would be on your machine.
# For every distinct V3 value, pick the matching rows and write their V2
# values (no quotes, no row or column names) to "<value>.txt".
lapply(unique(df$V3), function(level) {
  rows <- which(df$V3 == level)
  write.table(df$V2[rows], file = paste0(level, ".txt"),
              quote = FALSE, row.names = FALSE, col.names = FALSE)
})

Related

Apply a function on all the element of the environment [duplicate]

I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
# Attempt: apply quantile() to every object stored in the global environment.
env = .GlobalEnv
# eapply() hands each whole object to quantile(); nothing here selects the
# cpm column, so data frames are passed in as-is.
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
# Split data into a list with one data frame per geo/os combination; elements
# are named "<geo>.<os>". Combinations that have no rows still get an element
# (a zero-row data frame).
X <- split(data, list(data$geo,data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
# Copy every element of X into the global environment as a variable named
# after the list element.
list2env(X, envir = .GlobalEnv)
## predicate: TRUE only for a data frame that has no rows
isEmpty <- function(x) {
if (!is.data.frame(x)) {
return(FALSE)
}
nrow(x) == 0L
}
## run isEmpty on every object in the global environment; unlist() turns the
## result into a logical vector named after the objects
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the objects flagged TRUE, i.e. the zero-row data frames
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Names of all objects in the current environment that are data frames.
# vapply() always returns a logical vector (even when there are no objects),
# so the subscript into ls() is type-safe.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
# Named list holding those data frames.
df_list <- mget(df_names)
# Template: add a column to every data frame in df_list. The function must
# return d itself; an assignment on its own would return only the assigned
# value, so result_list would not contain data frames.
result_list <- lapply(df_list, function(d) {
d$new_col <- <code for new column>
d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
# 15% and 80% quantiles of cpm for each data frame. vapply() enforces that
# every result is a length-2 numeric, giving a 2-row matrix with one column
# per data frame (row names "15%" and "80%" come from quantile()).
vapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)), numeric(2))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
# One row per geo/os pair, with the 15% and 80% quantiles of cpm.
summarize(
group_by(data, geo, os),
quantile_15 = quantile(cpm, probs = .15),
quantile_80 = quantile(cpm, probs = 0.8)
)
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
# Convert data to a data.table by reference, then compute both quantiles of
# cpm per geo/os group. The result columns are named quantile_15 and
# quantile_80 to match the requested output; names = FALSE drops the "15%" /
# "80%" labels that quantile() would otherwise attach to the values.
setDT(data)
data[, .(quantile_15 = quantile(cpm, 0.15, names = FALSE),
         quantile_80 = quantile(cpm, 0.8, names = FALSE)),
     by = .(geo, os)]
# geo os quantile_15 quantile_80
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01

Why is prediction error discrete in adabag?

I've got the table of 55 observations with 5 variables (F,H,R,T,U) and 1 classifier variable ("Group") in which I have two groups.
I'm doing data sampling by splitting the data into the training set (70%) and test set (30%). Then I run adaboosting and check how it works.
I want to get the AdaBoost error distribution for 100 samplings. But the distribution turns out to be discrete, taking only five distinct values: 0, 0.0588235294117647, 0.117647058823529, 0.176470588235294 and 0.235294117647059. It doesn't change with the mfinal argument. I guess there should be more! How does this work?
I use the following code:
# Estimate the AdaBoost test error over 100 random 70/30 train/test splits.
# The error of one split is (misclassified test rows) / (number of test rows),
# so it can only take values k / n_test; with a small test set the resulting
# distribution is necessarily coarse.
predictions <- vector("list", 100)
for (i in seq_len(100)) {
  train.ind <- sample(nrow(df), nrow(df) * 0.7)
  # Fit on the training rows. The data argument is passed as the unevaluated
  # expression df[train.ind, ] and evaluated by do.call in the calling frame.
  fit <- do.call(boosting,
                 c(formula = Group ~ F + H + R + T + U,
                   data = substitute(df[train.ind, ]), mfinal = 50, boos = FALSE,
                   coeflearn = "Breiman"), envir = parent.frame())
  # Keep the per-iteration model available as ada1, ada2, ...
  assign(paste0("ada", i), fit)
  # Predict with the model fitted in THIS iteration; a bare `ada` is never
  # defined by this loop and would fail or pick up a stale workspace object.
  pred <- predict(fit, df[-train.ind, ])
  # Keep the per-iteration prediction available as pred1, pred2, ...
  assign(paste0("pred", i), pred)
  predictions[[i]] <- pred$error
}
# NOTE(review): the title says ntrees=10 while the models above use mfinal=50 - confirm.
hist(100 * unlist(predictions), breaks = 10,
     main = "Error probability [%] ntrees=10. 100 sampling operations", xlab = "AdaBoost error")
dput(df)
structure(list(Group = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Canines", "Sled"), class = "factor"), F = c(0.263150566678734,
0.260347316635598, 0.26437277258488, 0.265710057607949, 0.254866055219663,
0.263294264681227, 0.261901194801303, 0.257318268395066, 0.26420207103455,
0.252093225560912, 0.255473253732324, 0.259067858940115, 0.259528043446917,
0.267331491048901, 0.260246447333382, 0.26035486437815, 0.254553215708594,
0.274074579975413, 0.262896904742862, 0.260504330262876, 0.258329960879536,
0.262664861154909, 0.256148832094211, 0.258509128895957, 0.256292083925698,
0.262358651734143, 0.254578103664353, 0.255386025800537, 0.264120912009577,
0.275232714712253, 0.265375720277527, 0.267601768121804, 0.262932226832642,
0.263633189245163, 0.262826186070212, 0.261058637786334, 0.262979366135887,
0.259232168979912, 0.252933156025384, 0.263963451214447, 0.258511197058683,
0.261957295373665, 0.253412282699461, 0.260748166588172, 0.263136039863289,
0.255317062006506, 0.258822015633545, 0.252757763183064, 0.260840486010478,
0.258620689655172, 0.263738813871524, 0.26241134751773, 0.26405425581719,
0.263685152057245, 0.262062787572784), H = c(0.242711147002311,
0.243850477245014, 0.245132979060713, 0.241794831140003, 0.235370262206577,
0.241392449436832, 0.236787894677703, 0.240434935369935, 0.234076675284456,
0.236978505926275, 0.23489414817613, 0.236461115627298, 0.241377100655228,
0.240778565421122, 0.238954656595734, 0.237237027626932, 0.23562891291975,
0.228247507171151, 0.235543469567304, 0.238348073568565, 0.237639956832591,
0.237993655975811, 0.23053394888479, 0.237553985998722, 0.238716430501961,
0.241044553515742, 0.23579805839771, 0.244646715997643, 0.245211405561299,
0.248463204730402, 0.237910443860818, 0.23772859908127, 0.242517289073306,
0.230376515634971, 0.239386381312522, 0.242971498213445, 0.248246377553633,
0.245227816034538, 0.237968589560153, 0.235998092571798, 0.235639593181493,
0.240320284697509, 0.239383587641388, 0.237939850635807, 0.240409493084614,
0.239705089012767, 0.235291279312896, 0.237725562711216, 0.251017166425148,
0.244410329082034, 0.247581475626206, 0.244082639531298, 0.248022977743474,
0.246127343801762, 0.246345535241663), R = c(0.23238005068085,
0.233913128793082, 0.232906768805408, 0.234580624702711, 0.23729616240706,
0.232552468336102, 0.23566425708828, 0.233370934038501, 0.23413197660754,
0.241255572873247, 0.240609653949119, 0.233790113420818, 0.239086204963073,
0.233644719452121, 0.23849468613068, 0.236846146329206, 0.239755264655663,
0.225925420024587, 0.239355887920232, 0.237429996633718, 0.23819641170916,
0.232039177131833, 0.223832380603256, 0.235838907338977, 0.236669843303285,
0.234916072348618, 0.238304558463179, 0.235904655883701, 0.232124394623714,
0.222879222527955, 0.233232723139038, 0.233871666714818, 0.235947441217151,
0.242585880964708, 0.234693056561268, 0.233941777691605, 0.229366135886539,
0.23539800906269, 0.239803390172875, 0.236505714593364, 0.24647853698133,
0.235569395017794, 0.242526379716086, 0.236207360559779, 0.234180854122081,
0.240408036487878, 0.239601762794737, 0.245058343429191, 0.234449894103222,
0.237875925051173, 0.230698942666106, 0.233475177304965, 0.231384358432554,
0.233114688928642, 0.230655428424067), T = c(0.261758235638105,
0.261889077326307, 0.257587479549, 0.257914486549337, 0.272467520166701,
0.262760817545838, 0.265646653432713, 0.268875862196498, 0.267589277073454,
0.269672695639567, 0.269022944142428, 0.270680912011768, 0.260008650934782,
0.258245224077857, 0.262304209940204, 0.265561961665713, 0.270062606715993,
0.271752492828849, 0.262203737769602, 0.263717599534841, 0.265833670578713,
0.267302305737446, 0.289484838417743, 0.268097977766344, 0.268321642269056,
0.261680722401497, 0.271319279474757, 0.264062602318119, 0.258543287805409,
0.253424858029389, 0.263481112722616, 0.260797966082108, 0.258603042876902,
0.263404414155158, 0.263094376055998, 0.262028086308617, 0.259408120423941,
0.26014200592286, 0.269294864241588, 0.263532741620391, 0.259370672778494,
0.262153024911032, 0.264677749943065, 0.265104622216242, 0.262273612930016,
0.264569812492848, 0.266284942258822, 0.264458330676529, 0.253692453461153,
0.25909305621162, 0.257980767836164, 0.260030835646007, 0.256538408006782,
0.25707281521235, 0.260936248761486), U = c(0.276642254462421,
0.275750907536407, 0.274138521440258, 0.279385339041277, 0.283770344294126,
0.273124933319108, 0.276770665567999, 0.272796198013943, 0.273326789343435,
0.278824893979485, 0.282917535762971, 0.269035729493284, 0.276381346021371,
0.275681845488406, 0.280473043309851, 0.274957072857482, 0.279453614114969,
0.265400901516186, 0.284438401450319, 0.275270067631668, 0.277080803992985,
0.268341093323935, 0.26334299428362, 0.27494270078114, 0.277070411973316,
0.276364671746617, 0.277622940087166, 0.275489489882784, 0.275412200032649,
0.267636555236813, 0.275475938484053, 0.27914367434201, 0.281161825726141,
0.287341513046201, 0.274277898463271, 0.272041104617345, 0.268317034458041,
0.277054269097656, 0.276448903327891, 0.282483963758864, 0.288513266166897,
0.280409252669039, 0.283610415243301, 0.27874587902846, 0.274619094771137,
0.275604453090517, 0.286100299160421, 0.288513039597016, 0.270078586556683,
0.280480764184118, 0.274123602187187, 0.277940178846747, 0.273784368554907,
0.282369310276287, 0.277372857201026)), na.action = structure(c(`2` = 2L,
`4` = 4L, `19` = 18L, `24` = 20L, `28` = 24L, `29` = 25L, `30` = 26L,
`32` = 28L, `33` = 29L, `42` = 38L, `54` = 46L, `69` = 54L, `74` = 58L,
`77` = 59L, `79` = 60L, `80` = 61L, `83` = 62L), class = "omit"), row.names = c(5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 15L, 16L, 17L, 18L, 20L,
25L, 26L, 27L, 31L, 41L, 44L, 46L, 47L, 48L, 50L, 51L, 52L, 55L,
57L, 64L, 65L, 66L, 67L, 68L, 70L, 71L, 72L, 85L, 86L, 87L, 88L,
89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L, 100L,
101L, 102L, 103L), class = "data.frame")

How to apply a function to a certain column for all the data frames in environment in R

I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame
All the data frames in environment are the same, here is the sample of them:
BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"),
cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238,
0.00396317260301814, 0.0191772698764066, 0.700811773637797,
0.00482934642627173, 0.00201429499675114, 0.00021494623655914,
0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L,
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")
AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"),
cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794,
0.0099634872417983, 0.00119375573921028, 0.00535134321942833,
0.00318471337579618, 0.000983284169124877, 0.116180371352785
)), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L,
29649L, 37977L, 40411L), class = "data.frame")
AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"),
cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237,
0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523,
0.0250727462779332, 0.157723828668625)), row.names = c(955L,
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")
I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out
# Attempt: apply quantile() to every object stored in the global environment.
env = .GlobalEnv
# eapply() hands each whole object to quantile(); nothing here selects the
# cpm column, so data frames are passed in as-is.
eapply(env, quantile, probs = c(.15,.8))
This command results in an error:
Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
undefined columns selected
EDIT
To make it clear, here is what I did and what I need as a result:
I had Data like this
data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L,
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L,
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L,
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L,
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L,
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L,
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L,
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L,
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG",
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI",
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT",
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL",
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR",
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L,
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L,
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L,
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L,
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L,
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L,
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L,
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac",
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296,
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529,
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243,
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372,
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787,
0.00296443300606869, 0.22941417451864, 0.000426580184572523,
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267,
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902,
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529,
0.000198831231813773, 0.00551511140525039, 1.02902374670185,
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093,
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744,
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636,
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441,
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858,
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341,
0.0000840689365279529, 0.236539258503618, 0.0799443865137296,
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714,
0.150753768844221, 0.0859481582537517, 0.000587920688617856,
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598,
0.00113308871141809, 0.996427153632394, 0.00269808881394042,
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305,
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954,
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687,
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477,
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575,
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L,
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L,
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L,
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L,
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L,
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L,
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L,
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L,
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L,
586L, 596L), class = "data.frame")
Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:
# Split data into a list with one data frame per geo/os combination; elements
# are named "<geo>.<os>". Combinations that have no rows still get an element
# (a zero-row data frame).
X <- split(data, list(data$geo,data$os))
Then I pulled the data frames out of that list into the environment and deleted the data frames with zero rows
# Copy every element of X into the global environment as a variable named
# after the list element.
list2env(X, envir = .GlobalEnv)
## predicate: TRUE only for a data frame that has no rows
isEmpty <- function(x) {
if (!is.data.frame(x)) {
return(FALSE)
}
nrow(x) == 0L
}
## run isEmpty on every object in the global environment; unlist() turns the
## result into a logical vector named after the objects
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the objects flagged TRUE, i.e. the zero-row data frames
rm(list = names(empty)[empty])
The desired result is a Data frame, which has 4 columns:
geo, os, quantile_15,quantile_80
Where geo+os are unique and have a certain quantile_15,quantile_80
I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.
eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.
# Names of all objects in the current environment that are data frames.
# vapply() always returns a logical vector (even when there are no objects),
# so the subscript into ls() is type-safe.
df_names <- ls()[vapply(mget(ls()), is.data.frame, logical(1))]
# Named list holding those data frames.
df_list <- mget(df_names)
# Template: add a column to every data frame in df_list. The function must
# return d itself; an assignment on its own would return only the assigned
# value, so result_list would not contain data frames.
result_list <- lapply(df_list, function(d) {
d$new_col <- <code for new column>
d
})
I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?
Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:
# 15% and 80% quantiles of cpm for each data frame. vapply() enforces that
# every result is a length-2 numeric, giving a 2-row matrix with one column
# per data frame (row names "15%" and "80%" come from quantile()).
vapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)), numeric(2))
# AE.mac AF.android BD.ios
# 15% 0.0009111413 0.1545266 0.0002341395
# 80% 0.0071962008 0.3567230 0.0076989311
EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:
library(dplyr)
# One row per geo/os pair, with the 15% and 80% quantiles of cpm.
summarize(
group_by(data, geo, os),
quantile_15 = quantile(cpm, probs = .15),
quantile_80 = quantile(cpm, probs = 0.8)
)
# # A tibble: 81 x 4
# # Groups: geo [?]
# geo os quantile_15 quantile_80
# <fct> <fct> <dbl> <dbl>
# 1 AE android 0.118 0.118
# 2 AE blackberry 0.00833 0.00833
# 3 AR mac 0.0296 0.0296
# 4 AT android 0.665 0.665
# 5 AU android 0.482 0.482
# 6 AU ios 0.374 0.374
# 7 AU mac 0.00903 0.00903
# ...
Or with data.table:
library(data.table)
# Convert data to a data.table by reference, then compute both quantiles of
# cpm per geo/os group. The result columns are named quantile_15 and
# quantile_80 to match the requested output; names = FALSE drops the "15%" /
# "80%" labels that quantile() would otherwise attach to the values.
setDT(data)
data[, .(quantile_15 = quantile(cpm, 0.15, names = FALSE),
         quantile_80 = quantile(cpm, 0.8, names = FALSE)),
     by = .(geo, os)]
# geo os quantile_15 quantile_80
# 1: EC ios 2.595296e-01 2.595296e-01
# 2: AE blackberry 8.325000e-03 8.325000e-03
# 3: AT android 6.645070e-01 6.645070e-01
# 4: EG android 1.702811e-02 8.928342e-02
# 5: AE android 1.176471e-01 1.176471e-01
# 6: CA windows 6.301327e-01 6.301327e-01

Specific data in secondary y axis

This language is still a bit alien to me. I want to make a complicated graph with two axes and data plotted by groups.
The nature of my data STAT. I will write it as code, otherwise I cannot manage to publish the post:
4 time points ("0", "3", "5" and "7"), column Day.
Data divided in 5 groups, column SNu ("1", "2", "3", "4", "5") or SNa (the actual name of each group).
There are 4 values per group and time point, column Rep. Graph could plot the mean of these four values.
Data1 based on the area between the actual measures of one day and the following day, column SAr (some values are 0, between 0 and 205, some of them with decimals). I want to plot this in the primary y axis.
Data2, column DW (values between 0 and 1, all of them with 4 decimals). I want to plot this on the secondary y axis.
I show below some modified data as an example.
structure(list(Sname = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("H4.8", "S302", "S309",
"S313", "T.m"), class = "factor"), Snumber = c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), Day = c(0L, 3L,
5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L,
5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L,
5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L,
5L, 7L, 0L, 3L, 5L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L,
7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L, 0L, 3L, 5L, 7L), Replica = c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), Diff = c(0L,
0L, 160L, 200L, 0L, 10L, 140L, 160L, 0L, 0L, 50L, 170L, 0L, 10L,
70L, 150L, 0L, 10L, 210L, 140L, 0L, 0L, 0L, 120L, 0L, 30L, 70L,
160L, 0L, 20L, 110L, 140L, 0L, 30L, 190L, 150L, 0L, 10L, 80L,
130L, 0L, 10L, 90L, 140L, 0L, 0L, 170L, 170L, 0L, 80L, 200L,
410L, 0L, 10L, 150L, 0L, 90L, 200L, 390L, 0L, 50L, 220L, 600L,
0L, 0L, 0L, 100L, 0L, 0L, 0L, 70L, 0L, 20L, 10L, 150L, 0L, 20L,
40L, 140L), Sum = c(0L, 0L, 160L, 360L, 0L, 10L, 150L, 310L,
0L, 0L, 50L, 220L, 0L, 10L, 80L, 230L, 0L, 10L, 220L, 360L, 0L,
0L, 0L, 120L, 0L, 30L, 100L, 260L, 0L, 20L, 130L, 270L, 0L, 30L,
220L, 370L, 0L, 10L, 90L, 220L, 0L, 10L, 100L, 240L, 0L, 0L,
170L, 340L, 0L, 80L, 280L, 690L, 0L, 10L, 160L, 0L, 90L, 290L,
680L, 0L, 50L, 270L, 870L, 0L, 0L, 0L, 100L, 0L, 0L, 0L, 70L,
0L, 20L, 30L, 180L, 0L, 20L, 60L, 200L), Sumarea = structure(c(1L,
1L, 17L, 33L, 1L, 2L, 16L, 29L, 1L, 1L, 3L, 22L, 1L, 2L, 9L,
22L, 1L, 2L, 22L, 32L, 1L, 1L, 1L, 14L, 1L, 20L, 12L, 23L, 1L,
13L, 15L, 24L, 1L, 20L, 22L, 31L, 1L, 2L, 11L, 21L, 1L, 2L, 12L,
23L, 1L, 1L, 18L, 31L, 1L, 4L, 27L, 7L, 1L, 2L, 17L, 1L, 6L,
28L, 5L, 1L, 30L, 25L, 10L, 1L, 1L, 1L, 12L, 1L, 1L, 1L, 8L,
1L, 13L, 26L, 17L, 1L, 13L, 6L, 19L), .Label = c("0", "1,6",
"12,5", "13,3", "147,5", "15", "152,5", "17,5", "20", "205",
"22,5", "25", "3,3", "30", "32,5", "37,5", "40", "42,5", "45",
"5", "52,5", "55", "57,5", "62,5", "67,5", "7,5", "70", "72,5",
"75", "8,3", "85", "87,5", "90"), class = "factor"), Sumarea10 = c(0L,
0L, 400L, 900L, 0L, 16L, 375L, 750L, 0L, 0L, 125L, 550L, 0L,
16L, 200L, 550L, 0L, 16L, 550L, 875L, 0L, 0L, 0L, 300L, 0L, 50L,
250L, 575L, 0L, 33L, 325L, 625L, 0L, 50L, 550L, 850L, 0L, 16L,
225L, 525L, 0L, 16L, 250L, 575L, 0L, 0L, 425L, 850L, 0L, 133L,
700L, 1525L, 0L, 16L, 400L, 0L, 150L, 725L, 1475L, 0L, 83L, 675L,
2050L, 0L, 0L, 0L, 250L, 0L, 0L, 0L, 175L, 0L, 33L, 75L, 400L,
0L, 33L, 150L, 450L), Dweight = structure(c(1L, 6L, 34L, 38L,
1L, 7L, 32L, 45L, 1L, 8L, 31L, 48L, 1L, 9L, 30L, 44L, 1L, 11L,
37L, 50L, 1L, 11L, 33L, 49L, 1L, 13L, 35L, 51L, 1L, 18L, 36L,
52L, 1L, 21L, 47L, 53L, 1L, 19L, 43L, 54L, 1L, 20L, 46L, 56L,
1L, 22L, 42L, 55L, 1L, 17L, 28L, 39L, 1L, 15L, 27L, 1L, 13L,
26L, 41L, 1L, 17L, 29L, 40L, 1L, 5L, 10L, 24L, 1L, 3L, 14L, 24L,
1L, 4L, 16L, 23L, 1L, 2L, 12L, 25L), .Label = c("0", "0,0003",
"0,0006", "0,0007", "0,0008", "0,0011", "0,0017", "0,0026", "0,0033",
"0,004", "0,0045", "0,0048", "0,005", "0,0051", "0,0053", "0,0055",
"0,0056", "0,006", "0,007", "0,0074", "0,0082", "0,0086", "0,0142",
"0,0204", "0,0222", "0,0333", "0,0342", "0,0345", "0,038", "0,0423",
"0,0426", "0,0637", "0,0668", "0,0679", "0,0736", "0,0808", "0,0922",
"0,0952", "0,0986", "0,0989", "0,0996", "0,1078", "0,1215", "0,1242",
"0,1349", "0,1483", "0,1512", "0,1576", "0,1682", "0,1731", "0,1949",
"0,2099", "0,262", "0,2676", "0,2742", "0,2808"), class = "factor"),
Wweight = structure(c(1L, 3L, 40L, 42L, 1L, 4L, 37L, 44L,
1L, 8L, 26L, 48L, 1L, 9L, 24L, 43L, 1L, 10L, 41L, 49L, 1L,
11L, 39L, 46L, 1L, 12L, 35L, 50L, 1L, 14L, 38L, 53L, 1L,
22L, 52L, 57L, 1L, 20L, 47L, 58L, 1L, 17L, 51L, 60L, 1L,
21L, 45L, 59L, 1L, 15L, 34L, 54L, 1L, 19L, 32L, 1L, 16L,
31L, 56L, 1L, 18L, 36L, 55L, 1L, 7L, 13L, 27L, 1L, 6L, 29L,
25L, 1L, 5L, 30L, 23L, 1L, 2L, 33L, 28L), .Label = c("0",
"0,0089", "0,0105", "0,0136", "0,0144", "0,0147", "0,0152",
"0,0201", "0,0265", "0,0339", "0,0345", "0,0371", "0,045",
"0,0463", "0,0569", "0,0583", "0,0587", "0,0596", "0,0602",
"0,0649", "0,069", "0,0834", "0,1264", "0,1829", "0,1897",
"0,1909", "0,1974", "0,2309", "0,3", "0,344", "0,3491", "0,3547",
"0,364", "0,3729", "0,3756", "0,3932", "0,4357", "0,4361",
"0,451", "0,4634", "0,479", "0,5109", "0,6594", "0,7182",
"0,7423", "0,7865", "0,7938", "0,8406", "0,8407", "0,9152",
"0,9347", "0,9675", "1", "1,0908", "1,1366", "1,1465", "1,6905",
"1,7799", "1,8875", "1,9493"), class = "factor")), class = "data.frame", row.names = c(NA, -79L))
#Pretreat dataframe by creating factors for every column.
STAT<- read.table("Biomass.txt", header=TRUE, fill=TRUE)
SNa <- as.factor(STAT$Sname)
SNu <- as.factor(STAT$Snumber)
Day <- as.numeric(STAT$Day)
Rep <- as.numeric(STAT$Replica)
Dif <- as.numeric(STAT$Diff)
Sum <- as.numeric(STAT$Sum)
SAr10 <- as.numeric(STAT$Sumarea10)
SAr <- c(SAr10/10)
DW <- as.numeric(STAT$Dweight)
WW <- as.numeric(STAT$Wweight)
#I first tried to plot Dataone (`SAr`) as follows:
points1 <- geom_point(aes(colour = SNa), size =.8)
lines1 <- geom_smooth(method = loess, aes(colour = SNa), size =.5, se=TRUE, alpha=.2)
text1 <- labs(title=expression (Biomass~and~CO[2]~production~summed~ area), x=expression(Time~" "~(days)), y=expression(CO[2]~production~sum~" "~(ppm)))
g <- ggplot(data=STAT, aes(x=Day, y=SAr, group=SNa, fill=SNa, colour=SNa), par(mar=Marg))
g <- g + points1 + lines1 + text1
This is the result:
So far so good, but this is where the problems start.
1. SHADE
I would like to shade the area below the graphs. I have tried:
area1 <- geom_ribbon(data = STAT[STAT$Snumber == '1',],
aes(ymin = 0, ymax = predict(loess(Day ~ Sumarea))),
alpha = 0.3, fill = "#114477")
g <- g + points1 + lines1 + text1 + area1
plot(g) returns:
Error in loess(Day ~ Sumarea) : predictors must all be numeric
I have tried using the numeric vectors I created at the beginning instead, but then Day and SAr do not have the same length:
Error in model.frame.default(formula = Day ~ SAr) :
variable lengths differ (found for 'SAr').
I have also tried to make this with a density function and a geom_area but none of them resulted in what I wanted.
2. PLOT DATA2
I want Datatwo (DW) attached to the secondary y axis.
#Secondary y axis
y2 <- scale_y_continuous(sec.axis = sec_axis(~./150, name = "Dry
weight"))
#Grouped bars per time point
bars2 <- geom_bar(aes(factor(Day), DW), stat="identity", position = "dodge")
g <- g + points1 + lines1 + text1 + y2 + bars2
plot(g) returns:
Error: Discrete value supplied to continuous scale
I know that there cannot be a continuous scale on a variable of factor type (Plotting with ggplot2: "Error: Discrete value supplied to continuous scale" on categorical y-axis), but the solution given there does not work for me either.
ggplot(STAT[STAT$SNu == 1,], aes(x = STAT$Day, y = STAT$DW)) +
scale_x_continuous(limits=c(0,7)) +
scale_y_continuous(limits=c(0,1))
Returning
Error: Aesthetics must be either length 1 or the same as the data
(79): x, y
If anyone can help me with these two issues, it would be greatly appreciated. As I am new to this language, I also encourage you to ask me about any specific details that might be relevant and that I did not include in the post. Any improvement to my code, even if not related to my questions, would also be very welcome.

How to plot exponential decay in geom_smooth in ggplot2 in R?

Data
> dput(new.gapdata.cc)
structure(list(gap.interval = structure(c(1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 34L
), .Label = c("[0.0568,10.1]", "(10.1,20.1]", "(20.1,30.1]",
"(30.1,40.1]", "(40.1,50.1]", "(50.1,60.1]", "(60.1,70.1]", "(70.1,80.1]",
"(80.1,90.1]", "(90.1,100]", "(100,110]", "(110,120]", "(120,130]",
"(130,140]", "(140,150]", "(150,160]", "(160,170]", "(170,180]",
"(180,190]", "(190,200]", "(200,210]", "(210,220]", "(220,230]",
"(230,240]", "(240,250]", "(250,260]", "(260,270]", "(270,280]",
"(280,290]", "(290,300]", "(300,310]", "(310,320]", "(320,330]",
"(330,340]", "(340,350]", "(350,360]", "(360,370]", "(370,380]",
"(380,390]", "(390,400]", "(400,410]", "(410,420]", "(420,430]",
"(430,440]", "(440,450]", "(450,460]", "(460,470]", "(470,480]",
"(480,490]", "(490,500]", "(500,510]", "(510,520]", "(520,530]",
"(530,540]", "(540,550]", "(550,560]", "(560,570]", "(570,580]",
"(580,590]", "(590,600]", "(600,610]", "(610,620]", "(620,630]",
"(630,640]", "(640,650]", "(650,660]", "(660,670]", "(670,680]",
"(680,690]", "(690,700]", "(700,710]", "(710,720]", "(720,730]",
"(730,740]", "(740,750]"), class = "factor"), Vehicle.class = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Car following",
"Heavy-Vehicle following"), class = "factor"), PrecVehClass = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Car",
"Heavy-Vehicle"), class = "factor"), sd.speed = c(8.10631184218832,
11.4437550056097, 11.8038327709683, 10.8543703246156, 9.99720748006444,
9.44865875583687, 8.96606665646703, 8.49351869704553, 7.93669264490773,
8.13551032227591, 7.84202528436342, 8.0475744381228, 7.91648183675322,
7.43125313026708, 7.35410275703108, 7.60500908370333, 7.0498555301719,
7.55232413932399, 8.06598948864824, 6.76873032867712, 9.5638441069889,
8.04863015016668, 6.3210319215341, 4.64833690603376, 6.62719482422681,
6.64056528224281, 4.73744287133819, 7.47515815690314, 7.69289983159388,
0.306328206216196, 0.686563613792699), m.speed = c(7.49142882761648,
14.9015932672865, 23.2183766318976, 29.4281833927603, 33.2698195905316,
35.8151829762138, 37.5490804914733, 38.5477371278585, 39.3540677299243,
40.6919294171912, 41.1003756008852, 41.8182626555034, 43.0467747414578,
42.8363357874289, 43.4938190765401, 43.3542212600658, 45.4415004558705,
46.0292158248193, 45.2411112123218, 45.3142872888847, 45.8483490730252,
44.9081708678314, 48.91998889291, 47.3070826500395, 47.6670737425671,
46.3952054632908, 43.9972157634013, 51.2984320152685, 60.9675201903266,
44.7204961417801, 49.3765339447783), m.gapdist = c(7.7653843749647,
16.1638754974281, 25.4776617248361, 35.2445820779774, 44.9431006950918,
54.8030747287456, 64.7488740187079, 74.7493853439047, 84.7618392182203,
94.6265821702835, 104.858371321352, 114.633780836178, 124.562176064196,
134.473095135859, 144.806940411055, 154.554692908294, 164.982952591097,
174.906212522406, 185.553895860064, 194.461299821333, 204.825162321106,
215.128853160835, 225.333436194581, 235.137188240688, 244.880475531984,
255.160919142993, 264.314402521448, 274.575498681999, 285.224335149303,
293.119840359603, 337.618758706201)), .Names = c("gap.interval",
"Vehicle.class", "PrecVehClass", "sd.speed", "m.speed", "m.gapdist"
), row.names = c(3L, 8L, 13L, 18L, 24L, 31L, 37L, 43L, 49L, 55L,
61L, 66L, 71L, 76L, 81L, 85L, 88L, 91L, 94L, 96L, 98L, 100L,
102L, 105L, 107L, 109L, 112L, 114L, 116L, 118L, 121L), class = "data.frame")
What I want to achieve
I have 'sd.speed/m.speed' as the dependent variable and 'm.gapdist' as the explanatory variable. When I make a scatter plot, the trend appears to be an exponential decay. So I want to get the summary statistics of the fit as well as a plot of the fitted curve over the data points. I used the following code:
ggplot() +
geom_point(data=new.gapdata.cc,
aes(y=sd.speed/m.speed, x=m.gapdist, shape=interaction(Vehicle.class,PrecVehClass)),
size=3) +
geom_smooth(data=new.gapdata.cc,
mapping = aes(y= sd.speed/m.speed, x=m.gapdist,
linetype=interaction(Vehicle.class,PrecVehClass)), method="lm", formula = log(y) ~ x,
se=F, size=1, color="black")
Question
This does not plot the exponential decay curve over the points. How can I fit the curve to the points?

Resources