Execute code on different subsets - r

I have a data.frame with couple of thousands rows. I am applying several lines of code to subsets of this data.
I have 4 subsets in a column "mergeorder$phylum":
[1] "ascomycota" "basidiomycota" "unidentified"
[4] "chytridiomycota"
And on every subset i have to apply this set of functions separately:
ascomycota<-mergeorder[mergeorder$phylum %in% c("ascomycota"), ]
group_ascomycota <- aggregate(ascomycota[,2:62], by=list(ascomycota$order), FUN=sum)
row.names(group_ascomycota)<-group_ascomycota[,1]
group_ascomycota$sum <-apply(group_ascomycota[,-1],1,sum)
dat5 <-sweep(group_ascomycota[,2:62], 2, colSums(group_ascomycota[2:62]), '/')
dat5$sum <-apply(group_ascomycota[,-1],1,sum)
reorder_dat5 <- dat5[order(dat5$sum, decreasing=T),]
reorder_dat5$OTU_ID <- row.names(reorder_dat5)
FINITO<-reorder_dat5[1:15,]
write.table(FINITO, file="output_ITS1/ITS1_ascomycota_order_top15.csv", col.names=TRUE,row.names=FALSE, sep=",", quote=FALSE)
This code works. However, I would like to apply this code without manually replacing every "ascomycota" with "basidiomycota", "unidentified", "chytridiomycota".
What function should I use? How should I use it? I've been struggling with sapply(), repeat() but haven't come far.
The end result should execute the whole code and export csv separate files.
Many thanks for your answer

It's usually possible to write code that handles all subsets in one go. However, what you are doing is pretty complicated. The best thing to do might be to gather all that into a function and then just run the function for each subset. Something like this:
subset_transform <- function(subset){
t <-mergeorder[mergeorder$phylum %in% c(subset), ]
group_t <- aggregate(t[,2:62], by=list(t$order), FUN=sum)
row.names(group_t)<-group_t[,1]
group_t$sum <-apply(group_t[,-1],1,sum)
dat5 <-sweep(group_t[,2:62], 2, colSums(group_t[2:62]), '/')
dat5$sum <-apply(group_t[,-1],1,sum)
reorder_dat5 <- dat5[order(dat5$sum, decreasing=T),]
reorder_dat5$OTU_ID <- row.names(reorder_dat5)
FINITO<-reorder_dat5[1:15,]
write.table(FINITO, file = paste("output_ITS1/ITS1_", subset, "_order_top15.csv"), col.names=TRUE,row.names=FALSE, sep=",", quote=FALSE)
}
subset_transform("ascomycota")
subset_transform("basidiomycota")
subset_transform("unidentified")
subset_transform("chytridiomycota")

Related

Calling variable in the loop by using assign and paste0

I have a variable named SAL_mean created like this (I want to make a loop once I figure this out):
watersheds <- c('ANE', 'SAL', 'CER')
assign(paste0(watersheds[1], '_mean'), read.csv(paste0(watersheds[1], '_mean.csv')))
now the next step should be something like this (which works):
cols_dont_want <- c('B1', 'B2', 'B3')
assign(paste0(watersheds[1], '_mean'), SAL_mean[, !names(SAL_mean) %in% cols_dont_want])
but I wanted to ask how to replace "SAL_mean" by using watersheds[1], because this line of code doesn't work:
assign(paste0(watersheds[1], '_mean'), paste0(watersheds[1], '_mean')[, !names(paste0(watersheds[1], '_mean')) %in% cols_dont_want])
I think it treats the "paste0(watersheds[2], '_mean')" as string and not as a name of variable but I haven't been able to find a solution (I tried for example "as.name" function but it gave me an error "object of type 'symbol' is not subsettable")
Keep dataframes in a list using ?lapply, then it gets easier to carry out same transformations on multiple dataframes in a list, something like:
# set vars
watersheds <- c('ANE', 'SAL', 'CER')
cols_dont_want <- c('B1', 'B2', 'B3')
# result, all dataframes in one list
myList <- lapply(watersheds, function(i){
# read the file
x <- read.csv(paste0(i, "_mean.csv"))
# exclude columns and return
x[, !colnames(x) %in% cols_dont_want]
} )
replace
paste0(watersheds[2], '_mean')
with
eval(parse(text = paste0(watersheds[2], '_mean')))
and it should work. Your guess is correct, paste0 just gives you a string but you need to call the variable which is done using eval()
Or you can do it in a for loop (some find the syntax more understandable). It's equivalent to zx8754's solution, except it assigns names to each dataframe as per the OP. It's trivial to modify zx8754's solution do do the same.
watersheds <- c('ANE', 'SAL', 'CER')
cols_dont_want <- c('B1', 'B2', 'B3')
ws.list <- list()
for (i in 1:length(watersheds)) {
ws.list[[i]] <- read.csv(paste0(watersheds[i], '_mean.csv'))
names(ws.list)[i] <- paste0(watersheds[i], '_mean')
ws.list[[i]] <- ws.list[[i]][!names(ws.list[[i]]) %in% cols_dont_want]
}
names(ws.list)
# "ANE_mean" "SAL_mean" "CER_mean"
# If you absolutely want to call the data.frames by their
# individual names, you can do so after you attach() the list.
attach(ws.list)
ANE_mean

How would you write this using apply family of functions in R? Should you?

Here is my R Script that works just fine:
perc.rank <- function(x) trunc(rank(x)) / length(x) * 100.0
library(dplyr)
setwd("~/R/xyz")
datFm <- read.csv("yellow_point_02.csv")
datFm <- filter(datFm, HRA_ClassHRA_Final != -9999)
quant_cols <- c("CL_GammaRay_Despiked_Spline_MLR", "CT_Density_Despiked_Spline_FinalMerged",
"HRA_PC_1HRA_Final", "HRA_PC_2HRA_Final","HRA_PC_3HRA_Final",
"SRES_IMGCAL_SHIFT2VL_Slab_SHIFT2CL_DT", "Ultrasonic_DT_Despiked_Spline_MLR")
# add an extra column to datFm to store the quantile value
for (column_name in quant_cols) {
datFm[paste(column_name, "quantile", sep = "_")] <- NA
}
# initialize an empty dataframe with the new column names appended
newDatFm <- datFm[0,]
# get the unique values for the hra classes
hraClassNumV <- sort(unique(datFm$HRA_ClassHRA_Final))
# loop through the vector and create currDatFm and append it to newDatFm
for (i in hraClassNumV) {
currDatFm <- filter(datFm, HRA_ClassHRA_Final == i)
for (column_name in quant_cols) {
currDatFm <- within(currDatFm,
{
CL_GammaRay_Despiked_Spline_MLR_quantile <- perc.rank(currDatFm$CL_GammaRay_Despiked_Spline_MLR)
CT_Density_Despiked_Spline_FinalMerged_quantile <- perc.rank(currDatFm$CT_Density_Despiked_Spline_FinalMerged)
HRA_PC_1HRA_Final_quantile <- perc.rank(currDatFm$HRA_PC_1HRA_Final)
HRA_PC_2HRA_Final_quantile <- perc.rank(currDatFm$HRA_PC_2HRA_Final)
HRA_PC_3HRA_Final_quantile <- perc.rank(currDatFm$HRA_PC_3HRA_Final)
SRES_IMGCAL_SHIFT2VL_Slab_SHIFT2CL_DT_quantile <- perc.rank(currDatFm$SRES_IMGCAL_SHIFT2VL_Slab_SHIFT2CL_DT)
Ultrasonic_DT_Despiked_Spline_MLR_quantile <- perc.rank(currDatFm$Ultrasonic_DT_Despiked_Spline_MLR)
}
)
}
newDatFm <- rbind(newDatFm, currDatFm)
}
newDatFm <- newDatFm[order(newDatFm$Core_Depth),]
# head(newDatFm, 10)
write.csv(newDatFm, file = "Ricardo_quantiles.csv")
I have a few questions though. Every R book or video that I have read or watched, recommends using the 'apply' family of language constructs over the classic 'for' loop stating that apply is much faster.
So the first question is: how would you write it using apply (or tapply or some other apply)?
Second, is this really true though that apply is much faster than for? The csv file 'yellow_point_02.csv' has approx. 2500 rows. This script runs almost instantly on my Macbook Pro which has 16 Gig of memory.
Third, See the 'quant_cols' vector? I created it so that I could write a generic loop (for columm_name in quant_cols) ....But I could not make it to work. So I hard-coded the column names post-fixed with '_quantile' and called the 'perc.rank' many times. Is there a way this could be made dynamic? I tried the 'paste' stuff that I have in my script, but that did not work.
On the positive side though, R seems awesome in its ability to cut through the 'Data Wrangling' tasks with very few statements.
Thanks for your time.

Delete data.frame columns and loop through data.frame assignment function

I found the following piece of code here at stackoverflow:
library(svDialogs)
columnFunction <- function (x) {
column.D <- dlgList(names(x), multiple = T, title = "Spalten auswaehlen")$res
if (!length((column.D))) {
cat("No column selected\n")
} else {
cat("The following columns are choosen:\n")
print(column.D)
x <- x[,!names(x) %in% column.D]
}
return(x)
}
df <- columnFunction(df)
So i wanted to use it for my own proposes, but it did not work out as planned.
What i try to archive is to use it in a for loop or with lapply to use it with multiple data.frames. Amongst others I tried:
d.frame1 <- iris
d.frame2 <- cars
l.frames <- c("d.frame1","d.frame2")
for (b in l.frames){
columnFunction(b)
}
but it yields the following error message:
Error in dlgList(names(x), multiple = T, title = "Spalten auswaehlen")$res :
$ operator is invalid for atomic vectors
Well, what i need additionally is that I can loop though that function so that i can iterate through different data.frames.
Last but not least I would need something like:
for (xyz in l.frames){
xyz <- columnFunction(xyz)
}
to automate the saving step.
Does anyone have any idea how i could loop though that function or how i could change the function so that it performs all those steps and is loopable.
I`m quite new to R so perhaps Im missing something obvious.
lapply was designed for this task:
l.frames <- list(d.frame1, d.frame2)
l.frames <- lapply(l.frames, columnFunction)
If you insist on using a for loop:
for (i in seq_along(l.frames)) l.frames[[i]] <- columnFunction(l.frames[[i]])

Add a column to every object currently loaded in the R environment

I would like to add a column to every data frame in my R environment which all have the same format.
I can create the column I want with a simple assignment like this:
x[,8] <- x[,4]/(x[,4]+x[,5])
When I try to put this in a for loop that will iterate over every object in the environment, I get an error.
control_data <- ls()
for (i in control_data) {(i[,8] <- i[,4]/(i[,4]+i[,5]))}
Error: unexpected '[' in "for (i in control_data) {["
Here is what the input files look like:
ENSMUSG00000030088 Aldh1l1 chr6:90436420-90550197 1.5082200 3.130860 0.671814 0.0000000
ENSMUSG00000020932 Gfap chr11:102748649-102762226 7.0861500 44.182700 20.901700 0.2320750
ENSMUSG00000024411 Aqp4 chr18:15547902-15562193 3.4920400 3.474880 2.463230 0.0331238
ENSMUSG00000023913 Pla2g7 chr17:43705046-43749150 1.5105400 24.275600 11.422400 1.5111100
ENSMUSG00000035805 Mlc1 chr15:88786313-88809437 1.9010200 7.147400 5.313190 0.6358940
ENSMUSG00000007682 Dio2 chr12:91962993-91976878 1.7322900 12.094200 6.738320 1.0736900
ENSMUSG00000017390 Aldoc chr11:78136469-78141283 55.4562000 199.958000 91.328300 22.9541000
ENSMUSG00000005089 Slc1a2 chr2:102498815-102630941 63.7394000 130.729000 103.710000 10.0406000
ENSMUSG00000070880 Gad1 chr2:70391128-70440071 2.6501400 14.907500 13.730200 1.3992200
ENSMUSG00000026787 Gad2 chr2:22477724-22549394 3.9908200 11.308600 28.221500 1.4530500
Thank you for any help you could provide. Is there a better way to do this using an apply function?
As mentioned in the comment, your error happens because the results of calling ls are not the objects themselves but rather their names as strings.
To use the for-loop, you'll be headed down the eval(parse(...)) path. You can also do this with apply and a function.
myfun <- function(x) {
df <- get(x)
df[,8] <- df[,4] / (df[,4] + df[,5])
return(df)
}
control_data <- ls()
lapply(control_data, myfun)
As per the comment:
for(i in control_data) {
df <- get(i)
df[,8] <- df[,4] / (df[,4] + df[,5])
assign(i, df)
}

Assign names from one list to another

I got a bunch dynamically created regressions stored in some list called regressions. Now I´d like to rename their coefficients efficiently. What I have so far is this loop that works:
for (i in 1:length(params[,1])){
names(regressions[[i]]$coefficients)[pos] <- paste(params[i,1],".lag",params[i,2],sep="")
}
I've been trying for quite a while to get this done a little more generally with the help of a function, cause this not the only list of regressions I have. However I could not get anything else to work. Here a few other tries basically based on lapply:
correctNames <- function(reglist,namevec,pos){
names(reglist[[i]]$coefficients)[pos] <- as.character(namevec)
}
lapply(regressions,correctNames(reglist,namevec,pos),
reglist=regressions,namevec=params[,1],pos=2)
Another try was to write a function with a for loop which also works internally as print shows but does not assign the names globally (where the regressions list is stored).
correctNames <- function(reglist,pos,namevec){
for (i in 1:length(params[,1])){
names(reglist[[i]]$coefficients)[pos] <- paste(namevec,".lag",namevec,sep="")
}
#this test proves it's work inside the function...
print(reglist[[10]]
}
Ah, gimme a break.
There's no "i" inside that first version of "correctNames" function; and you probably don't realize that you are not assigning it to "regressions", only to a copy of the regression object. Try instead:
correctNames <- function(reglist,namevec,pos){
names(reglist$coefficients)[pos] <- as.character(namevec)
return(reglist) }
newregs <- mapply(correctNames,
reglist=regressions,
namevec=as.character(params[,1]),
MoreArgs= list( pos=2))
After seeing the note from Ramnath and noticing that the code did work but was giving flaky names for the "params" I looked at params and saw that it was a factor, and so changed the argument in the mapply call to as.character(params[,1]).
> newregs[1,1]
[[1]]
(Intercept) log(M1)
-5.753758 2.178137
If this is a follow up to your earlier question, then here is what I would do
coefs = plyr::ldply(regressions, coef)
coefs = transform(coefs, reg_name = paste(x, '.lag', l, sep = ""))[,-c(1, 2)]
names(coefs) = c('intercept', 'reg_coef', 'reg_name')
This gives you
intercept reg_coef reg_name
1 -5.753758 2.178137 log(M1).lag0
2 7.356434 7.532603 rs.lag0
3 7.198149 8.993312 rl.lag0
4 -5.840754 2.193382 log(M1).lag1
5 7.366914 7.419599 rs.lag1
6 7.211223 8.879969 rl.lag1
7 -5.988306 2.220994 log(M1).lag4
8 7.395494 7.127231 rs.lag4
9 7.246161 8.582998 rl.lag4

Resources