Variable length differ in R - r

I'm performing ANOVA tests on a dataset that has many columns, and I'm trying to loop over those columns to make things easier, but I always run into the same error: "variable lengths differ".
here is my code for the loop:
# Question code, kept exactly as posted: it stops with "variable lengths differ".
for(i in 5:125){
# WL holds the NAME of column i as a length-1 character string, not the column's values.
WL<- colnames(NB[i])
# Inside the formula, WL is that length-1 string (presumably NB has no column
# literally called "WL"), while Treatment comes from NB with one value per row,
# so the two variables have different lengths.
model <- lm(WL ~ Treatment , data = NB)
if(!exists("aovNB")){
aovNB<-anova(model)
}
# NOTE(review): once aovNB has been created by the branch above, this test is
# also TRUE in the same iteration, so the first table would be bound to itself.
if(exists("aovNB")){
aovNB <- rbind(aovNB,anova(model))
}
}
I'm wondering whether it is possible to store each column name in the WL variable this way and then use it to read the many columns I have.
Thanks to anyone who can solve it. I'm using base R.

Use reformulate/as.formula to create formula from strings. Also instead of rbinding the datasets in a loop store them in a list.
# Names of the response columns to analyse (columns 5 to 125 of NB).
cols <- colnames(NB)[5:125]
# One ANOVA table per response column, in the same order as `cols`.
result <- lapply(cols, function(response) {
  # reformulate() builds the formula `<response> ~ Treatment` from strings.
  fit <- lm(reformulate("Treatment", response = response), data = NB)
  anova(fit)
})
If needed you can combine them using result <- do.call(rbind, result)

We may do this with paste
cols <- colnames(NB)[5:125]
result <- vector('list', length(cols))
for(i in seq_along(cols)) {
result[[i]] <- anova(lm(as.formula(paste(cols[i], '~ Treatment')), data = NB))
}

Related

Apply string values from a list to a default drc function

I'm running several models and functions with the models. I need to apply values from a list to a default R function, e.g., modelFit()
My idea is to run the same function to several models and compile the results after.
I'm trying to use loops and the Family of 'apply' functions in R, but with no success.
#package drc is necessary
library(drc)
#my data
rates <- c(.1,.1,.1,1,1,1,10,10,10,100,100,100,1000,1000,1000,.1,.1,.1,1,1,1,10,10,10,100,100,100,1000,1000,1000)
prod <- c("A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B")
resp <- c(.295,.32,.301,.155,.1501,.148,.05,.03,.044,.002,.001,.0015,.001,.0005,.0003,.312,.337,.299,.265,.2501,.248,.102,.103,.114,.02,.01,.015,.003,.0002,.0007)
data.test <- data.frame(rates,prod,resp) #my data frame
#my models
m1 <- drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod=="A",])
m2 <- drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod=="B",])
#lack of fit test
modelFit(m1)
modelFit(m2)
#I can get the modelFit p-values this way:
modelFit(m1)$"p value"[2]
modelFit(m2)$"p value"[2]
#I have several models. I want to create a loop to give me only the p value for each model fit. I want to use that for other information given by the summaries and function
#list of models
modelsList <- c("m1","m2")
#I can print the strings with the loop
for(i in modelsList){
print(i)
}
My idea was to use the strings to add information for the default 'drc' function modelFit(). The result would be a list with all p values from all the results in the loop, but I'm getting the error:
Error: $ operator is invalid for atomic vectors
#Not working
for(i in modelsList){
modelFit(i)$"p value"[2]
}
#Error
# Error: $ operator is invalid for atomic vectors
#Trying to use lapply
#Following this logic
lapply(modelsList, function(x) print(x))
#I could not get the results
lapply(modelsList, function(x) modelFit(x)$"p value"[2])
With the results, I'd continue and I'd create a data.frame with all the models and respective p values.
good to see you, Danilo
I think you missed drm function.
modelsList <- c("m1","m2")
prod = c("A","B")
type = data.frame(modelsList,
prod)
#I can print the strings with the loop
for(i in modelsList){
print(i)
}
for(i in modelsList){
model_Prod = type[modelsList == i,]$prod
drm_result<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
print(modelFit(drm_result)$"p value"[2])
}
# lapply
lapply(modelsList, function(x) {
model_Prod = type[modelsList == x,]$prod
x<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
modelFit(x)$"p value"[2]
}
)
Continuing: after Steve Lee's input, I was able to generate a data.frame with all the results I got from the loop:
#source of information:
#https://stackoverflow.com/questions/25285570/invalid-factor-level-with-rbind-to-data-frame
#df.results will be a data.frame with headers "Model" and "ModelFit_pvalue"
df.results <- NULL
#Loop to print only specif information requested from function and add it to a data frame
for(i in modelsList){
model_Prod = type[modelsList == i,]$prod
drm_result<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
#print(i)
#print(modelFit(drm_result)$"p value"[2])
modelpvalue <- modelFit(drm_result)$"p value"[2] #store p values in a variable
de <- data.frame("Model"=i,"ModelFit_pvalue"= modelpvalue)
df.results = rbind(df.results,de)
}
df.results
#df.results output:
Model ModelFit_pvalue
1 m1 0.2346460
2 m2 0.5757368

Run Function in Loop in R

I need to use the below function in loop as i have 100s of variables.
# Bin one numeric column of a data frame into quantile-based intervals.
#
# Arguments:
#   df      data frame (or tibble) holding the column.
#   vars    column name(s). Only the FIRST element is binned; to bin several
#           columns call the function once per column, e.g. via lapply().
#   by      step between the quantile probabilities, i.e. seq(0, 1, by = by).
#   eout    if TRUE, the quantiles are computed on the non-outlying values
#           only and the overall minimum/maximum are added as extra break
#           points so that the outliers still fall into a bin.
#   verbose currently unused; kept so existing calls keep working.
#
# Returns a factor (from cut()) with one element per row of `df`; missing
# values stay NA. Returns NULL invisibly when `vars` is empty. Stops with an
# error when fewer than two distinct break points can be derived.
binning <- function (df, vars, by = 0.1, eout = TRUE, verbose = FALSE) {
  if (length(vars) == 0L) {
    return(invisible(NULL))
  }
  col <- vars[[1L]]
  vals <- df[[col]]
  # Break points are derived from the observed values only, so that a missing
  # value cannot make min()/max()/quantile() fail.
  obs <- vals[!is.na(vals)]
  probs <- seq(0, 1, by = by)

  if (eout) {
    # Drop the boxplot outliers, then keep what lies within the whiskers of
    # the remaining values.
    trimmed <- obs[!obs %in% boxplot.stats(obs)$out]
    whiskers <- boxplot.stats(trimmed)$stats
    kept <- obs[obs <= whiskers[5] & obs >= whiskers[1]]

    breaks <- quantile(kept, probs = probs)
    # Extend the range so that the excluded extremes are still covered.
    if (min(obs) != min(kept)) {
      breaks <- c(min(obs), breaks)
    }
    if (max(obs) != max(kept)) {
      breaks <- c(breaks, max(obs))
    }
  } else {
    breaks <- quantile(obs, probs = probs)
  }

  # Tied data produce repeated quantiles, which cut() rejects
  # ("'breaks' are not unique"), so keep each break point once.
  breaks <- unique(breaks)
  if (length(breaks) < 2L) {
    # A single number would be read by cut() as a number of intervals.
    stop("cannot bin '", col, "': fewer than two distinct break points",
         call. = FALSE)
  }

  cut(vals, breaks = breaks, include.lowest = TRUE)
}
It creates a variable holding the binned score. The naming convention for the new variable is the original name plus "_bin".
data$credit_amount_bin <- iv.binning.simple(data,"credit_amount",eout=FALSE)
I want the function to run for all the NUMERIC variables, store the binned variables in a different data frame, and name each of them "<original name>_bin".
Any help would be highly appreciated.
Using your function, you could go via lapply, looping over all values that are numeric.
# some data
dat0 <- data.frame(a=letters[1:10], x=rnorm(10), y=rnorm(10), z=rnorm(10))
# find all numeric by names
vars <- colnames(dat0)[which(sapply(dat0,is.numeric))]
# target data set
dat1 <- as.data.frame( lapply(vars, function(x) binning(dat0,x,eout=FALSE)) )
colnames(dat1) <- paste(vars, "_bin", sep="")
Personally, I would prefer having this function with vector input instead of data frame plus variable names. It might run more efficiently, too.

How do I convert this for loop into something cooler like by in R

uniq <- unique(file[,12])
pdf("SKAT.pdf")
for(i in 1:length(uniq)) {
dat <- subset(file, file[,12] == uniq[i])
names <- paste("Sample_filtered_on_", uniq[i], sep="")
qq.chisq(-2*log(as.numeric(dat[,10])), df = 2, main = names, pvals = T,
sub=subtitle)
}
dev.off()
file[,12] is an integer so I convert it to a factor when I'm trying to run it with by instead of a for loop as follows:
pdf("SKAT.pdf")
by(file, as.factor(file[,12]), function(x) { qq.chisq(-2*log(as.numeric(x[,10])), df = 2, main = paste("Sample_filtered_on_", file[1,12], sep=""), pvals = T, sub=subtitle) } )
dev.off()
It works fine to sort the data frame by this (now a factor) column. My problem is that for the plot title, I want to label it with the correct index from that column. This is easy to do in the for loop by uniq[i]. How do I do this in a by function?
Hope this makes sense.
A more vectorized (== cooler?) version would pull the common operations out of the loop and let R do the book-keeping about unique factor levels.
# Transform column 10 once, then split the transformed values by the grouping
# column 12; split() produces one list element per distinct group value.
dat <- split(-2 * log(as.numeric(file[,10])), file[,12])
# Prefix each group label so the element names can be used directly as the
# plot titles asked for in the question.
names(dat) <- paste0("Sample_filtered_on_", names(dat))
(paste0 is a convenience function for the common use case where normally one would use paste with the argument sep=""). The for loop is entirely appropriate when you're running it for its side effects (plotting pretty pictures) rather than trying to capture values for further computation; it's definitely un-cool to use T instead of TRUE, while seq_along(dat) means that your code won't produce unexpected results when length(dat) == 0.
# Draw one Q-Q plot per group into SKAT.pdf; the loop is run purely for its
# plotting side effect.
pdf("SKAT.pdf")
for (i in seq_along(dat)) {
  vals <- dat[[i]]        # transformed values of group i
  nm <- names(dat)[[i]]   # matching plot title
  qq.chisq(vals, main = nm, df = 2, pvals = TRUE, sub = subtitle)
}
dev.off()
If you did want to capture values, the basic observation is that your function takes 2 arguments that vary. So by or tapply or sapply or ... are not appropriate; each of these assume that just a single argument is varying. Instead, use mapply or the comparable Map
Map(qq.chisq, dat, main=names(dat),
MoreArgs=list(df=2, pvals=TRUE, sub=subtitle))

Store results of a for-loop in an object or matrix

i've following problem:
I use the for-loop within R to get specific data from a matrix.
my code is as follows.
for(i in 1:100){
T <- as.Date(as.mondate (STARTLISTING)+i)
DELIST <- (subset(datensatz_Start_End.frame, TIME <= T))[,1]
write.table(DELIST, file = paste("tab", i, ".csv"), sep="," )
print(DELIST)
}
Using print, R delivers the data.
Using write.table, R delivers the data into different files.
My aim is to aggregate the results from the for-loop within one matrix. (each row for 'i')
But unfortunately I can not make it.
sorry, i'm a real noob within R.
for(i in 1:100)
{
T <- as.Date(as.mondate (STARTLISTING)+i)
DELIST <- (subset(datensatz_Start_End.frame, TIME <= T))[,1]
assign(paste('b',i,sep=''),DELIST)
}
this delivers 100 objects, which contain my results.
But what i need is one matrix/dataframe with 100 columns or one list.
Any ideas?
Hey!
Since I'm not allowed to edit my own answers, here is my (simple) solution:
DELIST <- vector("list",100)
for(i in 1:100)
{
T <- as.Date(as.mondate (STARTLISTING)+i)
DELIST[[i]] <- as.character((subset(datensatz_Start_End.frame, TIME <= T))[,1])
}
DELIST[[99]] ## it is possible to requist the relevant companies for every 'i'
Thx to everyone!
George
If you want a list you can use lapply instead of loop
# Build a list with one element per month offset i = 1..100: the first column
# of the rows whose TIME is on or before the cut-off date.
LL <- lapply(1:100, function(i) {
  # Cut-off date: STARTLISTING shifted by i months.
  cutoff_date <- as.Date(as.mondate(STARTLISTING) + i)
  # Return the column itself; lapply() collects the returned values, so no
  # per-iteration variable has to be created.
  subset(datensatz_Start_End.frame, TIME <= cutoff_date)[, 1]
})
After that you can rbind results together using do.call
result <- do.call(rbind, LL)
Or, if you are confident that all elements of LL have the same columns, you can use the more efficient rbindlist from the data.table package
result <- rbindlist(LL)
check out rbind function. You can start with empty DELIST.DF and append each row to it inside the loop -
DELIST.DF <- NULL
for(i in 1:100){
T <- as.Date(as.mondate (STARTLISTING)+i)
DELIST <- (subset(datensatz_Start_End.frame, TIME <= T))[,1]
DELIST.DF <- rbind(DELIST.DF, DELIST)
write.table(DELIST, file = paste("tab", i, ".csv"), sep="," )
print(DELIST)
}

R: make pls calibration models from n number of subset and use them to predict different test sets

I am trying to apply a function I wrote that uses the 'pls' package to make a model and then use it
to predict several test set(in this case 9), returning the R2,RMSEP and prediction bias of each test set
for n number of subset selected from the data frame.
the function is
cpo<-function(data,newdata1,newdata2,newdata3,newdata4,newdata5,newdata6,newdata7,newdata8,newdata9){
data.pls<-plsr(protein~.,8,data=data,validation="LOO")#making a pls model
newdata1.pred<-predict(data.pls,8,newdata=newdata1) #using the model to predict test sets
newdata2.pred<-predict(data.pls,8,newdata=newdata2)
newdata3.pred<-predict(data.pls,8,newdata=newdata3)
newdata4.pred<-predict(data.pls,8,newdata=newdata4)
newdata5.pred<-predict(data.pls,8,newdata=newdata5)
newdata6.pred<-predict(data.pls,8,newdata=newdata6)
newdata7.pred<-predict(data.pls,8,newdata=newdata7)
newdata8.pred<-predict(data.pls,8,newdata=newdata8)
newdata9.pred<-predict(data.pls,8,newdata=newdata9)
pred.bias1<-mean(newdata1.pred-newdata1[742]) #calculating the prediction bias
pred.bias2<-mean(newdata2.pred-newdata2[742])
pred.bias3<-mean(newdata3.pred-newdata3[742]) #[742] reference values in column742
pred.bias4<-mean(newdata4.pred-newdata4[742])
pred.bias5<-mean(newdata5.pred-newdata5[742])
pred.bias6<-mean(newdata6.pred-newdata6[742])
pred.bias7<-mean(newdata7.pred-newdata7[742])
pred.bias8<-mean(newdata8.pred-newdata8[742])
pred.bias9<-mean(newdata9.pred-newdata9[742])
r<-c(R2(data.pls,"train"),RMSEP(data.pls,"train"),pred.bias1,
pred.bias2,pred.bias3,pred.bias4,pred.bias5,pred.bias6,
pred.bias7,pred.bias8,pred.bias9)
return(r)
}
To select n subsets (based on an answer to my earlier question, "Select several subsets by taking different row interval and appy function to all subsets")
and apply the cpo function to each subset, I tried:
Edited based on #Gavin advice
FO03 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets) # sub is the n number subsets created by selecting rows
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
for(i in sub) { #for every subset in sub i want to apply cpo
outList[[i]] <- cpo(data=sub,newdata1=gag11p,newdata2=gag12p,newdata3=gag13p,
newdata4=gag21p,newdata5=gag22p,newdata6=gag23p,
newdata7=gag31p,newdata8=gag32p,newdata9=gag33p) #new data are test sets loaded in the workspace
}
return(outlist)
}
FOO3(GAGp,10,10)
When I try this I keep getting 'Error in eval(expr, envir, enclos) : object 'protein' not found'.
Protein is used in the plsr formula of cpo, and is in the data set.
I then tried to use the plsr function directly as seen below
FOO4 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
cal<-vector("list", length=nSubsets) #for each subset in sub make a pls model for protein
names(cal)<-c(paste("cal",1:nSubsets, sep=""))
for(i in sub) {
cal[[i]] <- plsr(protein~.,8,data=sub,validation="LOO")
}
return(outlist) # return is just used to end script and check if error still occurs
}
FOO4(gagpm,10,10)
When I tried this I get the same error 'Error in eval(expr, envir, enclos) : object 'protein' not found'.
Any advice on how to deal with this and make the function work will be much appreciated.
I suspect the problem is immediately at the start of FOO3():
FOO3 <- function(data, nSubsets, nSkip) {
outList <- vector("list", r <- c(R2(data.pls,"train"), RMSEP(data.pls,"train"),
pred.bias1, pred.bias2, pred.bias3, pred.bias4, pred.bias5,
pred.bias6, pred.bias7, pred.bias8, pred.bias9))
Not sure what you are trying to do when creating outList, but vector() has two arguments and you seem to be assigning to r a vector of numerics that you want R to use as the length argument to vector().
Here you are using the object data.pls and this doesn't exist yet - and never will in the frame of FOO3() - it is only ever created in cpo().
Your second loop looks totally wrong - you are not assigning the output from cpo() to anything. I suspect you wanted:
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
....
for(i in subset) {
outList[[i]] <- cpo(....)
}
return(outList)
But that depends on what subset is etc. You also haven't got the syntax for this loop right. You have
for(i in(subset)) {
when it should be
for(i in subset) {
And subset and data aren't great names as these are common R functions and modelling arguments.
There are lots of problems with your code. Try to start simple and build up from there.
I have managed to achieve what I wanted using this; if there is a better way of doing it (I'm sure there must be), I'm eager to learn. This function performs the following tasks:
1. select "n" number of subsets from a dataframe
2. For each subset created, a plsr model is made
3. Each plsr model is used to predict 9 test sets
4. For each prediction, the prediction bias is calculated
far5<- function(data, nSubsets, nSkip){
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]} #sub is the subsets created
mop<- lapply(sub,cpr2) #assigning output from cpr to mop
names(mop)<-c(paste("mop", mop, sep=""))
return(names(mop))
}
call: far5(data,nSubsets, nSkip))
The first part -selecting the subsets is based on the answer to my question Select several subsets by taking different row interval and appy function to all subsets
I was then able to apply the function cpr2 to the subsets created using "lapply" instead of the "for' loop as was previously done.
cpr2 is a modification of cpo, for which only data is supplied, and the new data to be predicted is used directly in the function as shown below.
# Fit a PLS model for `protein` on `data` and evaluate it on the nine test
# sets gag11p ... gag33p, which are read from the enclosing workspace.
#
# Argument:
#   data  calibration data; used as `data` in plsr(protein ~ ., ...).
#
# Returns c(R2 on the training data, RMSEP on the training data, the nine
# prediction biases rounded to 2 significant digits), in test-set order.
cpr2 <- function(data) {
  # Number of PLS components used for fitting, predicting and the metrics.
  n_comp <- 8
  # Position of the reference values inside each test set.
  # NOTE(review): assumes column 742 is a plain numeric column - confirm.
  ref_col <- 742
  # To evaluate a different collection of test sets, only this list changes.
  test_sets <- list(gag11p, gag12p, gag13p,
                    gag21p, gag22p, gag23p,
                    gag31p, gag32p, gag33p)

  data.pls <- plsr(protein ~ ., n_comp, data = data, validation = "LOO")

  # Prediction bias per test set: mean of (predicted - reference).
  pred_bias <- vapply(test_sets, function(test_set) {
    predicted <- predict(data.pls, ncomp = n_comp, newdata = test_set)
    # [[ ]] extracts the reference values as a vector; [ ] would return a
    # one-column data frame, which mean() cannot average.
    mean(predicted - test_set[[ref_col]])
  }, numeric(1))

  # signif(, 2) keeps two significant digits of each bias.
  c(R2(data.pls, "train", ncomp = n_comp),
    RMSEP(data.pls, "train", ncomp = n_comp),
    signif(pred_bias, 2))
}
call:cpr2(data)
I was able to use this to solve my problem, however since the amount of new data to be predicted was only nine, it was possible to list them out as i did. If there is a more generalized way to do this I'm interested in learning.

Resources