I'm trying to use transformations of my outcomevar in a function that runs a few variants of models and stores the result in a list.
The runpanelsfunction first calls the prepare data function, which creates the lagged and differenced variables of the outcome variable specified as argument in the function. So after preparedata, model data contains outcomevar, doutcomevar and loutcomevar.
My problem is I now need to call/get these transformations of the outcomevar to subset the data such that loutcomevar and doutcomevar is not zero.
And then i need to use doutcomevar and loutcomevar in the models.
set.seed(1)
df <- data.frame(firm=rep(LETTERS[1:5],each=10),
date=as.Date("2014-01-01")+1:10,
y1=sample(1:100,50),y2=sample(1:100,50),y3=sample(1:100,50),
x1=sample(1:100,50), x2=sample(1:100,50))
preparedata<-function(testData,outcomevar){
require(data.table)
DT <- as.data.table(testData)
setkey(DT,firm,date)
DT[,lag := c(NA,unlist(.SD)[-.N]), by=firm, .SDcols=outcomevar]
DT[,diff := c(NA,diff(unlist(.SD))), by=firm, .SDcols=outcomevar]
setnames(DT,c("lag","diff"),paste0(c("loutcomevar","doutcomevar")))
return(DT)
modeldata<-as.data.frame(DT)
}
runpanels <- function(testData,outcomevar) {
modeldata<-preparedata(testData,outcomevar)
modeldata<-subset(modeldata,loutcomevar!=0& doutcomevar!=0)
modellist<-list()
modellist$m1<-lm(log(outcomevar)~-1+x1+x2,data=modeldata)
modellist$m2<-lm(log(doutcomevar)~-1+x1+date,data=modeldata)
modellist$m3<-lm(log(outcomevar)~-1+log(loutcomevar)+x1+x2,data=modeldata)
return(modellist)
}
Example use: modelsID1<-runpanels(df,outcomevar="y1")
Unsurprisingly, I get the error when it gets to evaluating "loutcomevar!=0"
: Error in eval(expr, envir, enclos) : object 'loutcomevar' not found
Called from: eval(e, x, parent.frame())
So it does not find the lagged variable i created in the prepare data function in the environment of the run panels function.
How can I call those variables?
The below example solution from another question was using call which is similar to my problem but i also want to call transformations of my outcomevar which is an argument of the function.
Any ideas how to tackle this will be much appreciated!
Example solution from other question that was kind of similar:
air <- data(airquality)
fm <- lm(Ozone ~ Solar.R, data=airquality)
myfun <- function(fm, name){
dn <- fm$call[['data']]
varname <- deparse(substitute(name))
get(as.character(dn),envir=.GlobalEnv)[varname]
}
Usage: myfun(fm, Temp)
You are assuming way too much capacity of the R interpreter to think like you do. It's powers of abstraction are much more limited. In particular there is no interpretation that would allow doutcomevar nd loutcomevar to be constructed within a formula or in the subset call.
Something allong these (untested) lines might work:
runpanels <- function(testData,outcomevar) {
modeldata<-preparedata(testData,outcomevar)
idx <- testData[[ paste0("l", outcomevar) ]] != 0 &
testData[[ paste0("d", outcomevar) ]] != 0
modeldata<-modeldata[idx ,]
modellist<-list()
form1 <- as.formula( "log(", outcomevar,")~-1+x1+x2" )
modellist$m1<-lm(log(outcomevar)~-1+x1+x2,data=modeldata)
#similar construction of formula objects for models 2 and 3
# .........
modellist$m2<-lm(log(doutcomevar)~-1+x1+date,data=modeldata)
modellist$m3<-lm(log(outcomevar)~-1+log(loutcomevar)+x1+x2,data=modeldata)
return(modellist)
}
set.seed(1)
df <- data.frame(firm=rep(LETTERS[1:5],each=10),
date=as.Date("2014-01-01")+1:10,
y1=sample(1:100,50),y2=sample(1:100,50),y3=sample(1:100,50),
x1=sample(1:100,50), x2=sample(1:100,50))
preparedata<-function(testData,outcomevar){
require(data.table)
DT <- as.data.table(testData)
setkey(DT,firm,date)
DT[,lag := c(NA,unlist(.SD)[-.N]), by=firm, .SDcols=outcomevar]
DT[,diff := c(NA,diff(unlist(.SD))), by=firm, .SDcols=outcomevar]
setnames(DT,c("lag","diff"),paste0(c("loutcomevar","doutcomevar")))
DT$outcomevar <- with(DT, eval(parse(text=outcomevar)))
return(DT)
modeldata<-as.data.frame(DT)
}
runpanels <- function(testData,outcomevar) {
modeldata<-preparedata(testData,outcomevar)
modeldata<-subset(modeldata,loutcomevar!=0& doutcomevar!=0)
modellist<-list()
modellist$m1<-lm(log(outcomevar)~-1+x1+x2,data=modeldata)
modellist$m2<-lm(log(doutcomevar)~-1+x1+date,data=modeldata)
modellist$m3<-lm(log(outcomevar)~-1+log(loutcomevar)+x1+x2,data=modeldata)
return(modellist)
}
Example use: modelsID1<-runpanels(df,outcomevar="y1")
Example use: modelsID1<-runpanels(df,outcomevar="y2")
Related
I'm running several models and functions with the models. I need to apply values from a list to a default R function, e.g., modelFit()
My idea is to run the same function to several models and compile the results after.
I'm trying to use loops and the Family of 'apply' functions in R, but with no success.
#package drc is necessary
library(drc)
#my data
rates <- c(.1,.1,.1,1,1,1,10,10,10,100,100,100,1000,1000,1000,.1,.1,.1,1,1,1,10,10,10,100,100,100,1000,1000,1000)
prod <- c("A","A","A","A","A","A","A","A","A","A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B","B","B","B","B","B")
resp <- c(.295,.32,.301,.155,.1501,.148,.05,.03,.044,.002,.001,.0015,.001,.0005,.0003,.312,.337,.299,.265,.2501,.248,.102,.103,.114,.02,.01,.015,.003,.0002,.0007)
data.test <- data.frame(rates,prod,resp) #my data frame
#my models
m1 <- drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod=="A",])
m2 <- drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod=="B",])
#lack of fit test
modelFit(m1)
modelFit(m2)
#I can get the modelFit p-values this way:
modelFit(m1)$"p value"[2]
modelFit(m2)$"p value"[2]
#I have several models. I want to create a loop to give me only the p value for each model fit. I want to use that for other information given by the summaries and function
#list of models
modelsList <- c("m1","m2")
#I can print the strings with the loop
for(i in modelsList){
print(i)
}
My idea was to use the strings to add information for the default 'drc' function modelFit(). The result would be a list with all p values from all the results in the loop, but I'm getting the error:
Error: $ operator is invalid for atomic vectors
#Not working
for(i in modelsList){
modelFit(i)$"p value"[2]
}
#Error
# Error: $ operator is invalid for atomic vectors
#Trying to use lapply
#Following this logic
lapply(modelsList, function(x) print(x))
#I could not get the results
lapply(modelsList, function(x) modelFit(x)$"p value"[2])
With the results, I'd continue and I'd create a data.frame with all the models and respective p values.
good to see you, Danilo
I think you missed drm function.
modelsList <- c("m1","m2")
prod = c("A","B")
type = data.frame(modelsList,
prod)
#I can print the strings with the loop
for(i in modelsList){
print(i)
}
for(i in modelsList){
model_Prod = type[modelsList == i,]$prod
drm_result<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
print(modelFit(drm_result)$"p value"[2])
}
# lapply
lapply(modelsList, function(x) {
model_Prod = type[modelsList == x,]$prod
x<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
modelFit(x)$"p value"[2]
}
)
Continuing, after Steve Lee inputs, I could generate a data.frame with all results I got from the loop:
#source of information:
#https://stackoverflow.com/questions/25285570/invalid-factor-level-with-rbind-to-data-frame
#df.results will be a data.frane with headers = "Model" and "ModelFit_pvalue""
df.results <- NULL
#Loop to print only specif information requested from function and add it to a data frame
for(i in modelsList){
model_Prod = type[modelsList == i,]$prod
drm_result<-drm(resp~rates, fct=LL.4(), data=data.test[data.test$prod==model_Prod,])
#print(i)
#print(modelFit(drm_result)$"p value"[2])
modelpvalue <- modelFit(drm_result)$"p value"[2] #store p values in a variable
de <- data.frame("Model"=i,"ModelFit_pvalue"= modelpvalue)
df.results = rbind(df.results,de)
}
df.results
#df.results output:
Model ModelFit_pvalue
1 m1 0.2346460
2 m2 0.5757368
I am still getting used with functions. I had a look in environments documentation but I can't figure out how to solve the error. Lets see what I tried until now:
I have a list of documents. Lets suppose it is "core"
library(dplyr)
table_1 <- data.frame(replicate(10,sample(0:1,1000,rep=TRUE)))
table_2 <- data.frame(replicate(10,sample(0:1,1000,rep=TRUE)))
core <- list(table_1, table_2)
Then, I have to run the function documents_ for each element of the list. This function gives some parameters to execute in another nested function:
documents_ <- function(i) {
core_processed <- as.data.frame(core[[i]])
x <- 1:nrow(core_processed)
y <- 1:ncol(core_processed)
temp <- sapply(x, function(x) mapply(calc_dens_,x,y))
return(temp)
}
Inside that, there is the function calc_dens, which is:
calc_dens_ <- function(x, y) {
core_temp <- core_processed %>%
filter(X2 == x & X3 == y)
return(core_temp)
}
Then, for iterate for each element of the list, I tried without success:
calc <- lapply(c(1:2), function(i) documents_(i))
Error in eval(lhs, parent, parent) : object 'core_processed' not found
The calc_dens function doesn't get the results of the documents_ (environment problem. Is there a way to solve this, or another better approach? My function is more complex than this, but the main elements are in this example. Thank you in advance.
As the other commenters have said, the problem is that you are referring to a variable, core_processed that is not in scope. You can make it a global variable, but it might be more sensible just to use it in a closure like this:
table_1 <- data.frame(replicate(10,sample(0:1,1000,rep=TRUE)))
table_2 <- data.frame(replicate(10,sample(0:1,1000,rep=TRUE)))
cores <- list(table_1, table_2)
documents_ <- function(core_processed) {
x <- 1:nrow(core_processed)
y <- 1:ncol(core_processed)
calc_dens <- function(x, y) core_processed %>% filter(X2 == x & X3 == y)
sapply(x, function(x) mapply(calc_dens, x, y))
}
calc <- lapply(cores, documents_)
If cores is a list of data frames, you do not need to to use as.data.frame and since you use lapply, there is no need to apply over indices and then index into the list. So the code I wrote here is simplified but does the same as your code.
I have to wonder, though, is this really what you want? The sapply over x and then mapply over x and y -- where x is the one from the sapply and not the ist you built in documents_ -- looks mighty strange to me.
I am trying to make a simple function which will find and remove outliers automatically. This is the function I have created so far:
fOutlier <- function(x, y) {
outlier <- with(x, boxplot.stats(y)$out)
subset(x, !(y %in% outlier))
}
data <- fOutlier(data, variable)
The problem is that the function does not read x as dataset name. It works if I use the following:
data <- fOutlier(data, data$variable)
Non-standard evaluation seems to be the culprit.
This is what I would personally do.
set.seed(1)
# mock data set
d<-data.frame(var1=rnorm(1000,500,50),
var2=rnorm(1000,1000,100),
var3=rnorm(1000,1000,100),
var4=rnorm(1000,1000,100))
fOutlier<-function(dat, var_name){
var_vec<-dat[,var_name]
outliers<-boxplot.stats(var_vec)$out
clean_dat<-dat[!(var_vec %in% outliers),]
}
# test with different variables
d_var1_clean<-fOutlier(d, 'var1')
d_var2_clean<-fOutlier(d, 'var2')
d_var3_clean<-fOutlier(d, 'var3')
If you really like the non-standard evaluation, then you can add eval() and substitute() to maintain this functionality.
This function is a workable version of what you posted (note the creation of y_vec):
fOutlier2 <- function(x, y) {
y_vec<-eval(substitute(y),eval(x))
outlier <- boxplot.stats(y_vec)$out
subset(x, !(y_vec %in% outlier))
}
d_var1_clean2<-fOutlier2(d, var1)
I have roughly this function:
plot_pca_models <- function(models, id) {
library(lattice)
splom(models, groups=id)
}
and I'm calling it like this:
plot_pca_models(data.pca, log$id)
wich results in this error:
Error in eval(expr, envir, enclos) : object 'id' not found
when I call it without the wrapping function:
splom(data.pca, groups=log$id)
it raises this error:
Error in log$id : object of type 'special' is not subsettable
but when I do this:
id <- log$id
splom(models, groups=id)
it behaves as expected.
Please can anybody explain why it behaves like this and how to correct it? Thanks.
btw:
I'm aware of similar questions here, eg:
Help understand the error in a function I defined in R
Object not found error with ddply inside a function
Object disappears from namespace in function
but none of them helped me.
edit:
As requested, there is full "plot_pca_models" function:
plot_pca_models <- function(data, id, sel=c(1:4), comp=1) {
# 'data' ... princomp objects
# 'id' ... list of samples id (classes)
# 'sel' ... list of models to compare
# 'comp' ... which pca component to compare
library(lattice)
models <- c()
models.size <- 1:length(data)
for(model in models.size) {
models <- c(models, list(data[[model]]$scores[,comp]))
}
names(models) <- 1:length(data)
models <- do.call(cbind, models[sel])
splom(models, groups=id)
}
edit2:
I've managed to make the problem reproducible.
require(lattice)
my.data <- data.frame(pca1 = rnorm(100), pca2 = rnorm(100), pca3 = rnorm(100))
my.id <- data.frame(id = sample(letters[1:4], 100, replace = TRUE))
plot_pca_models2 <- function(x, ajdi) {
splom(x, group = ajdi)
}
plot_pca_models2(x = my.data, ajdi = my.id$id)
which produce the same error like above.
The problem is that splom evaluates its groups argument in a nonstandard way.A quick fix is to rewrite your function so that it constructs the call with the appropriate syntax:
f <- function(data, id)
eval(substitute(splom(data, groups=.id), list(.id=id)))
# test it
ir <- iris[-5]
sp <- iris[, 5]
f(ir, sp)
log is a function in base R. Good practice is to not name objects after functions...it can create confusion. Type log$test into a clean R session and you'll see what's happening:
object of type 'special' is not subsettable
Here's a modification of Hong Oi's answer. First I would recommend to include id in the main data frame, i.e
my.data <- data.frame(pca1 = rnorm(100), pca2 = rnorm(100), pca3 = rnorm(100), id = sample(letters[1:4], 100, replace = TRUE))
.. and then
plot_pca_models2 <- function(x, ajdi) {
Call <- bquote(splom(x, group = x[[.(ajdi)]]))
eval(Call)
}
plot_pca_models2(x = my.data, ajdi = "id")
The cause of the confusion is the following line in lattice:::splom.formula:
groups <- eval(substitute(groups), data, environment(formula))
... whose only point is to be able to specify groups without quotation marks, that is,
# instead of
splom(DATA, groups="ID")
# you can now be much shorter, thanks to eval and substitute:
splom(DATA, groups=ID)
But of course, this makes using splom (and other functions e.g. substitute which use "nonstandard evaluation") harder to use from within other functions, and is against the philosophy that is "mostly" followed in the rest of R.
I am trying to apply a function I wrote that uses the 'pls' package to make a model and then use it
to predict several test set(in this case 9), returning the R2,RMSEP and prediction bias of each test set
for n number of subset selected from the data frame.
the function is
cpo<-function(data,newdata1,newdata2,newdata3,newdata4,newdata5,newdata6,newdata7,newdata8,newdata9){
data.pls<-plsr(protein~.,8,data=data,validation="LOO")#making a pls model
newdata1.pred<-predict(data.pls,8,newdata=newdata1) #using the model to predict test sets
newdata2.pred<-predict(data.pls,8,newdata=newdata2)
newdata3.pred<-predict(data.pls,8,newdata=newdata3)
newdata4.pred<-predict(data.pls,8,newdata=newdata4)
newdata5.pred<-predict(data.pls,8,newdata=newdata5)
newdata6.pred<-predict(data.pls,8,newdata=newdata6)
newdata7.pred<-predict(data.pls,8,newdata=newdata7)
newdata8.pred<-predict(data.pls,8,newdata=newdata8)
newdata9.pred<-predict(data.pls,8,newdata=newdata9)
pred.bias1<-mean(newdata1.pred-newdata1[742]) #calculating the prediction bias
pred.bias2<-mean(newdata2.pred-newdata2[742])
pred.bias3<-mean(newdata3.pred-newdata3[742]) #[742] reference values in column742
pred.bias4<-mean(newdata4.pred-newdata4[742])
pred.bias5<-mean(newdata5.pred-newdata5[742])
pred.bias6<-mean(newdata6.pred-newdata6[742])
pred.bias7<-mean(newdata7.pred-newdata7[742])
pred.bias8<-mean(newdata8.pred-newdata8[742])
pred.bias9<-mean(newdata9.pred-newdata9[742])
r<-c(R2(data.pls,"train"),RMSEP(data.pls,"train"),pred.bias1,
pred.bias2,pred.bias3,pred.bias4,pred.bias5,pred.bias6,
pred.bias7,pred.bias8,pred.bias9)
return(r)
}
selecting n number of subsets (based on an answer from my question[1]: Select several subsets by taking different row interval and appy function to all subsets
and applying cpo function to each subset I tried
Edited based on #Gavin advice
FO03 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets) # sub is the n number subsets created by selecting rows
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
for(i in sub) { #for every subset in sub i want to apply cpo
outList[[i]] <- cpo(data=sub,newdata1=gag11p,newdata2=gag12p,newdata3=gag13p,
newdata4=gag21p,newdata5=gag22p,newdata6=gag23p,
newdata7=gag31p,newdata8=gag32p,newdata9=gag33p) #new data are test sets loaded in the workspace
}
return(outlist)
}
FOO3(GAGp,10,10)
when I try this I keep getting 'Error in eval(expr, envir, enclos) : object 'protein' not found' not found.
Protein is used in the plsr formula of cpo, and is in the data set.
I then tried to use the plsr function directly as seen below
FOO4 <- function(data, nSubsets, nSkip){
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]
}
cal<-vector("list", length=nSubsets) #for each subset in sub make a pls model for protein
names(cal)<-c(paste("cal",1:nSubsets, sep=""))
for(i in sub) {
cal[[i]] <- plsr(protein~.,8,data=sub,validation="LOO")
}
return(outlist) # return is just used to end script and check if error still occurs
}
FOO4(gagpm,10,10)
When I tried this I get the same error 'Error in eval(expr, envir, enclos) : object 'protein' not found'.
Any advice on how to deal with this and make the function work will be much appreciated.
I suspect the problem is immediately at the start of FOO3():
FOO3 <- function(data, nSubsets, nSkip) {
outList <- vector("list", r <- c(R2(data.pls,"train"), RMSEP(data.pls,"train"),
pred.bias1, pred.bias2, pred.bias3, pred.bias4, pred.bias5,
pred.bias6, pred.bias7, pred.bias8, pred.bias9))
Not sure what you are trying to do when creating outList, but vector() has two arguments and you seem to be assigning to r a vector of numerics that you want R to use as the length argument to vector().
Here you are using the object data.pls and this doesn't exist yet - and never will in the frame of FOO3() - it is only ever created in cpo().
Your second loop looks totally wrong - you are not assigning the output from cpo() to anything. I suspect you wanted:
outList <- vector("list", 11)
names(outList) <- c("R2train","RMSEPtrain", paste("bias", 1:9, sep = ""))
....
for(i in subset) {
outList[[i]] <- cpo(....)
}
return(outList)
But that depends on what subset is etc. You also haven't got the syntax for this loop right. You have
for(i in(subset)) {
when it should be
for(i in subset) {
And subset and data aren't great names as these are common R functions and modelling arguments.
There are lots of problems with your code. Try to start simple and build up from there.
I have managed to achieved what i wanted using this, if there is a better way of doing it (i'm sure there must be) I'm eager to learn.This function preforms the following task
1. select "n" number of subsets from a dataframe
2. For each subset created, a plsr model is made
3. Each plsr model is used to predict 9 test sets
4. For each prediction, the prediction bias is calculated
far5<- function(data, nSubsets, nSkip){
sub <- vector("list", length = nSubsets)
names(sub) <- c( paste("sub", 1:nSubsets, sep = ""))
totRow <- nrow(data)
for (i in seq_len(nSubsets)) {
rowsToGrab <- seq(i, totRow, nSkip)
sub[[i]] <- data[rowsToGrab ,]} #sub is the subsets created
mop<- lapply(sub,cpr2) #assigning output from cpr to mop
names(mop)<-c(paste("mop", mop, sep=""))
return(names(mop))
}
call: far5(data,nSubsets, nSkip))
The first part -selecting the subsets is based on the answer to my question Select several subsets by taking different row interval and appy function to all subsets
I was then able to apply the function cpr2 to the subsets created using "lapply" instead of the "for' loop as was previously done.
cpr2 is a modification of cpo, for which only data is supplied, and the new data to be predicted is used directly in the function as shown below.
cpr2<-function(data){
data.pls<-plsr(protein~.,8,data=data,validation="LOO") #make plsr model
gag11p.pred<-predict(data.pls,8,newdata=gag11p) #predict each test set
gag12p.pred<-predict(data.pls,8,newdata=gag12p)
gag13p.pred<-predict(data.pls,8,newdata=gag13p)
gag21p.pred<-predict(data.pls,8,newdata=gag21p)
gag22p.pred<-predict(data.pls,8,newdata=gag22p)
gag23p.pred<-predict(data.pls,8,newdata=gag23p)
gag31p.pred<-predict(data.pls,8,newdata=gag31p)
gag32p.pred<-predict(data.pls,8,newdata=gag32p)
gag33p.pred<-predict(data.pls,8,newdata=gag33p)
pred.bias1<-mean(gag11p.pred-gag11p[742]) #calculate prediction bias
pred.bias2<-mean(gag12p.pred-gag12p[742])
pred.bias3<-mean(gag13p.pred-gag13p[742])
pred.bias4<-mean(gag21p.pred-gag21p[742])
pred.bias5<-mean(gag22p.pred-gag22p[742])
pred.bias6<-mean(gag23p.pred-gag23p[742])
pred.bias7<-mean(gag31p.pred-gag31p[742])
pred.bias8<-mean(gag32p.pred-gag32p[742])
pred.bias9<-mean(gag33p.pred-gag33p[742])
r<-signif(c(pred.bias1,pred.bias2,pred.bias3,pred.bias4,pred.bias5,
pred.bias6,pred.bias7,pred.bias8,pred.bias9),2)
out<-c(R2(data.pls,"train",ncomp=8),RMSEP(data.pls,"train",ncomp=8),r)
return(out)
} #signif use to return 2 decimal place for prediction bias
call:cpr2(data)
I was able to use this to solve my problem, however since the amount of new data to be predicted was only nine, it was possible to list them out as i did. If there is a more generalized way to do this I'm interested in learning.