create data frame in R - r

see: Selecting significant cases from a chi-squared test
The model example given in the case above is:
f = function(N=1000){
out <- data.frame("Row" = 1:N
, "Column" = 1:N
, "Chi.Square" = runif(N)
, "df"= sample(N, 1:10, replace=T)
, "p.value" = round(runif(N), 3)
)
return(out)
}
but when I would apply this to my model I would turn this into:
f = function(N=7000){
combos <- combn(ncol(final),2)
adply(combos, 2, function(x) {
test <- chisq.test(final[, x[1]], final[, x[2]])
out <- data.frame("Row" = colnames(final)[x[1]]
, "Column" = colnames(final[x[2]])
, "Chi.Square" = round(test$statistic,3)
, "df"= test$parameter
, "p.value" = round(test$p.value, 3)
)
return(out)
}}
yet R does not see this as a finished command line. Why?

Get yourself a decent editor :-)
adply(
isn't closed.
Edit: nor
function(...){
It looks like the final } should really be a ) + }

Related

Error in { : task 1 failed - "could not find function "ranger"" >

I was able to run the following code without any problems:
# first code: works fine
library(dplyr)
library(ranger)
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
I am now trying to run the same code (Step 2, Step 3, Step 4) in parallel - here is my attempt:
# second code: does not work fine
library(doParallel)
library(foreach)
registerDoParallel(cores = detectCores())
foreach(i = 1:100) %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
}
stopImplicitCluster()
This is giving me the following error:
Error in { : task 1 failed - "could not find function "ranger""
I am not sure why this error is being produced, seeing as I have loaded the "ranger" library.
My Question: Can someone please show me what I am doing wrong and how can I make the second code run like the first code?
Thanks!
Note : After adding the suggestion made by #Waldi, the code doesn't produce an error, but is taking a very long time to run. Does anyone have any recommendations on how to improve this?
You can specify the packages you need using the .packages argument in foreach:
foreach(i = 1:100, .packages = 'ranger') %dopar% {...}
Detailed explanation on footnote regarding parallel processing being slow can be found here

Nested if else statements in R. Keep getting "Error: no function to return from, jumping to top level"

I know there are lots of questions like this around and tried the solutions proposed. But still I could not solve the following.
My aim is to create a function in R which would correlate pairs of columns in data frame. Dependent on the number of pairwise complete observations it would use slightly different approaches.
The problem here is no matter what I try, while declaring the function, I keep getting:
Error: no function to return from, jumping to top level
and
Error: unexpected '}' in "}"
Here is the function:
corr.loop <- function(df, varsA, varsB, normal, nonnormal) {
results <- matrix(ncol = 8)
colnames(results) <- c("varA", "varB", "type", "complete.obs.n", "estimate", "p", "lower.CI", "upper.CI")
for (i in 1:length(varsA)) {
for (j in 1:length(varsB)) {
if (
pairwise.complete.obs.n(df[, varsA[i]], df[, varsB[j]]) < 3
) {
results <- rbind(results,
c(
varsA[i],
varsB[j],
NA,
pairwise.complete.obs.n(df[, varsA[i]], df[, varsB[j]]),
rep(NA, 4)
))
} else {
type <- ifelse( (varsA[i] %in% nonnormal | varsB[j] %in% nonnormal), "spearman", "pearson")
cor.results <- ifelse(
type == "pearson",
cor.test(
x = df[, varsA[i]],
y = df[, varsB[j]],
alternative = "two.sided",
method = "pearson",
exact = TRUE,
conf.level = 0.95,
continuity = TRUE
),
cor.test(
x = df[, varsA[i]],
y = df[, varsB[j]],
alternative = "two.sided",
method = "spearman",
exact = TRUE,
conf.level = 0.95,
continuity = TRUE
)
)
if (
pairwise.complete.obs.n(df[, varsA[i]], df[, varsB[j]]) = 3
) {
results <- rbind(
results,
c(
varsA[i],
varsB[j],
type,
pairwise.complete.obs.n(df[, varsA[i]], df[, varsB[j]]),
cor.results$estimate,
cor.results$p.value,
NA,
NA
)
)
} else {
results <- rbind(
results,
c(
varsA[i],
varsB[j],
type,
pairwise.complete.obs.n(df[, varsA[i]], df[, varsB[j]]),
cor.results$estimate,
cor.results$p.value,
cor.results$conf.int[1],
cor.results$conf.int[2]
)
)
}
}
}
}
results <- as.data.frame(results[-1, ])
results[, 1:ncol(results)] <- lapply(results[, 1:ncol(results)], as.character)
results[, 4:ncol(results)] <- lapply(results[, 4:ncol(results)], as.numeric)
return(results)
}
Is there something obvious I am missing? Seems I just need a fresh eye here. Thank you!
Line 45 you mean == instead of =

rgenoud - How to pass parameters to the function?

I have a function that currently plays nice with rgenoud. It has one parameter (xx) and rgenoud will optimize xx perfectly.
However, I would like to add a second parameter to my function that wouldnt be optimized by rgendoud . For example, I would like my function to either fit a model with a gaussian link or a poisson link and to specify that when I call rgenoud.
Any idea?
thanks
edit: here is a minimal working example of what I mean. How would you get the last line to work?
adstock reflect the fact that TV advertising should have an impact on the number of quotes of future weeks.
Adstock[t] = Ads[t] + rate* Ads[t-1] + rate^2*Ads[t-2] + .... + rate^max_memory * Ads[t-max_memory]
We want rgenoud to figure out what rate and max_memory will return the model with the best fit. Best fit is defined as the lowest RMSE.
set.seed(107)
library(fpp)
library(rgenoud)
adstock_k <- function(x, adstock_rate = 0, max_memory = 12){
learn_rates <- rep(adstock_rate, max_memory+1) ^ c(0:max_memory)
adstocked_advertising <- stats::filter(c(rep(0, max_memory), x), learn_rates, method="convolution")
adstocked_advertising <- adstocked_advertising[!is.na(adstocked_advertising)]
return(as.numeric(adstocked_advertising))
}
getRMSE <- function(x, y) {
mean((x-y)^2) %>% sqrt
}
df <- data.frame(insurance) %>%
mutate(Quotes = round (Quotes*1000, digits = 0 ))
df$idu <- as.numeric(rownames(df))
my_f <- function(xx){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
domaine <- cbind(c(30,1), c(85, 8))
#this works
min_f <- genoud(my_f, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
#here I try to add a second parameter to the function.
my_f2 <- function(xx,first_n_weeks=20){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
filter(idu<= first_n_weeks) %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
#this doesnt work
min_f2 <- genoud(my_f2(first_n_week=10), nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
Include the argument in the call to genoud, e.g.
genoud(my_f2, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T, first_n_weeks = 10)

Function inputs from a list

How can I run a function (in R) where some of the inputs are pulled from a list (or data frame)? Am I right in thinking that this would be more efficient than running a for-loop?
I am running simulations and want to change the variable values, but as they take a long time to run I want them to run overnight and to just tick through the different values automatically.
Here's the code for the function:
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000 ### REDUCED FOR THIS EXAMPLE ###
var.values <- NULL
var.values.pop <- NULL
hist.fn <- function(n,mu,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
for(i in 1:isim) {
for(j in 1:iboot) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
hist.pop <- hist(var.values.pop,plot=F)
hist.boot <- hist(var.values,plot=F)
#mypath = file.path("C:", "Output", paste("hist.boot_n.", n.sub, "_var.", sd^2, "_isim.", isim, "_iboot.", iboot, ".wmf", sep=""))
#win.metafile(file=mypath)
plot.new() #### ADDED FOR THIS EXAMPLE INSTEAD OF OUTPUTTING TO FILE ####
plot(hist.pop, freq=FALSE, xlim=range(var.values.pop, var.values), ylim=range(hist.pop$density, hist.boot$density), main = paste("Histogram of variances \n n=",n.sub," mu=",mu,"var=",sd^2,"\n n.sim=",isim,"n.boot=",iboot,"\n"), cex.main=0.8, xlab="Variance", col="red")
plot(hist.boot, freq=FALSE, col="blue", border="blue", add=T, density=20, angle=45)
abline(v=var.pop, lty=2, col="black", lwd=2)
legend("topright", legend=c("sample","bootstrap"),col=c("red","blue"),lty=1,lwd=2,bty="n",cex=0.7)
#dev.off()
}
hist.fn(n,mu,sd,n.sub,iboot)
Then I want sd, n.sub, and iboot to change by running through the following values:
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
Perhaps something like this?
n = 10000
mu = 0
sd = 1
n.sub = 100
iboot = 100
isim = 1000
sd <- c(1,10,100,1000)
n.sub <- c(4,10,100,1000)
iboot <- c(100,1000,10000)
# hist.fn parameters: n,mu,sd,n.sub,iboot
params <- expand.grid(n = n, mu = mu, sd = sd,
n.sub = n.sub, iboot = iboot)
apply(params, 1, FUN = function(x) do.call(hist.fn, as.list(x) ) )
You probably want to put these:
var.values <- NULL
var.values.pop <- NULL
Inside hist.fn, because assigning values to variables outside a function doesn't work like you seem to think.
You should use do.call, which will apply the function using arguments in a list. I have simplified your example to run less loops for the example. You can modify the printline of the script in order to monitor your progress for a larger job:
# The function
hist.fn <- function(n,mu,isim,sd,n.sub,iboot)
{
Pop <- rnorm(n,mu,sd)
var.pop <- var(Pop)
Samp <- sample(Pop, n.sub, replace = FALSE)
var.samp <- var(Samp)
var.values <- NaN*seq(isim) # sets up an empty vector for results
var.values.pop <- NaN*seq(isim) # sets up an empty vector for results
for(i in seq(isim)) {
for(j in seq(iboot)) {
Boot <- sample(Samp, n.sub, replace = TRUE)
var.values[j] <- var(Boot)
print(paste("i =", i, "; j =", j))
}
Samp <- sample(Pop, n.sub, replace = FALSE)
var.values.pop[i] <- var(Samp)
}
list(var.values=var.values, var.values.pop=var.values.pop) #returns results in the form of a list
}
# Global variables
n = 100
mu = 0
isim = 10
# Changing variables
sd <- c(1,10,20,30)
n.sub <- c(4,10,20,30)
iboot <- c(100,200,300,400)
df <- data.frame(sd=sd, n.sub=n.sub, iboot=iboot)
res <- vector(mode="list", nrow(df)) # sets up an empty list for results
for(i in seq(nrow(df))){
res[[i]] <- do.call(hist.fn, c(n=n, mu=mu, isim=isim, df[i,]) )
}
res # show results
sd <- 1:3
n.sub <- 4:6
iboot <- 7:9
funct1<-function(x,y,z) print(x+y+z)
for (i in 1:length(sd)){
funct1(sd[i],n.sub[i],iboot[i])
}
just an example. Doing it with loop.

Performance: combn on large data.table

Lets start with some generated data which are pretty realistic:
tmp <- data.table(
label = sprintf( "X%03d", 1:500),
start = sample( 50:950, 500, replace=TRUE ),
length = round( 20 * rf( rep(1, 500), 5, 5 ), 0 )
)
DT <- tmp[ , list( t = seq( start, length.out=length ) ), by = label ]
DT[ , I := sample(1:100, 1) * dbeta( seq(from=0,to=1, length.out=length(t)), sample(3:6,1), sample(5:10,1) ), by = label ]
DT <- DT[ I > 1E-2 ]
DT represents time series data for (in this case) 500 labels:
library(ggplot2)
ggplot( DT[ t %between% c(100,200) ], aes( x = t, y = I, group = label ) ) +
geom_line()
I want to correlate the data by all label pairs, given that they have a sufficient overlap. This is my approach:
# feel free to use just a subset here
labs <- DT[ , unique( label ) ][1:50]
# is needed for fast intersecting
setkey( DT, t )
# just needed for tracking progress
count <- 0
progress <- round(seq( from = 1, to = length(labs) * (length(labs) -1) / 2, length.out=100 ),0)
corrs <-
combn( labs, m=2, simplify=TRUE, minOverlap = 5, FUN = function( x, minOverlap ) {
# progress
count <<- count + 1
if( count %in% progress ){
cat( round( 100*count/max(progress),0 ), ".." )
}
# check overlap and correlate
a <- DT[label == x[1]]
b <- DT[label == x[2]]
iscectT <- intersect( a[ , t], b[ , t] )
n <- length(iscectT)
if( n >= minOverlap ){
R <- cor( a[J(iscectT)][, I], b[J(iscectT)][, I] )
return( c( x[1], x[2], n, min(iscectT), max(iscectT), R) )
}
else{
# only needed because of simplify = TRUE
return( rep(NA, 6) )
}
})
This works pretty fine, but is much slower than expected. In the particular case this would take up to 10 minutes on my machine.
Any help on improving the performance of this approach is highly appreciated. Questions which came to my mind:
Do I have to expect any side effects concerning on DTif I would deploy one of R's parallelization mechanisms, e.g. foreach? Is there a parallelization interface for data.table as there is for example for plyr?
Is there a way of using combn with simplify = FALSE without having horrible runtimes the longer the process goes. I assume that a lot of list copying takes place because increasing list capacities.
Is there anything I can do on the algorithmic side to make this faster?
As Roland suggested in his comment, using combn just to calculate the combinations of labels and then perform directly joins on the data.table, is magnitudes faster:
corrs <- as.data.frame(do.call( rbind, combn(labs, m=2, simplify = FALSE) ), stringsAsFactors=FALSE)
names(corrs) <- c("a", "b")
setDT(corrs)
setkey(DT, label)
setkey( corrs, a )
corrs <- corrs[ DT, nomatch = 0, allow.cartesian = TRUE]
setkey(corrs, b, t)
setkey(DT, label, t)
corrs <- corrs[ DT, nomatch = 0 ]
corrs[ , overlap := .N >= minOverlap , by = list(a,b) ]
corrs <- corrs[ (overlap) ]
corrs <- corrs[ ,list( start = min(t), end = max(t), R = cor(I,I.1) ), by = list(a,b) ]

Resources