Linking two functions - r

I have to admit I am new to coding functions, wherefore I need your help.
This code shall provide a Bayesian criterion (pBIC) following an ANOVA and automatically read the necessary information from the ANOVA table.
I have two functions
## This is function 1
test_pBIC1 <- function(name,c){ ## name is the name of the ANOVA table, e.g. "ANOVA_ALL_wake" and c is the number of conditions
c = c
data = get(name)
i = length(data$ANOVA$Effect)
result1 = data.frame(name,c,i)
return(result1)
}
## ----------------------------------------------------
## I now run and save the result of Function 1
result1 <- test_pBIC1("ANOVA_ALL_wake",3) ## for test
## ----------------------------------------------------
## This is function 2
test_pBIC2 <- function(result1){
name1 <- as.character(result1$name)
data = get(name1)
count <- as.vector(result1$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}
## ------------------------------------------------------
Now I run function 2 and receive the result
test_pBIC2(result1)
Now while this does it's job, I would like to link the two functions so I just have to give the name and the parameter c and still get result_all in the end, i.e. without having to run the two functions after each other.
I have tried to come up with this solution:
test_pBIC <- function(name,c){ ## pass arguments as: test_pBIC(name = "ANOVA_all_wake", c = 3)
c = c
name = name
result1 = data.frame(name,c)
# return(result1)
test_pBIC1 <- function(result1){
c = as.vector(result1$c)
name1 <- as.character(result1$name)
data = get(name)
i = length(data$ANOVA$Effect)
result2 = data.frame(name,c,i)
# return(result2)
test_pBIC2 <- function(result2){
name1 <- as.character(result2$name)
data = get(name1)
count <- as.numeric(integer$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}
}
}
test_pBIC("ANOVA_all_wake", 3)
However, I just get NOTHING...and I cannot find the mistake :(.
Thanks!!

Not entirely sure what the issue is, a reproducible example would help a lot. If you want to just combine it into one function you could do...
test_overall <- function(name,c) {
c = c
data = get(name)
i = length(data$ANOVA$Effect)
result1 = data.frame(name,c,i)
name1 <- as.character(result1$name)
data = get(name1)
count <- as.vector(result1$i)
for (i in 1:count){
s = (data$ANOVA$DFd[i]/data$ANOVA$DFn[i])+1
n = s*(result1[2]-1)
SSE1 = data$ANOVA$SSd[i]
SSE0 = data$ANOVA$SSd[i]+data$ANOVA$SSn[i]
deltaBIC = (n * log(SSE1/SSE0))+(data$ANOVA$DFn[i]*log(n))
BF01 = exp(deltaBIC/2)
pH0_D = (BF01/(1+BF01))
pH1_D = (1-pH0_D)
result = data.frame(pH0_D, pH1_D)
colnames(result) <- c("pH0_D", "pH1_D")
rownames(result) <- c(data$ANOVA$Effect[i])
if (i == 1){
result_all <- result
} else {
result_all <- rbind (result_all, result)
}
}
return(result_all)
}

In your first code example you've created functions test_pBIC1 and test_pBIC2. If you want to create a function test_pBIC that calls both, you can just define a function that calls both:
test_pBIC <- function(name, c) test_pBIC2(test_pBIC1(name, c))

Related

using map() instead of for loop

My task is to recreate the result of this for loop by using the map() function. Unfortunately, I can't get my head around this.
play_roulette <- function(bet, number) {
draw <- sample(0:36, 1)
tibble(
winning_number = draw,
your_number = number,
your_bet = bet,
your_return = if (number == draw) {
bet * 36
} else {
0
}
)
}
desired_length <- 10
list_w_for <- vector(mode = "list", length = desired_length)
for(i in seq_along(list_w_for)){
list_w_for[[i]] <- play_roulette(bet = 1, number = 5)
}
for_tibble <- bind_rows(list_w_for)
for_tibble
My current map code:
num_vec <- 1:10
bet_vec <- 1
tibble_2c <- tibble(x= bet_vec, y= num_vec)
map_dfc( tibble_2c,
play_roulette(bet = x, number = y))
You just have to call the function 10 times, since the iterator i is not used inside of the function.
# use *_dfr to row_bind the result
map_dfr(
# call the function ten times
1:10,
# note that .x, the default iterator in map, is not needed inside of the function
~play_roulette(bet = 1, number = 5))

How to define optional arguments in a function

I'm writing a function to get some descriptive stats from a data frame. The function takes three argument: data set, set of numerical variables, set of character variables. I managed to write the function to successfully obtain the required result when the both numerical and character variables are identified within the argument. However, when one of these argument is missing, I'd like the function to return a list with two components with the missing argument as NULL within its component.
Here's the code I've written. Please let me know if you have an answer.
table1 <- function(dat, numvar, charvar){
result_n <- numeric()
result_c <- data.frame()
#This is the original table function for numerical values
for (i in 1:length(numvar)) {
new_row <- c(round(mean(dat[[numvar[i]]],na.rm = T),2) ,
round(median(dat[[numvar[i]]],na.rm = T),2),
round(sd(dat[[numvar[i]]],na.rm = T),2),
length(dat[[numvar[i]]])-sum(is.na(dat[[numvar[i]]])),
sum(is.na(dat[[numvar[i]]])))
result_n <- rbind(result_n,new_row)
}
rownames(result_n) <- numvar
colnames(result_n) <- c("Mean", "Median", "SD", "N", "N_miss")
#Thisi is the new table for char values
for (i in 1:length(charvar)) {
tab.dat <- as.data.frame(table(dat[charvar[i]],useNA = "ifany" ))
a1 <- as.character(tab.dat$Var1)
a1[3] <- "NMiss"
one.table <- data.frame(
Varname = c(charvar[i], rep(" ", nrow(tab.dat)-1)),
group = a1,
count= tab.dat$Freq)
result_c <- rbind(result_c, one.table)
}
result_list <- list(numericStats = result_n, FactorStats =result_c)
return(result_list)
}
You can set the default value to a function:
table1 <- function(dat = NULL, numvar = NULL, charvar = NULL) {...
From there, the script can determine which is missing and go from there.
Here's the answer:
table1 <- function(dat, numvar=NULL, charvar=NULL){
result_n <- numeric()
result_c <- data.frame()
#This is the original table function for numerical values
#I borrowed builtin function (ifmissing) from the internet
if(!missing(numvar)) {for (i in 1:length(numvar)) {
new_row <- c(round(mean(dat[[numvar[i]]],na.rm = T),2) ,
round(median(dat[[numvar[i]]],na.rm = T),2),
round(sd(dat[[numvar[i]]],na.rm = T),2),
length(dat[[numvar[i]]])-sum(is.na(dat[[numvar[i]]])),
sum(is.na(dat[[numvar[i]]])))
result_n <- rbind(result_n,new_row)
}
rownames(result_n) <- numvar
colnames(result_n) <- c("Mean", "Median", "SD", "N", "N_miss")}
#Thisi is the new table for char values
#I borrowed builtin function (ifmissing) from the internet
if(!missing(charvar)) {for (i in 1:length(charvar)) {
tab.dat <- as.data.frame(table(dat[charvar[i]],useNA = "ifany" ))
a1 <- as.character(tab.dat$Var1)
a1[3] <- "NMiss"
one.table <- data.frame(
Varname = c(charvar[i], rep(" ", nrow(tab.dat)-1)),
group = a1,
count= tab.dat$Freq)
result_c <- rbind(result_c, one.table)
}}
result_list <- list(numericStats = result_n, FactorStats =result_c)
return(result_list)
}

R non overlapping sample - faster function

I have a table with more than one contract per client. I want to take a sample but not allowing more than one contract per client within 6 months. I created one function (that uses another) that does the job, but it is too slow.
The callable function is:
non_overlapping_sample <- function (tbla, date_field, id_field, window_days) {
base_evaluar = data.table(tbla)
base_evaluar[,(date_field):= ymd(base_evaluar[[date_field]]) ]
setkeyv(base_evaluar, date_field)
setkeyv(base_evaluar, id_field)
id_primero = sample(1:nrow(tbla), 1)
base_muestra = data.frame(base_evaluar[id_primero,])
base_evaluar = remove_rows(base_evaluar, id_primero, date_field, id_field, window_days)
while (nrow(base_evaluar) > 0) {
id_a_sacar = sample(1:nrow(base_evaluar), 1)
base_muestra = rbind(base_muestra,data.frame(base_evaluar[id_a_sacar,]))
base_evaluar = remove_rows(base_evaluar, id_a_sacar, date_field, id_field, window_days)
}
base_muestra = base_muestra[order(base_muestra[,id_field],base_muestra[,date_field]),]
return(base_muestra)
}
Ant the internal function is:
remove_rows <- function(tabla, indice_fila, date_field, id_field, window_days) {
fecha = tabla[indice_fila, get(date_field)]
element = tabla[indice_fila, get(id_field)]
lim_sup=fecha + window_days
lim_inf=fecha - window_days
queda = tabla[ tabla[[id_field]] != element | tabla[[date_field]] > lim_sup | tabla[[date_field]] < lim_inf]
return(queda)
}
An example to use it is:
set.seed(1)
library(lubridate)
sem = sample(seq.Date(ymd(20150101),ymd(20180101),1), 3000, replace = T)
base = data.frame(fc_fin_semana = sem, cd_cliente=round(runif(3000)*10,0))
base=base[!duplicated(base),]
non_overlapping_sample(base, date_field='fc_fin_semana', 'cd_cliente', 182)
Any ideas to make it work faster?
Thanks!
EDITION:
An example of what would be wrong and right:
rbind is slow in loops. Try something like this:
non_overlapping_sample2 <- function(tbla, date_field, id_field, window_days) {
dt <- data.table(tbla)
dt[, (date_field) := ymd(dt[[date_field]])]
setkeyv(dt, c(id_field, date_field))
# create vectors for while loop:
rowIDS <- 1:nrow(dt)
selected_rows <- NULL
use <- rep(T, nrow(dt))
dates <- dt[[date_field]]
ids <- dt[[id_field]]
rowIDS2 <- rowIDS
while (length(rowIDS2) > 0) {
sid <- sample.int(length(rowIDS2), 1) # as rowIDS2 can be length 1 vector, use this approach
row_selected <- rowIDS2[sid] # selected row
selected_rows <- c(selected_rows, row_selected)
sel_date <- dates[row_selected] # selected date
sel_ID <- ids[row_selected] # selected ID
date_max <- sel_date + window_days
date_min <- sel_date - window_days
use[ids == sel_ID & dates <= date_max & dates >= date_min] <- FALSE
rowIDS2 <- rowIDS[use == TRUE] # subset for next sample
}
result <- dt[selected_rows, ] # dt subset
setorderv(result, c(id_field, date_field))
return(result)
}
In loop we do not need to do data.table\data.frame subsets, operate only on vectors.
Sub-setting can be done in the end.

Problems with loop/repeat in R

I need to execute this code many times in order to get 45 different matrices at the end: mat[j], j=1:45.
Not sure how to use "for-loop" to achieve that, will be grateful for any tips.
Data files are stored here, year-by-year https://intl-atlas-downloads.s3.amazonaws.com/index.html
library(readstata13)
library(diverse)
library(plyr)
for (j in 1:45) {
dat <- read.dta13(file.choose())
data = aggregate(dat$export_value, by = list(dat$exporter,dat$commoditycode), FUN = sum)
colnames(data) = c("land","product","value")
dt = split(data, f = data$product)
land = as.data.frame(sort(unique(data[, 1])))
nds = seq(1, nrow(land), by = 1)
texmat = cbind(nds, land)
colnames(texmat) = c("num", "land")
for (i in 1:length(unique(data[, 2]))) {
(join(texmat, dt[[i]], by = "land", type = "left")$value)
}
mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
colnames(mt) = unique(data[, 2])
rownames(mt) = sort(unique(data[, 1]))
mt[is.na(mt)] = 0
rcamat=values(mt, category_row = FALSE, norm = "rca",filter = 1, binary = TRUE)
rcamat[is.na(rcamat)] = 0
tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
mat = t(tmat)
}
It looks like you're almost there with the for loop. You just need to add 2 concepts:
1) Creating a list of matrices to read at the start. A construction like:
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
that creates a vector of the files you want to read will allow you to replace file.choose with something like the following (inside the loop):
dat <- read.dta13(paste0('/path/to/directory/with/files/',filenames[i]))
This way you can grab a new file with each loop iteration.
2) Storing the output matrices at the end of the loop. You can do this either by putting them all in a list, or by using assign to create a collection of objects. I prefer the list approach:
#before the for loop initialize a NULL list:
mats <- NULL
#at the end of the loop, (after mat = t(tmat) but before the close bracket) add this line to add it to the list
mats[[i]] <- mat
This will create a list mats with mats[[1]] holding the first matrix, mats[[2]] holding the second, and so on.
You could alternatively create a bunch of objects like so:
#at the end of the for loop add
assign(paste0('mat_',i),mat)
Which will create mat_1, mat_2, and so on as separate objects. A full implementation would look something like this:
library(readstata13)
library(diverse)
library(plyr)
setwd('/path/to/files/')
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
#you'll have to prune this to the files you actually want, as this list is more than 45
finished_matrices <- NULL
for (j in 1:45) {
dat <- read.dta13(filenames[i]) #pickup
data = aggregate(dat$export_value, by = list(dat$exporter,dat$commoditycode), FUN = sum)
colnames(data) = c("land","product","value")
dt = split(data, f = data$product)
land = as.data.frame(sort(unique(data[, 1])))
nds = seq(1, nrow(land), by = 1)
texmat = cbind(nds, land)
colnames(texmat) = c("num", "land")
for (i in 1:length(unique(data[, 2]))) {
(join(texmat, dt[[i]], by = "land", type = "left")$value)
}
mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
colnames(mt) = unique(data[, 2])
rownames(mt) = sort(unique(data[, 1]))
mt[is.na(mt)] = 0
rcamat=values(mt, category_row = FALSE, norm = "rca",filter = 1, binary = TRUE)
rcamat[is.na(rcamat)] = 0
tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
mat = t(tmat)
finished_matrices[[i]] <- mat
}

Accessing lists in a list with smbinning.gen() in a loop

Basically I'm trying to automate a scoring modeling workflow, and encountered a problem with inputting the results from smbinning() that are generated by a loop and hence recorded in a list. The result itself is a list, so I have a bunch of lists in a list. Problems arise when I'm trying to add the results (buckets for continuous variables) into a data frame. I just find it impossible to feed the syntax required to dive into the levels of the list. I tried work my way around this by referencing the column numbers and just trying to pass on the respective list names from the loop. Error I'm getting is:
Error in [.data.frame (df, , col_id) : undefined columns selected.
My code is as follows:
colcnt <- ncol(e_mod)
bucket_resultlist <- list()
for (i in 2:colcnt) {
#curvar = paste0('z', i)
curresult = smbinning(df = e_mod, y = "Bankrupt", x = colnames(e_mod)[i], p = 0.05)
bucket_resultlist[[paste0('Bin_Result_', colnames(e_mod)[i])]] = curresult #paste0('binresult', colnames(e)[i]) = curresult
}
e_mod2 = e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod, bucket_resultlist_trunc[[i]] , chrname = i)
}
I've even tried to define a customer version of the smbinning.gen() function to allow for this, as in the standard form it just tries to concatenate $ivtable to the list reference, but I need to be able to skip one level from this generated list and then run the smbinning.gen() for each respective list in that list. Here's the custom code and the original definitions commented out:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
df[, ncol][is.na(df[, col_id])] = 0 # Missing
df[, ncol][df[, col_id] <= b[2]] = 1 # First valid
# Loop goes from 2 to length(b)-2 if more than 1 cutpoint
if (length(b) > 3) {
for (i in 2:(length(b) - 2)) {
df[, ncol][df[, col_id] > b[i] & df[, col_id] <= b[i + 1]] = i
}
}
df[, ncol][df[, col_id] > b[length(b) - 1]] = length(b) - 1 # Last
df[, ncol] = as.factor(df[, ncol]) # Convert to factor for modeling
blab = c(paste("01 <=", b[2]))
if (length(b) > 3) {
for (i in 3:(length(b) - 1)) {
blab = c(blab, paste(sprintf("%02d", i - 1), "<=", b[i]))
}
} else { i = 2 }
blab = c(blab, paste(sprintf("%02d", i), ">", b[length(b) - 1]))
# Are there ANY missing values
# any(is.na(df[,col_id]))
if (any(is.na(df[, col_id]))) {
blab = c("00 Miss", blab)
}
df[, ncol] = factor(df[, ncol], labels = blab)
names(df)[names(df) == "tmpname"] = chrname
return(df)
}
All help is much appreciated!
Here's the list structure
http://i.stack.imgur.com/iYau2.png
This is also posted in the Data Science section, but this only had 5 views during the whole of today
Thanks Stackoverflow for being my yellow rubber duck. The fix was to change the way of passing in the arguments:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = ivout[[6]] # paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = ivout[[4]] # paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
And to refer the new df e_mod2 instead of e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod2, bucket_resultlist_trunc[[i]] , chrname = i)
}

Resources