R non overlapping sample - faster function - r

I have a table with more than one contract per client. I want to take a sample but not allowing more than one contract per client within 6 months. I created one function (that uses another) that does the job, but it is too slow.
The callable function is:
non_overlapping_sample <- function (tbla, date_field, id_field, window_days) {
base_evaluar = data.table(tbla)
base_evaluar[,(date_field):= ymd(base_evaluar[[date_field]]) ]
setkeyv(base_evaluar, date_field)
setkeyv(base_evaluar, id_field)
id_primero = sample(1:nrow(tbla), 1)
base_muestra = data.frame(base_evaluar[id_primero,])
base_evaluar = remove_rows(base_evaluar, id_primero, date_field, id_field, window_days)
while (nrow(base_evaluar) > 0) {
id_a_sacar = sample(1:nrow(base_evaluar), 1)
base_muestra = rbind(base_muestra,data.frame(base_evaluar[id_a_sacar,]))
base_evaluar = remove_rows(base_evaluar, id_a_sacar, date_field, id_field, window_days)
}
base_muestra = base_muestra[order(base_muestra[,id_field],base_muestra[,date_field]),]
return(base_muestra)
}
Ant the internal function is:
remove_rows <- function(tabla, indice_fila, date_field, id_field, window_days) {
fecha = tabla[indice_fila, get(date_field)]
element = tabla[indice_fila, get(id_field)]
lim_sup=fecha + window_days
lim_inf=fecha - window_days
queda = tabla[ tabla[[id_field]] != element | tabla[[date_field]] > lim_sup | tabla[[date_field]] < lim_inf]
return(queda)
}
An example to use it is:
set.seed(1)
library(lubridate)
sem = sample(seq.Date(ymd(20150101),ymd(20180101),1), 3000, replace = T)
base = data.frame(fc_fin_semana = sem, cd_cliente=round(runif(3000)*10,0))
base=base[!duplicated(base),]
non_overlapping_sample(base, date_field='fc_fin_semana', 'cd_cliente', 182)
Any ideas to make it work faster?
Thanks!
EDITION:
An example of what would be wrong and right:

rbind is slow in loops. Try something like this:
non_overlapping_sample2 <- function(tbla, date_field, id_field, window_days) {
dt <- data.table(tbla)
dt[, (date_field) := ymd(dt[[date_field]])]
setkeyv(dt, c(id_field, date_field))
# create vectors for while loop:
rowIDS <- 1:nrow(dt)
selected_rows <- NULL
use <- rep(T, nrow(dt))
dates <- dt[[date_field]]
ids <- dt[[id_field]]
rowIDS2 <- rowIDS
while (length(rowIDS2) > 0) {
sid <- sample.int(length(rowIDS2), 1) # as rowIDS2 can be length 1 vector, use this approach
row_selected <- rowIDS2[sid] # selected row
selected_rows <- c(selected_rows, row_selected)
sel_date <- dates[row_selected] # selected date
sel_ID <- ids[row_selected] # selected ID
date_max <- sel_date + window_days
date_min <- sel_date - window_days
use[ids == sel_ID & dates <= date_max & dates >= date_min] <- FALSE
rowIDS2 <- rowIDS[use == TRUE] # subset for next sample
}
result <- dt[selected_rows, ] # dt subset
setorderv(result, c(id_field, date_field))
return(result)
}
In loop we do not need to do data.table\data.frame subsets, operate only on vectors.
Sub-setting can be done in the end.

Related

Loop through columns multiple times, write each result to data.frame

I am trying to loop through various columns in a data frame then loop through the dataframe again after subsetting it by date (with multiple start and end dates). I would like the results to be appended to a new dataframe each time through, but the results are being overwritten each time through the loop. The final dataframe (finaldf) only has the results of the last pass through the loop. What is the best way to get the finaldf to contain all of the results from both loops (i.e. outlooks and columns)?
tempdf <- read_csv(path)
datalist <- list()
outlooks <- list("whole", "short", ...)
for(o in outlooks){
if(o == "whole" | o == "short"){ startDate = as.Date(tempdf$Date[[1]])
} else if( o == "mid"){ startDate = as.Date((tempdf$Date[[1]] + 30))
}
if(o == "whole" | o == "long"){ endDate = as.Date(tempdf$Date[[len]])
} else if( o == "mid"){ endDate = as.Date((tempdf$Date[[1]] + 60))
}
tempdf[tempdf$Date >= startDate & tempdf$Date <= endDate,]
colList <- names(tempdf)
target <- tempdf[[2]]
f <- substr(colList[[2]],1,3)
for(j in colList[-c(1,2)]){
fitted <- as.numeric(tempdf[[j]])
datalist[[j]] <- data.frame(
RMSE = RMSE(fitted, target),
R2 = R2(fitted, target),
NSE = NSE(fitted, target),
pbias = pbias(fitted, target)
)
}
}
finaldf <- do.call(rbind, datalist)

How to speed up a for loop using lapply?

I wrote a lapply-function in order to assign stock-prices around a certain date to specific companies. All my companies, for which I want to assign stock-prices are in the dataset "peers_per_deal_germany".
My first step is to identify based on the date and the company specific, ISIN, whether there is such a company in my Stock_Prices dataset. If "yes", I defined a certain timeframe for which I want to get the stock data. Then I further defined some restrictions such as not more than 40 NA's in the vector, as this would disturb my results.
The code works perfectly fine. However, for ~15'000 companies the code took around 1 hour to process. My full dataset contains around 1.8 Mio. companies for which I would need the stock prices.
Is there any way I can speed this lapply-function up? Thank you so much for your help.
get_return_vector_germany <- function(idx, peer_company, SIC, ISIN,
deal, announcement, peer_country) {
peer <- peer_company[idx]
SIC <- SIC[idx]
Deal_Nr <- deal[idx]
company_ticker <- ISIN[idx]
announcement_date <- announcement[idx]
peer_country <- peer_country[idx]
row <- c()
vector_stock_prices <- c()
vector_stock_return <- c()
vector_stock_prices_event <- c()
vector_stock_return_event <- c()
if (length(which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)) ==
0) {
row <- NA
} else {
row <- which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)
}
if (sum(is.na(row) == 1)) {
vector_stock_prices <- rep(NA, times = 179)
} else {
vector_stock_prices <- Stock_Prices_DE[(row - 218):(row - 39),
7]
}
if (sum(is.na(vector_stock_prices)) > 40) {
vector_stock_return <- list(rep(NA, times = 179))
} else {
vector_stock_return <- list(diff(vector_stock_prices)/
vector_stock_prices[-length(vector_stock_prices)])
}
if (sum(is.na(row) == 1)) {
vector_stock_prices_event <- rep(NA, times = 22)
} else {
vector_stock_prices_event <- Stock_Prices_DE[(row - 11):(row +
10), 7]
}
if (sum(is.na(vector_stock_prices_event)) > 0) {
vector_stock_return_event <- list(rep(NA, times = 21))
} else {
vector_stock_return_event <- list(diff(vector_stock_prices_event)/
vector_stock_prices_event[-length(vector_stock_prices_event)])
}
vector <- data.frame(cbind(peer, Deal_Nr, SIC, peer_country, vector_stock_return,
vector_stock_return_event))
return(vector)
}
results_vector_germany <- lapply(1:nrow(peers_per_deal_germany), get_return_vector_germany, peers_per_deal_germany$peer_company, peers_per_deal_germany$current_SIC, peers_per_deal_germany$ISIN_code, peers_per_deal_germany$deal_nr, peers_per_deal_germany$current_announcement, peers_per_deal_germany$peer_country)
try to do the task in parallel using mclapply.

create unique record data frame where multiple records of each applicant present

"I have one data frame in which bank related information of each applicant id is present. suppose applicants has multiple account and data frame reflects this information in multiple rows. Now I want to create a data frame in which each applicant all information is in one record"
I have tried it with for and if loop. Now I want to optimised code
com_data <- function(X) {
data_set <- data.frame(table(X$id))
a <- 3
n <- 3
for (i in 1:nrow(data_set)) {
for (j in 1:nrow(X[1:4])) {
if (data_set$Var1[i] == X$id[j]) {
count <- count + 1
#k <- j
}
if (count == 1) {
for (k in 3:ncol(X))
data_set[i, n] <- X[j, k]
n <- n + 1
} else{
for (k in 3:ncol(X))
data_set[i, n] <- X[j, k]
n <- n + 1
}
}
count = 0
n <- 3
}
return(data_set)
}
Gets a little messy assumes your dataframe isn't comprised of list vectors. "Var" should be applicant id:
# Sample data used:
df <- data.frame(
Date = as.Date(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"), "%d/%m/%y"),
dateTime = as.POSIXct(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"), "%d/%m/%y %H:M:S"),
Var = as.factor(c("A", "A", "B", "B")),
Value = c(56, 50, 90, 100),
stringsAsFactors = F
)
# Convert factors & dates to strings:
convert_descriptors_to_char <- function(df){
as.data.frame(lapply(df,
function(x){
if(is.factor(x) | inherits(x, "Date") | inherits(x, "POSIXct") | inherits(x, "POSIXlt")) {
as.character(trimws(x, which = "both"))
} else{
x
}
}
),
stringsAsFactors = FALSE)
}
# Convert data types:
df <- convert_descriptors_to_char(df)
# Merge the separate lists into one:
df_aggd <- lapply(df, function(x){
if(is.character(x)){
aggregate(x~df$Var, df, paste0, collapse = ", ")
}else if(is.numeric(x)){
aggregate(x~df$Var, df, sum)
}else{
x
}
}
)
# Vector to rename "x" to:
x_vect_names <- names(sapply(df_aggd, function(x){deparse(substitute(x))}))
# Iterate through list to rename:
for (i in seq_along(df_aggd)){
colnames(df_aggd[[i]]) <- c("Var", x_vect_names[i])
}
# Remove Var df:
df_aggd <- df_aggd[names(df_aggd) != "Var"]
# Merge the separate dataframes into one:
Reduce(function(x, y){merge(x, y, all = TRUE, by = intersect(colnames(x), colnames(y)))}, df_aggd)

The proper way to vectorise an if-else tower in R

I came across the following post: Vectorized IF statement in R?, which deals with a vecotrisation of one if-else construct in R. However, I do not want to build nested $ifelse$ functions in R, is there another way to do this?
As an example I have this:
library(data.table)
precalc_nrcolumns <- function(TERM_DATE, CHANGE_DATE){
if(CHANGE_DATE < TERM_DATE){
if(CHANGE_DATE == 0) {
if(TERM_DATE > 100) {
nrcolumns <- 100
}else{
nrcolumns <- TERM_DATE
}
}else{
nrcolumns <- CHANGE_DATE
}
}else{
nrcolumns <- 100
}
return(nrcolumns)
}
test.data <- data.table(TERM_DATE = sample(1:500, 100, replace=TRUE),
CHANGE_DATE = sample(1:500, 100, replace=TRUE))
test.data[, value := mapply(precalc_nrcolumns, TERM_DATE, CHANGE_DATE)]
I am fully aware of using mapply, which actually works, but I was wondering what other ways there are to deal with this.
A possible approach:
test.data[, val := pmin(CHANGE_DATE, TERM_DATE)][
CHANGE_DATE==0L, val := pmin(100L, TERM_DATE)][
CHANGE_DATE >= TERM_DATE, val := 100L]
data:
set.seed(0L)
library(data.table)
nr <- 300
test.data <- data.table(TERM_DATE = sample(0:150, nr, replace=TRUE),
CHANGE_DATE = sample(0:150, nr, replace=TRUE))

Accessing lists in a list with smbinning.gen() in a loop

Basically I'm trying to automate a scoring modeling workflow, and encountered a problem with inputting the results from smbinning() that are generated by a loop and hence recorded in a list. The result itself is a list, so I have a bunch of lists in a list. Problems arise when I'm trying to add the results (buckets for continuous variables) into a data frame. I just find it impossible to feed the syntax required to dive into the levels of the list. I tried work my way around this by referencing the column numbers and just trying to pass on the respective list names from the loop. Error I'm getting is:
Error in [.data.frame (df, , col_id) : undefined columns selected.
My code is as follows:
colcnt <- ncol(e_mod)
bucket_resultlist <- list()
for (i in 2:colcnt) {
#curvar = paste0('z', i)
curresult = smbinning(df = e_mod, y = "Bankrupt", x = colnames(e_mod)[i], p = 0.05)
bucket_resultlist[[paste0('Bin_Result_', colnames(e_mod)[i])]] = curresult #paste0('binresult', colnames(e)[i]) = curresult
}
e_mod2 = e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod, bucket_resultlist_trunc[[i]] , chrname = i)
}
I've even tried to define a customer version of the smbinning.gen() function to allow for this, as in the standard form it just tries to concatenate $ivtable to the list reference, but I need to be able to skip one level from this generated list and then run the smbinning.gen() for each respective list in that list. Here's the custom code and the original definitions commented out:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
df[, ncol][is.na(df[, col_id])] = 0 # Missing
df[, ncol][df[, col_id] <= b[2]] = 1 # First valid
# Loop goes from 2 to length(b)-2 if more than 1 cutpoint
if (length(b) > 3) {
for (i in 2:(length(b) - 2)) {
df[, ncol][df[, col_id] > b[i] & df[, col_id] <= b[i + 1]] = i
}
}
df[, ncol][df[, col_id] > b[length(b) - 1]] = length(b) - 1 # Last
df[, ncol] = as.factor(df[, ncol]) # Convert to factor for modeling
blab = c(paste("01 <=", b[2]))
if (length(b) > 3) {
for (i in 3:(length(b) - 1)) {
blab = c(blab, paste(sprintf("%02d", i - 1), "<=", b[i]))
}
} else { i = 2 }
blab = c(blab, paste(sprintf("%02d", i), ">", b[length(b) - 1]))
# Are there ANY missing values
# any(is.na(df[,col_id]))
if (any(is.na(df[, col_id]))) {
blab = c("00 Miss", blab)
}
df[, ncol] = factor(df[, ncol], labels = blab)
names(df)[names(df) == "tmpname"] = chrname
return(df)
}
All help is much appreciated!
Here's the list structure
http://i.stack.imgur.com/iYau2.png
This is also posted in the Data Science section, but this only had 5 views during the whole of today
Thanks Stackoverflow for being my yellow rubber duck. The fix was to change the way of passing in the arguments:
smbinning.genCUSTOM = function(df, ivout, chrname = "NewChar") {
df = cbind(df, tmpname = NA)
ncol = ncol(df)
col_id = ivout[[6]] # paste0(ivout, '[[6]]', collapse = NULL) # Original: ivout$col_id
# Updated 20160130
b = ivout[[4]] # paste0(ivout, '[[4]]', collapse = NULL) # Original: ivout$bands
And to refer the new df e_mod2 instead of e_mod
for (i in 1:length(bucket_resultlist_trunc)) {
e_mod2 = smbinning.genCUSTOM(e_mod2, bucket_resultlist_trunc[[i]] , chrname = i)
}

Resources