Loop through columns multiple times, write each result to data.frame - r

I am trying to loop through various columns in a data frame then loop through the dataframe again after subsetting it by date (with multiple start and end dates). I would like the results to be appended to a new dataframe each time through, but the results are being overwritten each time through the loop. The final dataframe (finaldf) only has the results of the last pass through the loop. What is the best way to get the finaldf to contain all of the results from both loops (i.e. outlooks and columns)?
tempdf <- read_csv(path)
datalist <- list()
outlooks <- list("whole", "short", ...)
for(o in outlooks){
if(o == "whole" | o == "short"){ startDate = as.Date(tempdf$Date[[1]])
} else if( o == "mid"){ startDate = as.Date((tempdf$Date[[1]] + 30))
}
if(o == "whole" | o == "long"){ endDate = as.Date(tempdf$Date[[len]])
} else if( o == "mid"){ endDate = as.Date((tempdf$Date[[1]] + 60))
}
tempdf[tempdf$Date >= startDate & tempdf$Date <= endDate,]
colList <- names(tempdf)
target <- tempdf[[2]]
f <- substr(colList[[2]],1,3)
for(j in colList[-c(1,2)]){
fitted <- as.numeric(tempdf[[j]])
datalist[[j]] <- data.frame(
RMSE = RMSE(fitted, target),
R2 = R2(fitted, target),
NSE = NSE(fitted, target),
pbias = pbias(fitted, target)
)
}
}
finaldf <- do.call(rbind, datalist)

Related

How to speed up a for loop using lapply?

I wrote a lapply-function in order to assign stock-prices around a certain date to specific companies. All my companies, for which I want to assign stock-prices are in the dataset "peers_per_deal_germany".
My first step is to identify based on the date and the company specific, ISIN, whether there is such a company in my Stock_Prices dataset. If "yes", I defined a certain timeframe for which I want to get the stock data. Then I further defined some restrictions such as not more than 40 NA's in the vector, as this would disturb my results.
The code works perfectly fine. However, for ~15'000 companies the code took around 1 hour to process. My full dataset contains around 1.8 Mio. companies for which I would need the stock prices.
Is there any way I can speed this lapply-function up? Thank you so much for your help.
get_return_vector_germany <- function(idx, peer_company, SIC, ISIN,
deal, announcement, peer_country) {
peer <- peer_company[idx]
SIC <- SIC[idx]
Deal_Nr <- deal[idx]
company_ticker <- ISIN[idx]
announcement_date <- announcement[idx]
peer_country <- peer_country[idx]
row <- c()
vector_stock_prices <- c()
vector_stock_return <- c()
vector_stock_prices_event <- c()
vector_stock_return_event <- c()
if (length(which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)) ==
0) {
row <- NA
} else {
row <- which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)
}
if (sum(is.na(row) == 1)) {
vector_stock_prices <- rep(NA, times = 179)
} else {
vector_stock_prices <- Stock_Prices_DE[(row - 218):(row - 39),
7]
}
if (sum(is.na(vector_stock_prices)) > 40) {
vector_stock_return <- list(rep(NA, times = 179))
} else {
vector_stock_return <- list(diff(vector_stock_prices)/
vector_stock_prices[-length(vector_stock_prices)])
}
if (sum(is.na(row) == 1)) {
vector_stock_prices_event <- rep(NA, times = 22)
} else {
vector_stock_prices_event <- Stock_Prices_DE[(row - 11):(row +
10), 7]
}
if (sum(is.na(vector_stock_prices_event)) > 0) {
vector_stock_return_event <- list(rep(NA, times = 21))
} else {
vector_stock_return_event <- list(diff(vector_stock_prices_event)/
vector_stock_prices_event[-length(vector_stock_prices_event)])
}
vector <- data.frame(cbind(peer, Deal_Nr, SIC, peer_country, vector_stock_return,
vector_stock_return_event))
return(vector)
}
results_vector_germany <- lapply(1:nrow(peers_per_deal_germany), get_return_vector_germany, peers_per_deal_germany$peer_company, peers_per_deal_germany$current_SIC, peers_per_deal_germany$ISIN_code, peers_per_deal_germany$deal_nr, peers_per_deal_germany$current_announcement, peers_per_deal_germany$peer_country)
try to do the task in parallel using mclapply.

Save datasets with different names in for loop in R

I am trying to implement the following:
dataset_id_1 = subset(data, id == 1)
dataset_id_2 = subset(data, id == 2)
dataset_id_3 = subset(data, id == 3)
However, I need to do this for more than 100 IDs. I encounter a problem in generating the name of the dataset on the left. I tried the following:
for (i in 1:120) {
dataset_id_[[i]] = subset(data, id == i)
}
Do you know how to save the name of the dataset according to the specified id?
Thank you so much
Try split + list2env like below
lst <- split(volping, volping$id)
list2env(setNames(lst,paste0("dataset_id_",names(lst))), .GlobalEnv)
Try this:
#List
List <- list()
#Loop
for (i in 1:120) {
List[[i]] = subset(data, id == i)
}
#Names
names(List) <- paste0('dataset_id_',1:length(List))
#Set to envir
list2env(List,envir = .GlobalEnv)
Try this
for (i in 1:120) {
assign(paste("dataset_id_", i), subset(data, id == i) )
}

create unique record data frame where multiple records of each applicant present

"I have one data frame in which bank related information of each applicant id is present. suppose applicants has multiple account and data frame reflects this information in multiple rows. Now I want to create a data frame in which each applicant all information is in one record"
I have tried it with for and if loop. Now I want to optimised code
com_data <- function(X) {
data_set <- data.frame(table(X$id))
a <- 3
n <- 3
for (i in 1:nrow(data_set)) {
for (j in 1:nrow(X[1:4])) {
if (data_set$Var1[i] == X$id[j]) {
count <- count + 1
#k <- j
}
if (count == 1) {
for (k in 3:ncol(X))
data_set[i, n] <- X[j, k]
n <- n + 1
} else{
for (k in 3:ncol(X))
data_set[i, n] <- X[j, k]
n <- n + 1
}
}
count = 0
n <- 3
}
return(data_set)
}
Gets a little messy assumes your dataframe isn't comprised of list vectors. "Var" should be applicant id:
# Sample data used:
df <- data.frame(
Date = as.Date(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"), "%d/%m/%y"),
dateTime = as.POSIXct(c("27/9/2019", "28/9/2019", "1/10/2019", "2/10/2019"), "%d/%m/%y %H:M:S"),
Var = as.factor(c("A", "A", "B", "B")),
Value = c(56, 50, 90, 100),
stringsAsFactors = F
)
# Convert factors & dates to strings:
convert_descriptors_to_char <- function(df){
as.data.frame(lapply(df,
function(x){
if(is.factor(x) | inherits(x, "Date") | inherits(x, "POSIXct") | inherits(x, "POSIXlt")) {
as.character(trimws(x, which = "both"))
} else{
x
}
}
),
stringsAsFactors = FALSE)
}
# Convert data types:
df <- convert_descriptors_to_char(df)
# Merge the separate lists into one:
df_aggd <- lapply(df, function(x){
if(is.character(x)){
aggregate(x~df$Var, df, paste0, collapse = ", ")
}else if(is.numeric(x)){
aggregate(x~df$Var, df, sum)
}else{
x
}
}
)
# Vector to rename "x" to:
x_vect_names <- names(sapply(df_aggd, function(x){deparse(substitute(x))}))
# Iterate through list to rename:
for (i in seq_along(df_aggd)){
colnames(df_aggd[[i]]) <- c("Var", x_vect_names[i])
}
# Remove Var df:
df_aggd <- df_aggd[names(df_aggd) != "Var"]
# Merge the separate dataframes into one:
Reduce(function(x, y){merge(x, y, all = TRUE, by = intersect(colnames(x), colnames(y)))}, df_aggd)

R non overlapping sample - faster function

I have a table with more than one contract per client. I want to take a sample but not allowing more than one contract per client within 6 months. I created one function (that uses another) that does the job, but it is too slow.
The callable function is:
non_overlapping_sample <- function (tbla, date_field, id_field, window_days) {
base_evaluar = data.table(tbla)
base_evaluar[,(date_field):= ymd(base_evaluar[[date_field]]) ]
setkeyv(base_evaluar, date_field)
setkeyv(base_evaluar, id_field)
id_primero = sample(1:nrow(tbla), 1)
base_muestra = data.frame(base_evaluar[id_primero,])
base_evaluar = remove_rows(base_evaluar, id_primero, date_field, id_field, window_days)
while (nrow(base_evaluar) > 0) {
id_a_sacar = sample(1:nrow(base_evaluar), 1)
base_muestra = rbind(base_muestra,data.frame(base_evaluar[id_a_sacar,]))
base_evaluar = remove_rows(base_evaluar, id_a_sacar, date_field, id_field, window_days)
}
base_muestra = base_muestra[order(base_muestra[,id_field],base_muestra[,date_field]),]
return(base_muestra)
}
Ant the internal function is:
remove_rows <- function(tabla, indice_fila, date_field, id_field, window_days) {
fecha = tabla[indice_fila, get(date_field)]
element = tabla[indice_fila, get(id_field)]
lim_sup=fecha + window_days
lim_inf=fecha - window_days
queda = tabla[ tabla[[id_field]] != element | tabla[[date_field]] > lim_sup | tabla[[date_field]] < lim_inf]
return(queda)
}
An example to use it is:
set.seed(1)
library(lubridate)
sem = sample(seq.Date(ymd(20150101),ymd(20180101),1), 3000, replace = T)
base = data.frame(fc_fin_semana = sem, cd_cliente=round(runif(3000)*10,0))
base=base[!duplicated(base),]
non_overlapping_sample(base, date_field='fc_fin_semana', 'cd_cliente', 182)
Any ideas to make it work faster?
Thanks!
EDITION:
An example of what would be wrong and right:
rbind is slow in loops. Try something like this:
non_overlapping_sample2 <- function(tbla, date_field, id_field, window_days) {
dt <- data.table(tbla)
dt[, (date_field) := ymd(dt[[date_field]])]
setkeyv(dt, c(id_field, date_field))
# create vectors for while loop:
rowIDS <- 1:nrow(dt)
selected_rows <- NULL
use <- rep(T, nrow(dt))
dates <- dt[[date_field]]
ids <- dt[[id_field]]
rowIDS2 <- rowIDS
while (length(rowIDS2) > 0) {
sid <- sample.int(length(rowIDS2), 1) # as rowIDS2 can be length 1 vector, use this approach
row_selected <- rowIDS2[sid] # selected row
selected_rows <- c(selected_rows, row_selected)
sel_date <- dates[row_selected] # selected date
sel_ID <- ids[row_selected] # selected ID
date_max <- sel_date + window_days
date_min <- sel_date - window_days
use[ids == sel_ID & dates <= date_max & dates >= date_min] <- FALSE
rowIDS2 <- rowIDS[use == TRUE] # subset for next sample
}
result <- dt[selected_rows, ] # dt subset
setorderv(result, c(id_field, date_field))
return(result)
}
In loop we do not need to do data.table\data.frame subsets, operate only on vectors.
Sub-setting can be done in the end.

Arguments imply differing number of rows for an iteration loop

code problem
Save the result from an iteration loop into a whole dataframe problem
library(rscopus)
library(dplyr)
auth_token_header("d2f02ad55dcfc907212f0e6b216bf847")
akey="d2f02ad55dcfc907212f0e6b216bf847"
set_api_key(akey)
df = data.frame(doi = c("10.1109/TPAMI.2018.2798607", "10.1109/CNS.2017.8228696"))
df_references <- NULL
for (i in 1:nrow(df)) {
x = abstract_retrieval(df$doi[i], identifier= "doi")
for (a in 1:length(x$content$`abstracts-retrieval-response`$`item`$bibrecord$tail$`bibliography`$reference)){
call_str <- paste("ref <- x$content$`abstracts-retrieval-response`$`item`$bibrecord$tail$`bibliography`$reference[[",a,"]]$`ref-info`$`ref-title`")
eval(parse(text = call_str))
df_references <- rbind(df_references, data.frame(initial_paper = df$doi[i],
ref_title = ref))
}
}
I expect the output to be saved results of every iteration into a dataframe

Resources