How to speed up a for loop using lapply? - r

I wrote a lapply-function in order to assign stock-prices around a certain date to specific companies. All my companies, for which I want to assign stock-prices are in the dataset "peers_per_deal_germany".
My first step is to identify based on the date and the company specific, ISIN, whether there is such a company in my Stock_Prices dataset. If "yes", I defined a certain timeframe for which I want to get the stock data. Then I further defined some restrictions such as not more than 40 NA's in the vector, as this would disturb my results.
The code works perfectly fine. However, for ~15'000 companies the code took around 1 hour to process. My full dataset contains around 1.8 Mio. companies for which I would need the stock prices.
Is there any way I can speed this lapply-function up? Thank you so much for your help.
get_return_vector_germany <- function(idx, peer_company, SIC, ISIN,
deal, announcement, peer_country) {
peer <- peer_company[idx]
SIC <- SIC[idx]
Deal_Nr <- deal[idx]
company_ticker <- ISIN[idx]
announcement_date <- announcement[idx]
peer_country <- peer_country[idx]
row <- c()
vector_stock_prices <- c()
vector_stock_return <- c()
vector_stock_prices_event <- c()
vector_stock_return_event <- c()
if (length(which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)) ==
0) {
row <- NA
} else {
row <- which(Stock_Prices_DE$datadate == announcement_date &
Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)
}
if (sum(is.na(row) == 1)) {
vector_stock_prices <- rep(NA, times = 179)
} else {
vector_stock_prices <- Stock_Prices_DE[(row - 218):(row - 39),
7]
}
if (sum(is.na(vector_stock_prices)) > 40) {
vector_stock_return <- list(rep(NA, times = 179))
} else {
vector_stock_return <- list(diff(vector_stock_prices)/
vector_stock_prices[-length(vector_stock_prices)])
}
if (sum(is.na(row) == 1)) {
vector_stock_prices_event <- rep(NA, times = 22)
} else {
vector_stock_prices_event <- Stock_Prices_DE[(row - 11):(row +
10), 7]
}
if (sum(is.na(vector_stock_prices_event)) > 0) {
vector_stock_return_event <- list(rep(NA, times = 21))
} else {
vector_stock_return_event <- list(diff(vector_stock_prices_event)/
vector_stock_prices_event[-length(vector_stock_prices_event)])
}
vector <- data.frame(cbind(peer, Deal_Nr, SIC, peer_country, vector_stock_return,
vector_stock_return_event))
return(vector)
}
results_vector_germany <- lapply(1:nrow(peers_per_deal_germany), get_return_vector_germany, peers_per_deal_germany$peer_company, peers_per_deal_germany$current_SIC, peers_per_deal_germany$ISIN_code, peers_per_deal_germany$deal_nr, peers_per_deal_germany$current_announcement, peers_per_deal_germany$peer_country)

try to do the task in parallel using mclapply.

Related

Filtering values in a list using R

I am working on a project in which I need to filter my list if it has certain values for each of my IDs but unfortunately, it isn't working.
So I have one list with 12 different matrixes with the same columns but diff
library(tidyverse)
trajectory_C <- list()
trajectory_D <- list()
file_list_C <- list.files(pattern=".trajectory_C.csv")
file_list_D <- list.files(pattern=".trajectory_D.csv")
for (i in 1:length(file_list_C)) {
trajectory_C[[i]] <- read.csv(file_list_C[i])
}
for (i in 1:length(file_list_D)) {
trajectory_D[[i]] <- read.csv(file_list_D[i])
}
So my two lists are trajectory_D and trajectory_C. I then created two other lists and I saved the unique values of a certain column called "ID" and added a validation column to it.
unique_ID_C <- list()
unique_ID_D <- list()
for (i in 1:12) {
unique_ID_C[[i]] <- unique(trajectory_C[[i]]["ID"])
unique_ID_D[[i]] <- unique(trajectory_D[[i]]["ID"])
}
for (i in 1:12) {
Turning <- matrix(data=0,nrow = length(unique_ID_C[[i]]), ncol = 1)
unique_ID_C[[i]] <- cbind(unique_ID_C[[i]],Turning)
names(unique_ID_C[[i]]) <- c("ID","Validation")
}
What I want to do right now is understand if each of my unique values has certain elements (28 and 29) in the variable "Segment". For all the twelve different levels of my list.
for (i in 1:12) {
for (ID in unique_ID_C[[1]]) {
c <- unique(trajectory_C[[i]][trajectory_C[[i]]["ID"] == ID,"Segment"])
unique_ID_C[[i]][unique_ID_C[[i]]["ID"] == ID,2] <- ifelse(any(28 == c) == TRUE & any(29 == c) == TRUE,1,0)
}
}
I am new in programming and this is the first time I am using Lists so this might be my problem.

R generates NA_real vector in while loop, but not when code line is run separately, how to fix the loop?

I'm trying to "pseudo-randomize" a vector in R using a while loop.
I have a vector delays with the elements that need to be randomized.
I am using sample on a vector values to index randomly into delays. I cannot have more than two same values in a row, so I am trying to use an if else statement. If the condition are met, the value should be added to random, and removed from delays.
When I run the individual lines outside the loop they are all working, but when I try to run the loop, one of the vector is populated as NA_real, and that stops the logical operators from working.
I'm probably not great at explaining this, but can anyone spot what I'm doing wrong? :)
delay_0 <- rep(0, 12)
delay_6 <- rep(6, 12)
delays <- c(delay_6, delay_0)
value <- c(1:24)
count <- 0
outcasts <- c()
random <- c(1,2)
while (length(random) < 27) {
count <- count + 1
b <- sample(value, 1, replace = FALSE)
a <- delays[b]
if(a == tail(random,1) & a == head(tail(random,2),1) {
outcast <- outcasts + 1
}
else {
value <- value[-(b)]
delays <- delays[-(b)]
random <- c(random,a)
}
}
Two problems with your code:
b can take a value that is greater than the number of elements in delays. I fixed this by using sample(1:length(delays), 1, replace = FALSE)
The loop continues when delays is empty. You could either change length(random) < 27 to length(random) < 26 I think or add length(delays) > 0.
The code:
delay_0 <- rep(0, 12)
delay_6 <- rep(6, 12)
delays <- c(delay_6, delay_0)
value <- c(1:24)
count <- 0
outcasts <- c()
random <- c(1, 2)
while (length(random) < 27 & length(delays) > 0) {
count <- count + 1
b <- sample(1:length(delays), 1, replace = FALSE)
a <- delays[b]
if (a == tail(random, 1) & a == head((tail(random, 2)), 1))
{
outcast <- outcasts + 1
}
else {
value <- value[-(b)]
delays <- delays[-(b)]
random <- c(random, a)
}
}

R non overlapping sample - faster function

I have a table with more than one contract per client. I want to take a sample but not allowing more than one contract per client within 6 months. I created one function (that uses another) that does the job, but it is too slow.
The callable function is:
non_overlapping_sample <- function (tbla, date_field, id_field, window_days) {
base_evaluar = data.table(tbla)
base_evaluar[,(date_field):= ymd(base_evaluar[[date_field]]) ]
setkeyv(base_evaluar, date_field)
setkeyv(base_evaluar, id_field)
id_primero = sample(1:nrow(tbla), 1)
base_muestra = data.frame(base_evaluar[id_primero,])
base_evaluar = remove_rows(base_evaluar, id_primero, date_field, id_field, window_days)
while (nrow(base_evaluar) > 0) {
id_a_sacar = sample(1:nrow(base_evaluar), 1)
base_muestra = rbind(base_muestra,data.frame(base_evaluar[id_a_sacar,]))
base_evaluar = remove_rows(base_evaluar, id_a_sacar, date_field, id_field, window_days)
}
base_muestra = base_muestra[order(base_muestra[,id_field],base_muestra[,date_field]),]
return(base_muestra)
}
Ant the internal function is:
remove_rows <- function(tabla, indice_fila, date_field, id_field, window_days) {
fecha = tabla[indice_fila, get(date_field)]
element = tabla[indice_fila, get(id_field)]
lim_sup=fecha + window_days
lim_inf=fecha - window_days
queda = tabla[ tabla[[id_field]] != element | tabla[[date_field]] > lim_sup | tabla[[date_field]] < lim_inf]
return(queda)
}
An example to use it is:
set.seed(1)
library(lubridate)
sem = sample(seq.Date(ymd(20150101),ymd(20180101),1), 3000, replace = T)
base = data.frame(fc_fin_semana = sem, cd_cliente=round(runif(3000)*10,0))
base=base[!duplicated(base),]
non_overlapping_sample(base, date_field='fc_fin_semana', 'cd_cliente', 182)
Any ideas to make it work faster?
Thanks!
EDITION:
An example of what would be wrong and right:
rbind is slow in loops. Try something like this:
non_overlapping_sample2 <- function(tbla, date_field, id_field, window_days) {
dt <- data.table(tbla)
dt[, (date_field) := ymd(dt[[date_field]])]
setkeyv(dt, c(id_field, date_field))
# create vectors for while loop:
rowIDS <- 1:nrow(dt)
selected_rows <- NULL
use <- rep(T, nrow(dt))
dates <- dt[[date_field]]
ids <- dt[[id_field]]
rowIDS2 <- rowIDS
while (length(rowIDS2) > 0) {
sid <- sample.int(length(rowIDS2), 1) # as rowIDS2 can be length 1 vector, use this approach
row_selected <- rowIDS2[sid] # selected row
selected_rows <- c(selected_rows, row_selected)
sel_date <- dates[row_selected] # selected date
sel_ID <- ids[row_selected] # selected ID
date_max <- sel_date + window_days
date_min <- sel_date - window_days
use[ids == sel_ID & dates <= date_max & dates >= date_min] <- FALSE
rowIDS2 <- rowIDS[use == TRUE] # subset for next sample
}
result <- dt[selected_rows, ] # dt subset
setorderv(result, c(id_field, date_field))
return(result)
}
In loop we do not need to do data.table\data.frame subsets, operate only on vectors.
Sub-setting can be done in the end.

Compare list of complex structures

I have two lists of complex structures (each list is a multiPhylo object containing phylogenetic trees), and I would like to find out how many times each element of the first one appears in the second one. Pretty straightforward, but for some reasons my code returns wrong results.
library(devtools)
install_github('santiagosnchez/rBt')
library(rBt)
beast_output <- read.annot.beast('strict_BD_starBEAST_logcomb.species.trees')
beast_output_rooted <- root(beast_output, c('taxon_A', 'taxon_B')) # length == 20,000
unique_multiphylo <- unique.multiPhylo(beast_output_rooted) # length == 130
count <- function(item, list) {
total = 0
for (i in 1:length(list)) {
if (all.equal.phylo(item, list[[i]])) {
total = total + 1
}
}
return(total)
}
result <- data.frame(un_tr = rep(0, 130), count = rep(0, 130))
for (i in 1:length(unique_multiphylo)) {
result[i, ] <- c(i, count(unique_multiphylo[[i]], beast_output_rooted))
}
The function all.equal.phylo() takes two phylo objects and returns TRUE if they are the same. See docs. The function count() takes an item and a list and returns the number of times the item appears in the list using all.equal.phylo().
The issue is that the function count() returns 0 most of the time. This should not be possible as the list unique_multiphylo is a sublist of beast_output_rooted, which means that count() should at least return 1.
What is wrong with my code? And how can I correct it? Many thanks for the help!
EDIT: here is a reproducible example:
install.packages('ape')
library(ape)
set.seed(42)
trees <- lapply(rep(c(10, 25, 50, 100), 3), rtree) # length == 12
class(trees) <- 'multiPhylo'
unique_multiphylo <- unique.multiPhylo(trees) # length == 12
count <- function(item, list) {
total = 0
for (i in 1:length(list)) {
if (all.equal.phylo(item, list[[i]])) {
total = total + 1
}
}
return(total)
}
result <- data.frame(un_tr = rep(0, 12), count = rep(0, 12))
for (i in 1:length(unique_multiphylo)) {
result[i, ] <- c(i, count(unique_multiphylo[[i]], trees))
}
However, it seems to be working perfectly fine with these simulated data...
I finally managed to get proper results. In the function all.equal.phylo(), I needed to set the parameter use.edge.length to FALSE so that only the topologies of the phylogenetic trees are compared.
Here is my code:
(I changed the names of a couple of variables to make it clearer what I was trying to do)
install.packages('devtools')
library(devtools)
install_github('santiagosnchez/rBt')
library(rBt)
beast_output <- read.annot.beast('beast_output.trees')
beast_output_rooted <- root.multiPhylo(beast_output, c('taxon_A', 'taxon_B'))
unique_topologies <- unique.multiPhylo(beast_output_rooted)
count <- function(item, list) {
total = 0
for (i in 1:length(list)) {
if (all.equal.phylo(item, list[[i]], use.edge.length = FALSE)) {
total = total + 1
}
}
return(total)
}
result <- data.frame(unique_topology = rep(0, length(unique_topologies)),
count = rep(0, length(unique_topologies)))
for (i in 1:length(unique_topologies)) {
result[i, ] <- c(i, count(unique_topologies[[i]], beast_output_rooted))
}
result$percentage <- ((result$count/length(beast_output_rooted))*100)
There is a shorter solution to your problem:
table( attr(unique_multiphylo, "old.index") )
as unique_multiphylo contains an attribute with the information you are after (see ?unique.multiPhylo).

Finding maximum data with corresponding value from other column in R

I have daily data (4011 days) together with indicator (1-weekdays, 2-weekend). I want to find weekly maxima with the corresponding indicator. For example (let say) the data is:
mydat <- matrix(c(0.027,0.034,0.019,0.021,0.026,0.024,0.058,0.026,0.064,
0.066,0.026,0.101,0.069,0.054,rep(2,2),rep(1,5),rep(2,2),rep(1,5)), ncol=2)
I have try with the following code. I manage to get the maximum sequences (in this case, weekly maxima) but I dont want maximum sequences in indicator. Here is the code
week.max <- function(vec){
if(length(vec[is.na(vec)]) == 7){
return(NA)
}
else{
return(max(vec, na.rm = T))
}
}
max.week.dat <- apply(mydat, 2, function(x) tapply(x, rep(1:(length(x)/7),
each=7, len=length(x)), week.max))
and the result
matrix(c(0.058,0.101,2,2),ncol=2)
I want the output like this:
matrix(c(0.058,0.101,1,1),ncol=2)
Many thanks in advance.
Here is the data (with an extra day in the third week)
mydat <- data.frame(value = c(0.027,0.034,0.019,0.021,0.026,0.024,0.058,0.026,0.064,
0.066,0.026,0.101,0.069,0.054,0.95), ind = c(rep(2,2),rep(1,5),rep(2,2),rep(1,5),2))
Your function
week.max <- function(vec){
if(length(vec[is.na(vec)]) == 7){
return(NA)
}
else{
return(max(vec, na.rm = T))
}
}
Add the week information
mydat$week <- c(rep(1:2,each=7),3)
Use the same solution as for here
library(plyr)
ddply(mydat, .(week), subset, subset = value==week.max(value), select = -week)

Resources