How to rewrite a loop to run faster in R?

Given a dataset of more than 900,000 rows, of which length(duplicates) is over 300,000, the following loop takes approximately 4 hours to run in R, which is unacceptably slow.
for(i in duplicates) {
couple_table <- filter(data, pnr == i) # filter patients
min_date <- min(couple_table$date) # determine date of first operation
max_date <- max(couple_table$date) # determine date of second operation
data$first[data$pnr == i & data$date == min_date] <- 1 # assign 1 to column first
data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}
How can I tweak this code to run faster? I have had a look at the *apply family, but I am not familiar with it at all. Any ideas?
Dummy data:
data <- data.frame(pnr = c('a43','a4945', 'a43', 'a231', 'a231', 'a6901'),
date = c(as.Date('2011-12-19'), as.Date('2012-09-11'), as.Date('2013-10-01'),
as.Date('2012-05-09'), as.Date('2009-09-10'), as.Date('2015-06-12')))
duplicates <- as.character(data$pnr[duplicated(data$pnr)])

A group-by operation would be much faster. The loop is slow because data$pnr == i rescans the entire column for each of the 300,000 ids, so the work grows with rows × ids, whereas a single grouped pass touches each row only once:
library(dplyr)
data %>%
group_by(pnr) %>%
mutate(Min = if(n() > 1) NA^(date != min(date)) else NA,
Max = if(n() > 1) NA^(date != max(date)) else NA) %>%
ungroup
Output:
# A tibble: 6 x 4
# pnr date Min Max
# <chr> <date> <dbl> <dbl>
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
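The NA^ idiom works because NA^0 is 1 while NA^1 is NA, and logical values coerce to 0/1, so NA^(condition) yields 1 where the condition is FALSE and NA where it is TRUE:
NA^c(FALSE, TRUE)
#> [1]  1 NA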
Similar logic in data.table would be
library(data.table)
setDT(data)[, c('Min', 'Max') := .(
  if (.N > 1) NA^(date != min(date)) else NA,
  if (.N > 1) NA^(date != max(date)) else NA
), by = .(pnr)]
Or use the collapse package for faster execution:
library(collapse)
data %>%
ftransform(n = fNobs(date, pnr, TRA = 'replace_fill')) %>%
ftransform(Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1),
Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1), n = NULL )
# pnr date Min Max
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Or use base R with duplicated():
i1 <- with(data, duplicated(pnr)|duplicated(pnr, fromLast = TRUE))
data$Min <- with(data, i1 & date == ave(date, pnr, FUN = min))
data$Max <- with(data, i1 & date == ave(date, pnr, FUN = max))
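Note that this encodes Min and Max as TRUE/FALSE rather than the 1/NA of the other answers; if the 1/NA encoding is needed, the logical columns can be converted afterwards, e.g.:
data$Min <- ifelse(data$Min, 1, NA)
data$Max <- ifelse(data$Max, 1, NA)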

With data.table
library(data.table)
setDT(data)
data[pnr %in% duplicates, ":="(
Min = (date == min(date)) * 1L,
Max = (date == max(date)) * 1L
), by = pnr
]
data[, c("Min", "Max") := lapply(.SD, function(x) ifelse(x == 0, NA, x)), .SDcols = c("Min", "Max")]

Here is a base R solution with ave(). It uses the trick in akrun's answer that NA^0 == 1 (more precisely, that NA^FALSE == NA^0 == 1). Note that first is computed as "not the max" and second as "not the min", which coincides with first/second operation only when each patient has at most two dates:
data$first <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == max(d))))
data$second <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == min(d))))
data
# pnr date first second
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA

A data.table option
setDT(data)[
,
`:=`(
first = ifelse(min(date) == date & .N > 1, 1, NA_integer_),
second = ifelse(max(date) == date & .N > 1, 1, NA_integer_)
),
pnr
]
gives
pnr date first second
1: a43 2011-12-19 1 NA
2: a4945 2012-09-11 NA NA
3: a43 2013-10-01 NA 1
4: a231 2012-05-09 NA 1
5: a231 2009-09-10 1 NA
6: a6901 2015-06-12 NA NA
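To check the speedup at a realistic scale, here is a quick timing sketch (the row count, number of ids, and id format are made up to mimic the question):
library(data.table)
set.seed(1)
n <- 9e5
big <- data.table(
pnr = sample(sprintf("p%06d", 1:3e5), n, replace = TRUE),
date = as.Date("2010-01-01") + sample(0:3650, n, replace = TRUE)
)
system.time(
big[, `:=`(
first = fifelse(date == min(date) & .N > 1, 1L, NA_integer_),
second = fifelse(date == max(date) & .N > 1, 1L, NA_integer_)
), by = pnr]
)
On data of this size the grouped assignment finishes in seconds rather than hours, because it makes one pass over the groups instead of rescanning the whole table for every id.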

Related

If statement with two conditions and NA

I am looking to use a conditional statement to select the date rows that are before 0021-01-11 and have an NA value in a specific column (People_vaccinated, for example). For those rows I want to impute zero.
I want to use an if statement with (condition1 AND condition2).
Condition1 would be df$People_vaccinated == NA and condition2 would be df$date < 'given date'.
Maybe this will help:
df <- data.frame(Date = c('0021-01-07', '0021-01-08','0021-01-11', '0021-01-12'),
a = c(2, NA, 3, NA),
b = c(1, NA, 2, 3))
ind <- match('0021-01-11', df$Date)
df$a[1:ind][is.na(df$a[1:ind])] <- 0
df
# Date a b
#1 0021-01-07 2 1
#2 0021-01-08 0 NA
#3 0021-01-11 3 2
#4 0021-01-12 NA 3
Or using dplyr:
library(dplyr)
df <- df %>%
mutate(a = replace(a,
row_number() <= match('0021-01-11', Date) & is.na(a), 0))
df
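As an aside, the df$People_vaccinated == NA condition from the question can never work: any comparison with NA returns NA rather than TRUE or FALSE, which is why is.na() is used instead.
NA == NA
#> [1] NA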

apply two functions by row condition data.table

I have the following data.table
df <- data.table(
id = c(rep(1,6),rep(2,6),rep(3,6)),
grp = c(rep("x",6),rep("y",6),rep("z",6)),
val1 = 1:18,
val2 = 13:30
)
I want to apply two different functions depending on a row condition,
for example:
cols <- paste0("val",1:2)
df[id == 1,lapply(.SD, function (x) tail(x,2)),.SDcols = cols,by = list(id,grp)]
df[id != 1,lapply(.SD, function (x) tail(x,3)),.SDcols = cols,by = list(id,grp)]
I'm quite new to working with data.table, so maybe there is a more efficient way than carrying out separate calculations and then joining the two tables above.
If the conditions are disjunct, i.e., id == 1 and id != 1, and if id is also one of the grouping variables (in the by = clause), two different functions can be applied as follows:
df[, lapply(.SD, function (x) if (first(id) == 1) tail(x, 2) else tail(x, 3)),
.SDcols = cols, by = .(id, grp)]
id grp val1 val2
1: 1 x 5 17
2: 1 x 6 18
3: 2 y 10 22
4: 2 y 11 23
5: 2 y 12 24
6: 3 z 16 28
7: 3 z 17 29
8: 3 z 18 30
So, subsetting is not by row but by grouping variable, and it has been moved into the anonymous function definition within lapply(). This avoids having to rbind() the subsets afterwards.
For the sake of completeness, in the particular case of the tail() function being called with different parameters, we can write more concisely:
df[, lapply(.SD, tail, n = fifelse(first(id) == 1, 2, 3)),
.SDcols = cols, by = .(id, grp)]
Here is another option:
df[.N:1L, ri := rowid(id, grp)]
rbindlist(list(
df[id == 1L & ri <= 2L], #for the first, df[id == 1L, tail(.SD, 2L), .(id, grp), .SDcols = cols]
df[id != 1L & ri <= 3L] #and for df[id != 2, tail(.SD, 3L), .(id,grp), .SDcols = cols]
))
output:
id grp val1 val2 ri
1: 1 x 5 17 2
2: 1 x 6 18 1
3: 2 y 10 22 3
4: 2 y 11 23 2
5: 2 y 12 24 1
6: 3 z 16 28 3
7: 3 z 17 29 2
8: 3 z 18 30 1
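Since := modified df by reference, both df and the combined result now carry the helper column ri; it can be dropped afterwards (assuming the rbindlist() result was saved as res):
res[, ri := NULL]
df[, ri := NULL]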
I would be interested to know the size of your dataset and the speedup.

Replace certain variables with NA when another variable is NA

What is the best function to use if I want to replace certain variables with NA based on a condition?
If status is NA, then score_1:score_3 should be set to NA.
I tried:
if(df2$status == NA){
df2$score_2 <- NA
}else{
df2$score_2 <- df$score_2
}
Thanks in advance
One option in base R is to find the NAs in 'Status' and assign NA to the columns whose names contain 'Score':
i1 <- is.na(df2$Status)
df2[i1, grep("^Score_\\d+$", names(df2))] <- NA
Or an option in dplyr
library(dplyr)
df2 %>%
mutate_at(vars(starts_with('Score')), ~ replace(., is.na(Status), NA))
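With dplyr >= 1.0, where mutate_at()/vars() are superseded, the same can be written with across():
library(dplyr)
df2 %>%
mutate(across(starts_with('Score'), ~ replace(.x, is.na(Status), NA)))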
You can do this by finding out which rows of the data frame have an NA Status and then setting the score columns in those rows to NA.
df <- data.frame(client_id = 1:4,
Date = 1:4,
Status = c(1, NA, 1, NA),
Score1 = runif(4)*100,
Score2 = runif(4)*100,
Score3 = runif(4)*100)
idx <- is.na(df$Status)
df[idx, 4:6] <- NA
df
#> client_id Date Status Score1 Score2 Score3
#> 1 1 1 1 48.08677 16.62185 91.80062
#> 2 2 2 NA NA NA NA
#> 3 3 3 1 14.04552 64.55724 56.45998
#> 4 4 4 NA NA NA NA
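If the score columns are not guaranteed to sit in positions 4:6, selecting them by name is more robust (a minor variant of the above):
df[idx, grep("^Score", names(df))] <- NA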

Using "by-argument" in "outer" data.table to filter "inner" data.table

I still have some problems understanding the data.table notation. Could anyone explain why the following is not working?
I'm trying to classify dates into groups using cut(). The breaks to use are stored in another data.table and depend on the by argument of the outer data.table, data:
data <- data.table(A = c(1, 1, 1, 2, 2, 2),
DATE = as.POSIXct(c("01-01-2012", "30-05-2015", "01-01-2020", "30-06-2012", "30-06-2013", "01-01-1999"), format = "%d-%m-%Y"))
breaks <- data.table(B = c(1, 1, 2, 2),
BREAKPOINT = as.POSIXct(c("01-01-2015", "01-01-2016", "30-06-2012", "30-06-2013"), format = "%d-%m-%Y"))
data[, bucket := cut(DATE, breaks[B == A, BREAKPOINT], ordered_result = T), by = A]
I can get the desired result by doing:
# expected
data[A == 1, bucket := cut(DATE, breaks[B == 1, BREAKPOINT], ordered_result = T)]
data[A == 2, bucket := cut(DATE, breaks[B == 2, BREAKPOINT], ordered_result = T)]
data
# A DATE bucket
# 1: 1 2012-01-01 NA
# 2: 1 2015-05-30 2015-01-01
# 3: 1 2020-01-01 NA
# 4: 2 2012-06-30 2012-06-30
# 5: 2 2013-06-30 NA
# 6: 2 1999-01-01 NA
Thanks,
Michael
The problem is that cut() produces factors, and those are not handled correctly in the data.table by operation (this is a bug and should be reported; the factor levels should be handled the same way they are handled in rbind.data.table or rbindlist). An easy fix for your original expression is to convert to character:
data[, bucket := as.character(cut(DATE, breaks[B == A, BREAKPOINT], ordered_result = T))
, by = A]
# A DATE bucket
#1: 1 2012-01-01 NA
#2: 1 2015-05-30 2015-01-01
#3: 1 2020-01-01 NA
#4: 2 2012-06-30 2012-06-30
#5: 2 2013-06-30 NA
#6: 2 1999-01-01 NA
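A quick way to see the underlying issue: each group's cut() call returns a factor whose levels depend on that group's breakpoints, so the per-group level sets differ and combining the bare factor codes across groups gives meaningless values. For instance:
levels(cut(data[A == 1, DATE], breaks[B == 1, BREAKPOINT]))
#> [1] "2015-01-01"
levels(cut(data[A == 2, DATE], breaks[B == 2, BREAKPOINT]))
#> [1] "2012-06-30"
Converting to character makes the values comparable across groups, which is why the as.character() wrapper works.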

How to delete columns that contain ONLY NAs?

I have a data.frame containing some columns with all NA values. How can I delete them from the data.frame?
Can I use the function na.omit(...) with some additional arguments?
One way of doing it:
df[, colSums(is.na(df)) != nrow(df)]
If the count of NAs in a column is equal to the number of rows, it must be entirely NA.
Or similarly
df[colSums(!is.na(df)) > 0]
Here is a dplyr solution:
df %>% select_if(~sum(!is.na(.)) > 0)
Update: The select_if() function is superseded as of dplyr 1.0. Here are two other solutions that use the where() tidyselect helper:
df %>%
select(
where(
~sum(!is.na(.x)) > 0
)
)
df %>%
select(
where(
~!all(is.na(.x))
)
)
Another option is the janitor package:
df <- janitor::remove_empty(df, which = "cols")
https://github.com/sfirke/janitor
It seems like you want to remove ONLY columns that are entirely NA, leaving columns where only some rows are NA. I would do this (though I am sure there is a more efficient vectorised solution):
# set seed for reproducibility
set.seed(103)
df <- data.frame( id = 1:10 , nas = rep( NA , 10 ) , vals = sample( c( 1:3 , NA ) , 10 , repl = TRUE ) )
df
# id nas vals
# 1 1 NA NA
# 2 2 NA 2
# 3 3 NA 1
# 4 4 NA 2
# 5 5 NA 2
# 6 6 NA 3
# 7 7 NA 2
# 8 8 NA 3
# 9 9 NA 3
# 10 10 NA 2
# Use this command to remove columns that are entirely NA values; it will leave columns where only some values are NA
df[ , ! apply( df , 2 , function(x) all(is.na(x)) ) ]
# id vals
# 1 1 NA
# 2 2 2
# 3 3 1
# 4 4 2
# 5 5 2
# 6 6 3
# 7 7 2
# 8 8 3
# 9 9 3
# 10 10 2
If you find yourself in the situation where you want to remove columns that have any NA values you can simply change the all command above to any.
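For example:
df[ , ! apply( df , 2 , function(x) any(is.na(x)) ) , drop = FALSE ]  # drop = FALSE keeps a data.frame even if only one column remains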
Another option with Filter
Filter(function(x) !all(is.na(x)), df)
NOTE: Data from #Simon O'Hanlon's post.
An intuitive one-liner: dplyr::select_if(~!all(is.na(.))). It keeps only the columns that are not entirely missing, i.e., it deletes the columns in which every element is NA.
> df <- data.frame( id = 1:10 , nas = rep( NA , 10 ) , vals = sample( c( 1:3 , NA ) , 10 , repl = TRUE ) )
> df %>% glimpse()
Observations: 10
Variables: 3
$ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
$ nas <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
$ vals <int> NA, 1, 1, NA, 1, 1, 1, 2, 3, NA
> df %>% select_if(~!all(is.na(.)))
id vals
1 1 NA
2 2 1
3 3 1
4 4 NA
5 5 1
6 6 1
7 7 1
8 8 2
9 9 3
10 10 NA
Because performance was really important for me, I benchmarked all the functions above.
NOTE: Data from #Simon O'Hanlon's post, but with 15,000 rows instead of 10.
library(tidyverse)
library(microbenchmark)
set.seed(123)
df <- data.frame(id = 1:15000,
nas = rep(NA, 15000),
vals = sample(c(1:3, NA), 15000,
repl = TRUE))
df
MadSconeF1 <- function(x) x[, colSums(is.na(x)) != nrow(x)]
MadSconeF2 <- function(x) x[colSums(!is.na(x)) > 0]
BradCannell <- function(x) x %>% select_if(~sum(!is.na(.)) > 0)
SimonOHanlon <- function(x) x[ , !apply(x, 2 ,function(y) all(is.na(y)))]
jsta <- function(x) janitor::remove_empty(x)
SiboJiang <- function(x) x %>% dplyr::select_if(~!all(is.na(.)))
akrun <- function(x) Filter(function(y) !all(is.na(y)), x)
mbm <- microbenchmark(
"MadSconeF1" = {MadSconeF1(df)},
"MadSconeF2" = {MadSconeF2(df)},
"BradCannell" = {BradCannell(df)},
"SimonOHanlon" = {SimonOHanlon(df)},
"SiboJiang" = {SiboJiang(df)},
"jsta" = {jsta(df)},
"akrun" = {akrun(df)},
times = 1000)
mbm
Results:
Unit: microseconds
expr min lq mean median uq max neval cld
MadSconeF1 154.5 178.35 257.9396 196.05 219.25 5001.0 1000 a
MadSconeF2 180.4 209.75 281.2541 226.40 251.05 6322.1 1000 a
BradCannell 2579.4 2884.90 3330.3700 3059.45 3379.30 33667.3 1000 d
SimonOHanlon 511.0 565.00 943.3089 586.45 623.65 210338.4 1000 b
SiboJiang 2558.1 2853.05 3377.6702 3010.30 3310.00 89718.0 1000 d
jsta 1544.8 1652.45 2031.5065 1706.05 1872.65 11594.9 1000 c
akrun 93.8 111.60 139.9482 121.90 135.45 3851.2 1000 a
autoplot(mbm)
mbm %>%
as_tibble() %>%  # tbl_df() is deprecated; as_tibble() is the current equivalent
ggplot(aes(sample = time)) +
stat_qq() +
stat_qq_line() +
facet_wrap(~expr, scales = "free")
Try as follows:
df <- df[,colSums(is.na(df))<nrow(df)]
Another option uses the map_lgl() function from the purrr package, which returns a logical vector, together with [ to remove the columns that are all NA. Here is a reproducible example:
set.seed(7)
df <- data.frame(id = 1:5 , nas = rep(NA, 5) , vals = sample(c(1:3,NA), 5, repl = TRUE))
df
#> id nas vals
#> 1 1 NA 2
#> 2 2 NA 3
#> 3 3 NA 3
#> 4 4 NA NA
#> 5 5 NA 3
library(purrr)
df[!map_lgl(df, ~ all(is.na(.)))]
#> id vals
#> 1 1 2
#> 2 2 3
#> 3 3 3
#> 4 4 NA
#> 5 5 3
Created on 2022-08-28 with reprex v2.0.2
