Mix values from 2 dataframes in a mutate function with conditions - r

I have 2 dataframes with these forms:
# Sample intervals: each row has an idCarte, an idPack and a
# timeIn/timeOut pair written as zero-padded "HH:MM:SS" strings.
DF1 <- data.frame(
  idCarte = c("a", "a", "b", "b", "b"),
  idPack  = c("1", "2", "2", "3", "3"),
  timeIn  = c("10:00:02", "12:00:50", "11:40:00", "12:10:35", "15:15:00"),
  timeOut = c("12:00:00", "14:00:00", "11:50:00", "15:00:00", "16:00:00")
)
DF1
idCarte idPack timeIn timeOut
a 1 10:00:02 12:00:00
a 2 12:00:50 14:00:00
b 2 11:40:00 11:50:00
b 3 12:10:35 15:00:00
b 3 15:15:00 16:00:00
# Sample time points: each row has an idCarte, an idPack and one
# timeBetween value written as a zero-padded "HH:MM:SS" string.
DF2 <- data.frame(
  idCarte     = c("a", "a", "b", "b", "b"),
  idPack      = c("1", "2", "3", "3", "2"),
  timeBetween = c("11:00:02", "13:00:50", "14:10:35", "15:20:00", "18:00:00")
)
DF2
idCarte idPack timeBetween
a 1 11:00:02
a 2 13:00:50
b 3 14:10:35
b 3 15:20:00
b 2 18:00:00
And I want to get this result
idCarte idPack timeIn timeOut timeBetween
a 1 10:00:02 12:00:00 11:00:02
a 2 12:00:50 14:00:00 13:00:50
b 2 11:40:00 11:50:00 NA
b 3 12:10:35 15:00:00 14:10:35
b 3 15:15:00 16:00:00 15:20:00
I can do it with a for loop like this but it's really slow
# For every row of DF1, look up the first DF2 row that has the same idCarte
# and idPack and whose timeBetween lies within [timeIn, timeOut]; store that
# time in DF1$timeBetween, or NA when no DF2 row qualifies.
# The times are compared as strings, which orders them correctly here because
# they are all zero-padded "HH:MM:SS" values.
# seq_len() is used instead of 1:nrow(DF1) so that an empty DF1 results in
# zero iterations (1:0 would iterate over 1 and 0).
for (i in seq_len(nrow(DF1))) {
  timeBetweenLocal <- DF2 %>%
    filter(
      idCarte == DF1[i, "idCarte"] &
        idPack == DF1[i, "idPack"] &
        timeBetween >= DF1[i, "timeIn"] &
        timeBetween <= DF1[i, "timeOut"]
    )
  if (nrow(timeBetweenLocal) > 0) {
    # Several DF2 rows may qualify; keep the first one.
    DF1[i, "timeBetween"] <- timeBetweenLocal[1, "timeBetween"]
  } else {
    DF1[i, "timeBetween"] <- NA
  }
}
I want to do it in a vectorised way with dplyr::mutate to go faster but it seems a little bit tricky.
# Attempted vectorised version of the loop; it fails with the error quoted
# right after this snippet.
# NOTE(review): inside the nested `DF2 %>% dplyr::filter(...)` calls, magrittr
# binds `.` to the left-hand side of the *inner* pipe, i.e. DF2 rather than
# DF1. `.$idCarte` / `.$idPack` would then compare DF2 with itself, and
# `.$timeIn` / `.$timeOut` are presumably NULL (DF2 has no such columns),
# which would explain the zero-length result in the error -- to be confirmed.
# NOTE(review): the ifelse() test, nrow(...) > 0, is a single TRUE/FALSE and
# not one value per row of DF1.
DF1 %>%
mutate (
timeBetween = ifelse (
nrow(DF2 %>%
dplyr::filter(
idCarte == .$idCarte &
idPack == .$idPack &
timeBetween >=.$timeIn &
timeBetween <= .$timeOut
)
) > 0,
DF2 %>%
dplyr::filter(
idCarte == .$idCarte &
idPack == .$idPack &
timeBetween >=.$timeIn &
timeBetween <= .$timeOut
),
NA
)
)
# Error : Result must have length 4, not 0
My problem is that I need to test the matching time, because the same (idCarte, idPack) pair can occur several times.
Does anyone have an idea how to vectorise this algorithm?
Thanks

Here is a solution with left_join and case_when. left_join can lead to duplicated rows; you can use na.omit or filter(!duplicated(...)) if you want to remove some of the duplicates.
library(lubridate)
library(dplyr)

# Your data
DF1 <- data.frame(
  stringsAsFactors = FALSE,
  idCarte = c("a", "a", "b", "b", "b"),
  idPack = c("1", "2", "2", "3", "3"),
  timeIn = c("10:00:02", "12:00:50", "11:40:00", "12:10:35", "15:15:00"),
  timeOut = c("12:00:00", "14:00:00", "11:50:00", "15:00:00", "16:00:00")
)
DF2 <- data.frame(
  stringsAsFactors = FALSE,
  idCarte = c("a", "a", "b", "b", "b"),
  idPack = c("1", "2", "3", "3", "2"),
  timeBetween = c("11:00:02", "13:00:50", "14:10:35", "15:20:00", "18:00:00")
)

# Solution with left_join. The join pairs every DF1 row with every DF2 row
# that has the same idCarte and idPack, so it leads to duplicated DF1 rows
# whenever a key occurs more than once in DF2.
# timeBetween is kept only when it lies within [timeIn, timeOut] (compared as
# lubridate periods); otherwise it is set to NA.
df <- left_join(x = DF1, y = DF2, by = c("idCarte", "idPack")) %>%
  mutate(
    timeBetween = case_when(
      hms(timeBetween) >= hms(timeIn) &
        hms(timeBetween) <= hms(timeOut) ~ timeBetween,
      TRUE ~ NA_character_
    )
  )
# The output
#
# idCarte idPack timeIn timeOut timeBetween
# 1 a 1 10:00:02 12:00:00 11:00:02
# 2 a 2 12:00:50 14:00:00 13:00:50
# 3 b 2 11:40:00 11:50:00 <NA>
# 4 b 3 12:10:35 15:00:00 14:10:35
# 5 b 3 12:10:35 15:00:00 <NA>
# 6 b 3 15:15:00 16:00:00 <NA>
# 7 b 3 15:15:00 16:00:00 15:20:00

Here is the dplyr solution as mentioned in the comments:
library(dplyr)
library(lubridate)

# Attach to every DF1 row the first DF2 time with the same idCarte/idPack that
# lies within [timeIn, timeOut] (bounds included, as in the question's loop).
# Out-of-range candidates are turned into NA instead of being filtered away,
# so a DF1 row whose only candidates are out of range (here b/2, whose single
# candidate is 18:00:00) is kept with timeBetween = NA rather than dropped.
DF1 %>%
  # Remember the originating DF1 row so the join fan-out can be collapsed.
  mutate(rowId = row_number()) %>%
  left_join(DF2, by = c("idCarte", "idPack")) %>%
  mutate(
    timeIn = as_datetime(hms(timeIn)),
    timeOut = as_datetime(hms(timeOut)),
    timeBetween = as_datetime(hms(timeBetween)),
    inRange = !is.na(timeBetween) &
      timeBetween >= timeIn & timeBetween <= timeOut,
    timeBetween = replace(timeBetween, !inRange, NA)
  ) %>%
  # In-range candidates sort first within each DF1 row; keep one row per rowId.
  arrange(rowId, !inRange) %>%
  distinct(rowId, .keep_all = TRUE) %>%
  select(-rowId, -inRange)
# Expected result for the sample data:
# idCarte idPack timeIn timeOut timeBetween
#1 a 1 1970-01-01 10:00:02 1970-01-01 12:00:00 1970-01-01 11:00:02
#2 a 2 1970-01-01 12:00:50 1970-01-01 14:00:00 1970-01-01 13:00:50
#3 b 2 1970-01-01 11:40:00 1970-01-01 11:50:00 <NA>
#4 b 3 1970-01-01 12:10:35 1970-01-01 15:00:00 1970-01-01 14:10:35
#5 b 3 1970-01-01 15:15:00 1970-01-01 16:00:00 1970-01-01 15:20:00

To check for matches in the first two columns we may use outer. For multiple matches we want to check whether the time is between timeIn and timeOut. Therefore it's advantageous to convert the times into POSIXct format.
# Parse the "HH:MM:SS" strings in columns 3-4 of DF1 and in column 3 of DF2
# into POSIXct values so that they can be compared as times.
DF1[3:4] <- lapply(
  DF1[3:4],
  function(col) as.POSIXct(col, format = "%H:%M:%S")
)
DF2[[3]] <- as.POSIXct(DF2[[3]], format = "%H:%M:%S")
For the outer we code a convenience function.
# Paste the elements of `x` (e.g. the columns of a data frame) together
# element-wise by folding paste(), with its default separator, over them
# from left to right.
rp <- function(x) {
  Reduce(function(acc, nxt) paste(acc, nxt), x)
}
Now, using outer, we create a list w that holds, for each row of DF1, the indices of the DF2 rows whose first two columns match.
# w[[i]] holds the indices of the DF2 rows whose first two columns match those
# of DF1 row i. The rows of the outer() comparison matrix are walked with
# lapply() so that w is always a list: apply(..., 1, which) only returns a
# list when the number of matches differs between rows and otherwise
# simplifies the result to a vector or matrix.
w <- local({
  is_match <- outer(rp(DF1[1:2]), rp(DF2[1:2]), `==`)
  lapply(seq_len(nrow(is_match)), function(i) which(is_match[i, ]))
})
Look at the lapply(...) call in the following line: for each list entry of w we return NA if it is empty; otherwise we choose the matching DF2 time that falls within the time frame of the DF1 row, and if none does we again return NA. The do.call("c", ...) concatenates the resulting list into a vector that we can cbind to DF1.
# For each DF1 row i, take the candidate DF2 times listed in w[[i]] and keep
# the first one that lies within [timeIn, timeOut] of that row (bounds
# included, as in the question's loop); rows without such a time get NA.
# The missing value is a POSIXct NA and exactly one value is returned per row,
# so c() always yields a POSIXct vector with one element per DF1 row.
res <- cbind(DF1, timeBetween = do.call("c", lapply(seq_along(w), function(i) {
  r <- DF2[w[[i]], 3]
  r <- r[r >= DF1[i, 3] & r <= DF1[i, 4]]
  if (length(r) == 0) as.POSIXct(NA) else r[1]
})))
Optionally, we can strip off the dates at the end.
# Format columns 3-5 of res (the three time columns) back to "HH:MM:SS" text.
res[3:5] <- lapply(
  res[3:5],
  function(col) strftime(col, format = "%H:%M:%S")
)
Result
res
# idCarte idPack timeIn timeOut timeBetween
# 1 a 1 10:00:02 12:00:00 11:00:02
# 2 a 2 12:00:50 14:00:00 13:00:50
# 3 b 2 11:40:00 11:50:00 <NA>
# 4 b 3 12:10:35 15:00:00 14:10:35
# 5 b 3 15:15:00 16:00:00 15:20:00

Related

How to rewrite loop to run faster in R?

Given a dataset of more than 900,000 rows, of which length(duplicates) is more than 300,000, the following loop takes approximately 4 hours to run in R, which is unacceptable.
# For each pnr listed in `duplicates`, write 1 into data$first on that pnr's
# row(s) with the earliest date and 1 into data$second on its row(s) with the
# latest date. Every iteration filters `data` and evaluates two full-length
# logical masks over it, so the amount of work grows with
# length(duplicates) * nrow(data).
for(i in duplicates) {
couple_table <- filter(data, pnr == i) # keep only the rows of the current patient
min_date <- min(couple_table$date) # determine date of first operation
max_date <- max(couple_table$date) # determine date of second operation
data$first[data$pnr == i & data$date == min_date] <- 1 # assign 1 to column first
data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}
How can I tweak this code to run faster in R? I have had a look at *apply but I am not familiar with it at all, any ideas?
Dummy data:
# Dummy data: one row per (pnr, date); "a43" and "a231" occur twice.
data <- data.frame(
  pnr = c("a43", "a4945", "a43", "a231", "a231", "a6901"),
  date = as.Date(c(
    "2011-12-19", "2012-09-11", "2013-10-01",
    "2012-05-09", "2009-09-10", "2015-06-12"
  ))
)
# pnr values on rows that repeat an earlier pnr.
duplicates <- as.character(data$pnr[duplicated(data$pnr)])
A group-by operation would be faster
library(dplyr)

# Per pnr: Min is 1 on the rows holding the group's earliest date and Max is 1
# on the rows holding its latest date; all other rows are NA, as is every row
# of a pnr that occurs only once.
# NA^TRUE is NA and NA^FALSE is 1, so NA^(date != extreme) yields 1 exactly
# where the date equals the extreme.
data %>%
  group_by(pnr) %>%
  mutate(
    Min = if (n() <= 1) NA else NA^(date != min(date)),
    Max = if (n() <= 1) NA else NA^(date != max(date))
  ) %>%
  ungroup()
-output
# A tibble: 6 x 4
# pnr date Min Max
# <chr> <date> <dbl> <dbl>
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Similar logic in data.table would be
library(data.table)

# Same logic by reference: per pnr, Min/Max become 1 on the rows with the
# group's earliest/latest date and NA elsewhere; groups with a single row
# get NA in both columns.
setDT(data)
data[, c("Min", "Max") := list(
  if (.N > 1) NA^(date != min(date)) else NA,
  if (.N > 1) NA^(date != max(date)) else NA
), by = pnr]
Or we may use collapse for faster execution
library(collapse)

# n is the per-pnr number of observations, broadcast back to every row.
# Min/Max are 1 where the row's date equals the per-pnr minimum/maximum and
# the pnr occurs more than once, NA otherwise (NA^FALSE is 1, NA^TRUE is NA).
# fnobs() is the current name of the function formerly called fNobs(), which
# was renamed in collapse 1.6.0.
data %>%
  ftransform(n = fnobs(date, pnr, TRA = "replace_fill")) %>%
  ftransform(
    Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1),
    Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1),
    n = NULL
  )
# pnr date Min Max
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Or use base R with duplicated
# i1 is TRUE on every row whose pnr occurs more than once.
i1 <- data$pnr %in% data$pnr[duplicated(data$pnr)]
# Min/Max are logical: TRUE where a repeated pnr has its earliest/latest date.
data$Min <- i1 & data$date == ave(data$date, data$pnr, FUN = min)
data$Max <- i1 & data$date == ave(data$date, data$pnr, FUN = max)
With data.table
library(data.table)
setDT(data)
data[pnr %in% duplicates, ":="(
Min = (date == min(date)) * 1L,
Max = (date == max(date)) * 1L
), by = pnr
]
data[, c("Min", "Max") := lapply(.SD, function(x) ifelse(x == 0, NA, x)), .SDcols = c("Min", "Max")]
Here is a base R solution with ave. It uses the trick in akrun's answer, that
NA^0 == 1
(More precisely, that NA^FALSE == NA^0 == 1)
# first is 1 on the row(s) with the earliest date of a pnr and second is 1 on
# the row(s) with its latest date; all other rows are NA, as is every row of
# a pnr that occurs only once.
# NA^TRUE is NA and NA^FALSE is 1, so each condition lists the rows that must
# stay NA. Testing `d != min(d)` (rather than `d == max(d)`) keeps the result
# right for a pnr with three or more rows, where the rows in the middle are
# neither first nor second.
data$first <- with(data, ave(
  as.integer(date), pnr,
  FUN = function(d) NA^(length(d) < 2L | d != min(d))
))
data$second <- with(data, ave(
  as.integer(date), pnr,
  FUN = function(d) NA^(length(d) < 2L | d != max(d))
))
data
# pnr date first second
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
A data.table option
# Per pnr, by reference: first/second are 1 where the date equals the group's
# earliest/latest date and the group has more than one row, NA otherwise.
setDT(data)
data[, `:=`(
  first = ifelse(.N > 1 & date == min(date), 1, NA_integer_),
  second = ifelse(.N > 1 & date == max(date), 1, NA_integer_)
), by = pnr]
gives
pnr date first second
1: a43 2011-12-19 1 NA
2: a4945 2012-09-11 NA NA
3: a43 2013-10-01 NA 1
4: a231 2012-05-09 NA 1
5: a231 2009-09-10 1 NA
6: a6901 2015-06-12 NA NA

How to drop all rows of data frame after specific row number or date in R?

I have a data frame with a certain number of rows.
Would like to drop all rows after a specific row number or after a date.
Any suggestions?
Could not find anything on the web that works for me for the moment...
Here's how you can do this:
## one way: select rows from the first row up to the row number you want
## (seq_len(min(...)) never indexes past the last row, so a shorter data
## frame is not padded with NA rows)
df <- df[seq_len(min(2, nrow(df))), ]
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
## another way: drop everything from the first row you don't want (here row 3)
## to the last row. The guard is needed because 3:nrow(df) counts backwards
## when df has fewer than 3 rows, and an empty negative index would select no
## rows at all.
drop_rows <- which(seq_len(nrow(df)) >= 3)
if (length(drop_rows) > 0) {
  df <- df[-drop_rows, ]
}
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
df <- df[df$date < as.Date("2017-01-03"), ] ## subset based on a date value
# a b c date
#1 1 2 3 2017-01-01
#2 1 2 3 2017-01-02
data
## creating a dummy data frame: three numeric columns plus four consecutive days
df <- data.frame(
  a = rep(c(1, 4), each = 2),
  b = rep(c(2, 5), each = 2),
  c = rep(c(3, 6), each = 2),
  date = seq(
    from = as.Date("2017-01-01"),
    to = as.Date("2017-01-04"),
    by = "day"
  )
)
We can use head
n <- 5
df2 <- head(df1, n)
df2
# date col2
#1 2019-01-01 -0.5458808
#2 2019-02-01 0.5365853
#3 2019-03-01 0.4196231
#4 2019-04-01 -0.5836272
#5 2019-05-01 0.8474600
Or create a logical vector
df1[seq_len(nrow(df1)) <= n, ]
Or another option is slice
library(dplyr)
df1 %>%
slice(seq_len(n))
Or with data.table
library(data.table)
setDT(df1)[seq_len(n)]
If it is based on a date value
date1 <- as.Date("2019-05-01")
subset(df1, date <= date1)
data
# Reproducible dummy data: ten month starts from 2019-01-01 and ten normal draws.
set.seed(24)
df1 <- data.frame(
  date = seq(as.Date("2019-01-01"), by = "month", length.out = 10),
  col2 = rnorm(10)
)

comparing dates in R not working well (equal)

I want to compare two date columns of a data frame and add a column that indicates whether the date in "A" is <= the date in "B" or > it
# Two columns of dates written as "dd-mm-yyyy" text.
df <- data.frame(
  A = c("15-10-2000", "15-10-2000", "15-10-2000", "20-10-2000"),
  B = c("15-10-2000", "16-10-2000", "14-10-2000", "19-10-2000")
)
What I would like to include is new column C = ( 1 , 1, 0, 0).
I have tried:
df$C = ifelse (df$A <= df$B, 1, 0)
It works except for the "equal" comparison.
I get: C = ( 0 , 1, 0, 0)
Sorry, but before doing the comparison I changed the format to Date and it still does not work
df$A= as.Date(df$A, format = "%d-%m-%Y")
df$B = as.Date(df$B, format = "%d-%m-%Y")
The date columns are factors. You need to first convert them to Date class and then compare
library(dplyr)

# Parse columns A to B as dd-mm-yyyy dates, then flag A <= B as 1/0.
# across() replaces the superseded mutate_at(vars(...), ...) form.
df %>%
  mutate(across(A:B, ~ as.Date(.x, format = "%d-%m-%Y"))) %>%
  mutate(C = as.integer(A <= B))
# A B C
#1 2000-10-15 2000-10-15 1
#2 2000-10-15 2000-10-16 1
#3 2000-10-15 2000-10-14 0
#4 2000-10-20 2000-10-19 0
Or in base R that would be
# Parse the first two columns as dd-mm-yyyy dates, then flag A <= B as 1/0.
df[1:2] <- lapply(
  df[1:2],
  function(col) as.Date(col, format = "%d-%m-%Y")
)
df[["C"]] <- as.integer(df[["A"]] <= df[["B"]])
You should convert the factors to dates (As Jon Spring pointed out). Then it should work
library(dplyr)

# Parse every column as a day-month-year date, then flag A <= B as 1/0.
# across(everything(), ...) replaces the superseded mutate_all().
df %>%
  mutate(across(everything(), lubridate::dmy)) %>%
  mutate(C = ifelse(A <= B, 1, 0))
A B C
1 2000-10-15 2000-10-15 1
2 2000-10-15 2000-10-16 1
3 2000-10-15 2000-10-14 0
4 2000-10-20 2000-10-19 0

How to do Countifs in R

Data:
# Reproducible sample data: df1 has one row per day of 2018-01-01..2018-01-30
# with a shuffled value and a random yes/no flag; df2 holds every 7th day of
# the same range.
set.seed(42)
df1 <- data.frame(
  Date = seq(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 1),
  value = sample(1:30),
  Y = sample(c("yes", "no"), 30, replace = TRUE)
)
df2 <- data.frame(
  Date = seq(as.Date("2018-01-01"), as.Date("2018-01-30"), by = 7)
)
For sum if data falls within range this works (from my previous question):
library(data.table)

# Turn each table into intervals: df1 rows are single days, df2 rows are
# 7-day windows starting at Date.
df1[["start"]] <- df1[["Date"]]
df1[["end"]] <- df1[["Date"]]
df2[["start"]] <- df2[["Date"]]
df2[["end"]] <- df2[["Date"]] + 6

# foverlaps() needs both tables keyed on their interval columns.
interval_key <- c("start", "end")
setDT(df1, key = interval_key)
setDT(df2, key = interval_key)

# Sum `value` over the df1 days that fall into each df2 window.
d <- foverlaps(df1, df2)[, .(mySum = sum(value)), by = Date]
How can I do countif ?
because when I try
d = foverlaps(df1, df2)[, list(mySum = count(value)), by = Date ]
I get error
no applicable method for 'groups' applied to an object of class "c('double', 'numeric')"
We can use .N:
foverlaps(df1, df2)[, list(myCount = .N), by = Date ]
# Date myCount
# 1: 2018-01-01 7
# 2: 2018-01-08 7
# 3: 2018-01-15 7
# 4: 2018-01-22 7
# 5: 2018-01-29 2
d = foverlaps(df1, df2)[, .N, by = Date]
If you want to count the number of rows per Date, you can try .N
foverlaps(df1, df2)[, .(mysum = .N), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
If you want the count of unique values per Date you can try uniqueN()
foverlaps(df1, df2)[, .(mysum = uniqueN(value)), by = Date ]
Date mysum
1: 2018-01-01 7
2: 2018-01-08 7
3: 2018-01-15 7
4: 2018-01-22 7
5: 2018-01-29 2
Both .N and uniqueN() are from {data.table}.
Instead of list(mySum = count(value)) try c(mySum = count(value)). The Code runs for me then.
d2 <- foverlaps(df1, df2)[, c(mySum = count(value)), by = Date ]

How to overwrite some rows of a tibble with another tibble

Suppose I have data like the following:
# A tibble: 10 x 4
# Groups: a.month, a.group [10]
a.month a.group other.group amount
<date> <chr> <chr> <dbl>
1 2016-02-01 A X 15320
2 2016-05-01 A Z 50079
3 2016-06-01 A Y 60564
4 2016-08-01 A X 10540
5 2017-01-01 B X 30020
6 2017-03-01 B X 76310
7 2017-04-01 B Y 44215
8 2017-05-01 A Y 67241
9 2017-06-01 A Z 17180
10 2017-07-01 B Z 31720
And I want to produce rows for every possible combination of a.group, other.group and for every month in between (with amount being zero if not present on the data above)
I managed to produce a tibble with the default amounts through:
another.tibble <- as_tibble(expand.grid(
a.month = months.list,
a.group = unique.a.groups,
other.group = unique.o.groups,
amount = 0
));
How should I proceed to populate another.tibble with the values from the first one?
It is important to invoke expand.grid with stringsAsFactors=FALSE. Then, we simply make a LEFT_JOIN() to complete the combinations where we have data
library(tidyverse)

# Observed amounts. a.month is converted to Date (as in the question's
# tibble) so that a complete month sequence can be generated from it.
df <- tribble(
  ~a.month, ~a.group, ~other.group, ~amount,
  "2016-02-01", "A", "X", 15320,
  "2016-05-01", "A", "Z", 50079,
  "2016-06-01", "A", "Y", 60564,
  "2016-08-01", "A", "X", 10540,
  "2017-01-01", "B", "X", 30020,
  "2017-03-01", "B", "X", 76310,
  "2017-04-01", "B", "Y", 44215,
  "2017-05-01", "A", "Y", 67241,
  "2017-06-01", "A", "Z", 17180,
  "2017-07-01", "B", "Z", 31720
) %>%
  mutate(a.month = as.Date(a.month))

# Every combination of month, a.group and other.group with a default amount
# of 0. The months run from the first to the last observed month, so months
# without any data are included as well ("every month in between").
another.tibble <- as_tibble(expand.grid(
  a.month = seq(min(df$a.month), max(df$a.month), by = "month"),
  a.group = unique(df$a.group),
  other.group = unique(df$other.group),
  amount = 0,
  stringsAsFactors = FALSE
))

# Overwrite the default with the observed amount wherever one exists.
another.tibble %>%
  left_join(
    df,
    by = c("a.month", "a.group", "other.group"),
    suffix = c(".default", ".observed")
  ) %>%
  mutate(amount = coalesce(amount.observed, amount.default)) %>%
  select(a.month, a.group, other.group, amount)

Resources