I need to filter a large dataset (100K+ observations) in R so that it only includes data from 2014 to the present. The raw data contain observations from 2001 to the present. Here is some sample data to work from:
df <- data.frame(
  student = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
  GPA = c(4, 3.7, 2.0, 1.3, 2.9, 2.4, 4.0, 3.0, 2.0, 3.3),
  Failed_Course = c(1, 0, 1, 1, 1, 1, 1, 1, 1, 0),
  Exam_date = c("01/06/2010 06:55:00 AM", "03/30/2020 11:55:00 PM",
                "12/30/2014 12:55:00 AM", "04/20/2016 11:55:00 PM",
                "09/28/2014 11:12:00 PM", "07/30/2017 11:55:00 PM",
                "4/3/2005 09:55:00 PM", "8/20/2004 11:55:00 PM",
                "8/20/2015 11:22:00 AM", "6/22/2001 08:55:00 PM"))
Using dplyr and lubridate:
library(lubridate)
library(dplyr)
# Parse Exam_date as month/day/year hours:minutes:seconds
df$Exam_date <- mdy_hms(df$Exam_date)
# Create a date_year variable that holds only the year,
# filter for years greater than or equal to 2014,
# then drop date_year
df <- df %>%
mutate(date_year = year(Exam_date)) %>%
filter(date_year >= 2014) %>%
select(-date_year)
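The helper column isn't strictly required; a minimal variant (assuming Exam_date has already been parsed with mdy_hms as above) filters on the year directly:
df <- df %>%
  filter(year(Exam_date) >= 2014)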
Here is a base R approach.
df$Exam_date <- as.POSIXct(df$Exam_date, format = "%m/%d/%Y %I:%M:%S %p", tz = "UTC")
# use >= and an explicit tz so the cutoff isn't shifted by the session's local timezone
df[df$Exam_date >= as.POSIXct("2014-01-01 00:00:00", tz = "UTC"), ]
# student GPA Failed_Course Exam_date
#2 2 3.7 0 2020-03-30 23:55:00
#3 3 2.0 1 2014-12-30 00:55:00
#4 4 1.3 1 2016-04-20 23:55:00
#5 5 2.9 1 2014-09-28 23:12:00
#6 6 2.4 1 2017-07-30 23:55:00
#9 9 2.0 1 2015-08-20 11:22:00
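If you'd rather mirror the year-based dplyr filter in base R instead of comparing against a cutoff timestamp, format() can extract the year:
# keep rows whose exam year is 2014 or later
subset(df, as.integer(format(Exam_date, "%Y")) >= 2014)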
I am working on a project where I must only include patients who had lab tests ordered at least 12 hours apart, keeping the timestamp of each included lab test. The issue is that many patients get several labs done within the 12-hour window, but the client has asked us not to include those tests. I have made it this far:
# Create dummy dataset
df <- data.frame(
  Encounter = c(rep("12345", times = 16), rep("67890", times = 5)),
  Timestamp = c("01/06/2022 04:00:00", "01/07/2022 08:00:00",
                "01/08/2022 00:00:00", "01/08/2022 04:00:00",
                "01/08/2022 08:00:00", "01/08/2022 20:00:00",
                "01/09/2022 04:00:00", "01/09/2022 08:00:00",
                "01/09/2022 20:00:00", "01/09/2022 23:26:00",
                "01/10/2022 00:00:00", "01/10/2022 08:00:00",
                "01/10/2022 20:00:00", "01/11/2022 00:00:00",
                "01/11/2022 20:00:00", "01/12/2022 04:00:00",
                "11/10/2021 11:00:00", "11/10/2021 12:00:00",
                "11/10/2021 13:00:00", "11/10/2021 14:00:00",
                "11/11/2021 00:00:00"))
# Convert Timestamp to POSIXct (the POSIXlt that strptime() returns doesn't work in dplyr verbs)
df$Timestamp <- as.POSIXct(df$Timestamp, format = "%m/%d/%Y %H:%M:%S")
# Calculate time (in hours) since the previous timestamp, by Encounter
library(dplyr)

df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours"))
I can't seem to figure out what to do next. It seems like I need a rolling 12-hour sum that resets to 0 once a row reaches 12 hours, but I'm not sure how to go about it. Below is my ideal result:
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
There is absolutely nothing elegant about this, but I believe it gives you what you're looking for. I use a temporary variable to store the "rolling" sum before it is reset once the hours between tests reach 12 or more.
library(tidyverse)
df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  replace_na(list(time_diff = 0)) %>%
  mutate(temp = ifelse(time_diff < 12 & lag(time_diff) >= 12,
                       time_diff, lag(time_diff) + time_diff),
         temp = ifelse(is.na(temp), 0, temp),
         hours_between = ifelse(time_diff >= 12, time_diff,
                                ifelse(time_diff < 12 & lag(time_diff) >= 12,
                                       time_diff, lag(temp) + time_diff)),
         keep = ifelse(hours_between >= 12 | is.na(hours_between), 1, 0)) %>%
  select(-temp)
Created on 2022-01-27 by the reprex package (v2.0.1)
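Since the pipeline above was assigned back to df, a quick sanity check against the desired vector from the question:
# should print TRUE for the dummy data
all(df$keep == c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1))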
Here is an alternative option using accumulate. You can use your differences and, once they exceed the 12-hour threshold, reset by carrying forward just the diff value (starting over) instead of the cumulative sum. To include the first time for each Encounter, you can either make that first diff 12 hours (as below, via the lag() default), or add a separate mutate that checks where Timestamp == first(Timestamp) and sets keep to 1 in those cases.
library(tidyverse)
thresh <- 12
df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(diff = difftime(Timestamp,
                         lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)),
                         units = "hours"),
         keep = +(accumulate(diff, ~ if_else(.x >= thresh, .y, .x + .y)) >= thresh))
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
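To see the reset logic in isolation, here is accumulate applied to the first six hour gaps from the diff column above, as plain numbers:
library(purrr)

# the running sum restarts from .y whenever the accumulated value has reached 12
accumulate(c(12, 28, 16, 4, 4, 12), ~ if (.x >= 12) .y else .x + .y)
# 12 28 16 4 8 20 -> compared against >= 12 this gives keep = 1 1 1 0 0 1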
Probably missing something, but wouldn't this work:
library(dplyr)
df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_dif = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  filter(time_dif > 12)
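One caveat: lag() leaves the first row of each Encounter as NA, so the first test is silently dropped, and a plain filter on consecutive gaps doesn't implement the rolling reset from the question. A sketch that at least keeps the first test per group:
df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_dif = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  filter(is.na(time_dif) | time_dif >= 12)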
Here is an example data.frame:
require(dplyr)
df <- data.frame(Date.time = c("2015-01-01 00:00:00", "2015-01-01 00:30:00", "2015-01-01 01:00:00", "2015-01-01 01:30:00", "2015-01-01 02:00:00"),
                 RH33HMP = c(99.6, 99.6, 99.5, 99.3, 98.63),
                 RH33HMP_f = c(9, 9, 92, 93, 9),
                 RH38HMP = c(99.6, 99.6, 99.5, 99.3, 98.63),
                 RH38HMP_f = c(9, 902, 9, 9, 91))
I'd like to set every value to NA where the corresponding quality column (_f) contains anything other than 9. First, I grep the indices of the columns with the actual measurements:
col_var <- grep("^Date.|_f$", names(df), invert = TRUE)
Then I use dplyr and mutate_at with an ifelse function. My problem is that mutate_at iterates through all the columns in col_var, but the function itself does not. I tried several examples I found on Stack Overflow, but none of them seem to work.
# does not work
df_qc <- df %>%
mutate_at(.vars = col_var,
.funs = list(~ ifelse(df[, col_var+1] == 9, ., NA)))
i=1
df_qc <- df %>%
mutate_at(.vars = col_var,
.funs = list(~ ifelse(df[, i+1] == 9, ., NA)))
I think I'm quite close; any help is appreciated.
We can use Map, which pairs each quality column (col_var + 1) with the measurement column to its left:
df[col_var] <- Map(function(x, y) { y[x != 9] <- NA; y }, df[col_var + 1], df[col_var])
df
# Date.time RH33HMP RH33HMP_f RH38HMP RH38HMP_f
#1 2015-01-01 00:00:00 99.60 9 99.6 9
#2 2015-01-01 00:30:00 99.60 9 NA 902
#3 2015-01-01 01:00:00 NA 92 99.5 9
#4 2015-01-01 01:30:00 NA 93 99.3 9
#5 2015-01-01 02:00:00 98.63 9 NA 91
Similarly, you can use map2 from purrr if you prefer the tidyverse.
df[col_var] <- purrr::map2(df[col_var + 1], df[col_var], ~ { .y[.x != 9] <- NA; .y })
One dplyr and purrr option could be:
map2_dfr(.x = df %>% select(ends_with("HMP")),
         .y = df %>% select(ends_with("_f")),
         ~ replace(.x, .y != 9, NA)) %>%
  bind_cols(df %>% select(-ends_with("HMP")))
RH33HMP RH38HMP Date.time RH33HMP_f RH38HMP_f
<dbl> <dbl> <fct> <dbl> <dbl>
1 99.6 99.6 2015-01-01 00:00:00 9 9
2 99.6 NA 2015-01-01 00:30:00 9 902
3 NA 99.5 2015-01-01 01:00:00 92 9
4 NA 99.3 2015-01-01 01:30:00 93 9
5 98.6 NA 2015-01-01 02:00:00 9 91
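For newer dplyr (1.0+), where mutate_at is superseded, the same idea can be sketched with across() plus cur_column(), assuming every measurement column has a companion quality column named <col>_f:
library(dplyr)

df_qc <- df %>%
  mutate(across(ends_with("HMP"),
                # look up this column's quality twin and blank out non-9 rows
                ~ ifelse(get(paste0(cur_column(), "_f")) == 9, .x, NA)))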
I have a column of times that have been entered as raw text. An example is below (code for data input at the bottom of the post):
#> id time
#> 1 NA <NA>
#> 2 1 7:50 pm
#> 3 2 7:20 pm
#> 4 3 3:20 pm
I would like to add indicator variables that, for example, show whether the time is:
before 7:30 pm
between 7:00 pm and 7:30 pm
So my desired output would look like this:
#> id time before_1930 between_1900_1930
#> 1 NA <NA> NA NA
#> 2 1 7:50 pm 0 0
#> 3 2 7:20 pm 1 1
#> 4 3 3:20 pm 1 0
So far, I have tried reading in the times with parse_date_time, but this adds on a date:
library(lubridate)
df <- df %>% mutate(time = lubridate::parse_date_time(time, '%I:%M %p'))
df
#> id time
#> 1 NA <NA>
#> 2 1 0000-01-01 19:50:00
#> 3 2 0000-01-01 19:20:00
#> 4 3 0000-01-01 15:20:00
Is there an easy way to work directly with the hours and minutes, and then create the dummy variables I mentioned?
Code for data input
df <- data.frame(
id = c(NA, 1, 2, 3),
time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
)
Try this one:
library(dplyr)
library(lubridate)
data.frame(
  id = c(NA, 1, 2, 3),
  time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
) %>%
  mutate(real_time = lubridate::parse_date_time(time, '%I:%M %p'),
         is_before = case_when(
           hour(real_time) < 19 ~ "Before 19",
           hour(real_time) == 19 & minute(real_time) < 30 ~ "19:00 - 19:30",
           TRUE ~ "After 19:30"
         ))
id time real_time is_before
1 NA <NA> <NA> After 19:30
2 1 7:50 pm 0000-01-01 19:50:00 After 19:30
3 2 7:20 pm 0000-01-01 19:20:00 19:00 - 19:30
4 3 3:20 pm 0000-01-01 15:20:00 Before 19
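Note that the NA row falls into the catch-all branch above. If it should stay NA instead, an is.na() clause can go first (a small tweak, not part of the original answer):
is_before = case_when(
  is.na(real_time) ~ NA_character_,
  hour(real_time) < 19 ~ "Before 19",
  hour(real_time) == 19 & minute(real_time) < 30 ~ "19:00 - 19:30",
  TRUE ~ "After 19:30"
)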
Rather than trying to deal with it as a date/time, use the output from parse_date_time to calculate the number of hours since midnight on 0000-01-01, the dummy date it attaches.
df <- data.frame(
id = c(NA, 1, 2, 3),
time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
)
library(dplyr)
library(lubridate)
df <- df %>% mutate(time = lubridate::parse_date_time(time, '%I:%M %p'),
time = difftime(time,
as.POSIXct("0000-01-01", tz = "UTC"),
units = "hours"),
before_1930 = as.numeric(time < 19.5),
between_1900_1930 = as.numeric(time > 19 & time < 19.5))
df
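An equivalent sketch that works in whole minutes since midnight via hour() and minute(), starting from the original df (the answer above overwrote time with a difftime):
library(dplyr)
library(lubridate)

df %>%
  mutate(parsed = parse_date_time(time, '%I:%M %p'),
         mins = hour(parsed) * 60 + minute(parsed),  # minutes since midnight
         before_1930 = as.numeric(mins < 19 * 60 + 30),
         between_1900_1930 = as.numeric(mins > 19 * 60 & mins < 19 * 60 + 30))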
I have two columns. One is a date:
2011-04-13
2013-07-29
2010-11-23
The other is an hour:
3
22
15
I want to make a new column containing the combined date-time, like this:
2011-04-13 3:00:00
2013-07-29 22:00:00
2010-11-23 15:00:00
I managed to combine them as a string, but when I convert it to a datetime I get only the date; the time disappears. Any idea how to get date and time in one column?
My script:
data <- read.csv("d:\\__r\\hour.csv")
data$date <- as.POSIXct(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "))
As an example, you can use the ymd_hm function from lubridate:
a <- c("2014-09-08", "2014-09-08", "2014-09-08")
b <- c(3, 4, 5)
library(lubridate)
library(tidyverse)
tibble(a, b) %>%
mutate(time = paste0(a, " ", b, "-0"),
time = ymd_hm(time))
output would be:
# A tibble: 3 x 3
a b time
<chr> <dbl> <dttm>
1 2014-09-08 3 2014-09-08 03:00:00
2 2014-09-08 4 2014-09-08 04:00:00
3 2014-09-08 5 2014-09-08 05:00:00
I found that this fixed the problem:
data$date <- as.POSIXct(strptime(paste(data$dteday , paste(data$hr, ":00:00", sep=""), sep=" "), "%Y-%m-%d %H:%M:%S"))
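The strptime() round trip can also be skipped by giving as.POSIXct() the format directly; a sketch assuming dteday looks like "2011-04-13" and hr is an integer hour:
data$date <- as.POSIXct(paste(data$dteday, data$hr), format = "%Y-%m-%d %H", tz = "UTC")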
I have a dataset that looks like this.
id1 = c(1,1,1,1,1,1,1,1,2,2)
id2 = c(3,3,3,3,3,3,3,3,3,3)
lat = c(-62.81559,-62.82330, -62.78693,-62.70136, -62.76476,-62.48157,-62.49064,-62.45838,42.06258,42.06310)
lon = c(-61.15518, -61.14885,-61.17801,-61.00363, -59.14270, -59.22009, -59.32967, -59.04125 ,154.70579, 154.70625)
start_date= as.POSIXct(c('2016-03-24 15:30:00', '2016-03-24 15:30:00','2016-03-24 23:40:00','2016-03-25 12:50:00','2016-03-29 18:20:00','2016-06-01 02:40:00','2016-06-01 08:00:00','2016-06-01 16:30:00','2016-07-29 20:20:00','2016-07-29 20:20:00'), tz = 'UTC')
end_date = as.POSIXct(c('2016-03-24 23:40:00', '2016-03-24 18:50:00','2016-03-25 03:00:00','2016-03-25 19:20:00','2016-04-01 03:30:00','2016-06-02 01:40:00','2016-06-01 14:50:00','2016-06-02 01:40:00','2016-07-30 07:00:00','2016-07-30 07:00:00'),tz = 'UTC')
speed = c(2.9299398, 2.9437502, 0.0220565, 0.0798409, 1.2824859, 1.8685429, 3.7927680, 1.8549291, 0.8140249,0.8287073)
df = data.frame(id1, id2, lat, lon, start_date, end_date, speed)
id1 id2 lat lon start_date end_date speed
1 1 3 -62.81559 -61.15518 2016-03-24 15:30:00 2016-03-24 23:40:00 2.9299398
2 1 3 -62.82330 -61.14885 2016-03-24 15:30:00 2016-03-24 18:50:00 2.9437502
3 1 3 -62.78693 -61.17801 2016-03-24 23:40:00 2016-03-25 03:00:00 0.0220565
4 1 3 -62.70136 -61.00363 2016-03-25 12:50:00 2016-03-25 19:20:00 0.0798409
5 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
6 1 3 -62.48157 -59.22009 2016-06-01 02:40:00 2016-06-02 01:40:00 1.8685429
7 1 3 -62.49064 -59.32967 2016-06-01 08:00:00 2016-06-01 14:50:00 3.7927680
8 1 3 -62.45838 -59.04125 2016-06-01 16:30:00 2016-06-02 01:40:00 1.8549291
9 2 3 42.06258 154.70579 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8140249
10 2 3 42.06310 154.70625 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8287073
The actual dataset is larger. What I would like to do is consolidate this dataset based on the date ranges, grouped by id1 and id2, such that if the date/time range of one row is within 12 hours of the next (ABS(end_date[1] - start_date[2]) < 12 hrs), the rows are consolidated, with the new start_date being the earliest date and the end_date the latest. All other values (lat, lon, speed) are averaged. This is in some sense a 'deduping' effort, since rows within 12 hours of each other actually represent the same 'event'. For the above example the final result would be
id1 id2 lat lon start_date end_date speed
1 1 3 -62.7818 -61.12142 2016-03-24 15:30:00 2016-03-25 19:20:00 1.493897
2 1 3 -62.76476 -59.14270 2016-03-29 18:20:00 2016-04-01 03:30:00 1.2824859
3 1 3 -62.47686 -59.197 2016-06-01 02:40:00 2016-06-02 01:40:00 2.505413
4 2 3 42.06284 154.706 2016-07-29 20:20:00 2016-07-30 07:00:00 0.8213661
With the first four rows consolidated (into row 1), the fifth row left alone (row 2), rows 6-8 consolidated (row 3), and rows 9-10 consolidated (row 4).
I have been trying to do this with dplyr group_by and summarize, but I can't seem to get the date ranges to come out correctly.
Hopefully someone can determine a simple means of solving the problem. Extra points if you know how to do it in SQL ;-) so I can dedupe before even pulling this into R.
Here is a first, very naive implementation. Warning: it is slow, not pretty, and still missing the start and end dates in the output! Note that it expects the rows to be ordered by date and time; if that's not the case in your data set, sort them in R or SQL first. Sorry that I can't think of a dplyr or SQL solution; I'd also like to see those two if anyone has an idea.
dedupe <- function(df) {
counter = 1
temp_vector = unlist(df[1, ])
summarized_df = df[0, c(1, 2, 3, 4, 7)]
colnames(summarized_df) = colnames(df)[c(1, 2, 3, 4, 7)]
summarized_df$counter = NULL
for (i in 2:nrow(df)) {
if (((abs(difftime(df[i, "start_date"], df[i - 1, "end_date"], units = "h")) <
12) ||
abs(difftime(df[i, "start_date"], df[i - 1, "start_date"], units = "h")) <
12) &&
df[i, "id1"] == df[i - 1, "id1"] &&
df[i, "id2"] == df[i - 1, "id2"]) {
#group events because id is the same and time range overlap
#sum up columns and select maximum end_date
temp_vector[c(3, 4, 7)] = temp_vector[c(3, 4, 7)] + unlist(df[i, c(3, 4, 7)])
temp_vector["end_date"] = max(temp_vector["end_date"], df[i, "end_date"])
counter = counter + 1
if (i == nrow(df)) {
#in the last iteration we need to create a new group
summarized_df[nrow(summarized_df) + 1, c(1, 2)] = df[i, c(1, 2)]
summarized_df[nrow(summarized_df), 3:5] = temp_vector[c(3, 4, 7)] / counter
summarized_df[nrow(summarized_df), "counter"] = counter
}
} else {
#new event so we calculate group statistics for temp_vector and reset its value as well as counter
summarized_df[nrow(summarized_df) + 1, c(1, 2)] = df[i, c(1, 2)]
summarized_df[nrow(summarized_df), 3:5] = temp_vector[c(3, 4, 7)] / counter
summarized_df[nrow(summarized_df), "counter"] = counter
counter = 1
temp_vector[c(3, 4, 7)] = unlist(df[i, c(3, 4, 7)])
}
}
return(summarized_df)
}
Function call
> dedupe(df)
id1 id2 lat lon speed counter
5 1 3 -62.78179 -61.12142 1.4938968 4
6 1 3 -62.76476 -59.14270 1.2824859 1
9 2 3 -62.47686 -59.19700 2.5054133 3
10 2 3 42.06284 154.70602 0.8213661 2
This can be easily achieved by using insurancerating::reduce():
df |>
insurancerating::reduce(begin = start_date, end = end_date, id1, id2,
agg_cols = c(lat, lon, speed), agg = "mean",
min.gapwidth = 12 * 3600)
#> id1 id2 index end_date start_date lat lon
#> 1 1 3 0 2016-03-25 19:20:00 2016-03-24 15:30:00 -62.78180 -61.12142
#> 2 1 3 1 2016-04-01 03:30:00 2016-03-29 18:20:00 -62.76476 -59.14270
#> 3 1 3 2 2016-06-02 01:40:00 2016-06-01 02:40:00 -62.47686 -59.19700
#> 4 2 3 0 2016-07-30 07:00:00 2016-07-29 20:20:00 42.06284 154.70602
#> speed
#> 1 1.4938969
#> 2 1.2824859
#> 3 2.5054133
#> 4 0.8213661
Created on 2022-06-13 by the reprex package (v2.0.1)
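For a dplyr-only sketch of the same consolidation (not taken from either answer above): flag a new event whenever a row's start is more than 12 hours past the running maximum end time within id1/id2, turn the flags into event ids with cumsum(), then summarise. cummax() needs the numeric representation of the POSIXct values:
library(dplyr)

df %>%
  group_by(id1, id2) %>%
  arrange(start_date, .by_group = TRUE) %>%
  # latest end time seen so far in the group, lagged so each row compares against earlier rows
  mutate(prev_end = lag(cummax(as.numeric(end_date))),
         event = cumsum(is.na(prev_end) |
                          as.numeric(start_date) > prev_end + 12 * 3600)) %>%
  group_by(id1, id2, event) %>%
  summarise(start_date = min(start_date), end_date = max(end_date),
            lat = mean(lat), lon = mean(lon), speed = mean(speed),
            .groups = "drop")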