I have data in the following format:
DATE GROUP EVENT ELIGIBLE
2021-3-9 A 1 1
2021-3-1 A 0 0
2021-3-1 B 0 1
2021-2-20 B 1 1
I would like to group the data by the GROUP column and then add three new columns that calculate by group the sum of (EVENT / ELIGIBLE) for the following time frames. Last 3 months, 3 months back to six months back, and the last year.
I have calculated the overall percentage without separate timeframes by doing the following:
grouped <- data %>%
filter(ELIGIBLE == 1 ) %>%
group_by(GROUP) %>%
mutate(count_Eligible = sum(ELIGIBLE == 1 )) %>%
mutate(count_events = sum(EVENT == 1 )) %>%
mutate(Percentage = round(100*count_events/count_Eligible,2))
I am wondering what the cleanest way would be to add the three different percentages within the timeframes. So far I have pulled the dates to do the filtering with the following code:
today <- Sys.Date()
three_month_lookback <- as.Date(today) - months(3)
six_month_lookback <- as.Date(today) - months(6)
one_year_lookback <- as.Date(today) - months(12)
We can create a function to do the calculation
library(dplyr)
library(purrr)
f1 <- function(data) {
data %>%
filter(ELIGIBLE == 1 ) %>%
group_by(GROUP) %>%
transmute(count_Eligible = sum(ELIGIBLE == 1 ),
count_events = sum(EVENT == 1 ),
Percentage = round(100*count_events/count_Eligible,2))
}
Then, loop over the 'lookback' periods, subset the data based on the 'DATE' column and apply the function
map2_dfr(list(three_month_lookback, six_month_lookback,
one_year_lookback) list(today(), three_month_lookback, today()),
~ data %>%
mutate(DATE = as.Date(DATE)) %>%
filter(DATE >= .x, DATE <= .y) %>%
f1(.), .id = 'grp'
)
If we need to combine by columns
map2(list(three_month_lookback, six_month_lookback,
one_year_lookback) list(today(), three_month_lookback, today()),
~ data %>%
mutate(DATE = as.Date(DATE)) %>%
filter(DATE >= .x, DATE <= .y) %>%
f1(.)
) %>%
reduce(full_join, by = "GROUP")
Related
I have a dataset on a group of individuals that was collected starting at different times for each individual.
I need to subset the data from 1 year since their first entry, like so: myData[myDate >= "first entry" & myDate += "1 year"]
Example data:
df_date <- data.frame( Name = c("Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim",
"Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue"),
Dates = c("2010-1-1", "2010-2-2", "2010-3-5","2010-4-17","2010-5-20",
"2010-6-29","2010-7-6","2010-8-9","2010-9-16","2010-10-28","2010-11-16","2010-12-28","2011-1-16","2011-2-28",
"2010-4-1", "2010-5-2", "2010-6-5","2010-7-17","2010-8-20",
"2010-9-29","2010-10-6","2010-11-9","2012-12-16","2011-1-28","2011-2-28","2011-3-28","2011-2-28","2011-3-28"),
Event = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1) )
The desired output would be Jim would have data from 1/1/2010 - 12/28/2010 and Sue from 4/4/2010 - 3/28/2011 and so on. The actual dataset had > 20 samples, all starting at different times.
Use a combination of tidyverse and lubridate functions:
library(tidyverse)
library(lubridate)
df_date %>%
mutate(Dates = as_datetime(Dates)) %>%
group_by(Name) %>%
arrange(Dates, .by_group = T) %>%
filter(Dates <= first(Dates) + duration(1, units = "year"))
Similar to Martin C. Arnold's answer, I got another answer based on dplyr and lubridate. min(Dates) + years(1) means add one year to the minimum date.
library(dplyr)
library(lubridate)
df_date2 <- df_date %>%
mutate(Dates = ymd(Dates)) %>%
group_by(Name) %>%
filter(Dates <= min(Dates) + years(1)) %>%
ungroup()
A data wrangling question:
I have a dataframe of hourly animal tracking points with columns for id, time, and whether the animal is on land or in water (0 = water; 1 = land). It looks something like this:
set.seed(13)
n <- 100
dat <- data.frame(id = rep(1:5, each = 10),
datetime=seq(as.POSIXct("2020-12-26 00:00:00"), as.POSIXct("2020-12-30 3:00:00"), by = "hour"),
land = sample(0:1, n, replace = TRUE))
What I need to do is flag the first row after which the animal uses land at least once for 3 straight days. I tried doing something like this:
dat$ymd <- ymd(dat$datetime[1]) # make column for year-month-day
# add land points within each id group
land.pts <- dat %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
drop_na(land) %>%
mutate(all.land = cumsum(land))
#flag days that have any land points
flag <- land.pts %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
slice(n()) %>%
mutate(flag = if_else(all.land == 0,0,1))
# Combine flagged dataframe with full dataframe
comb <- left_join(land.pts, flag)
comb[is.na(comb)] <- 1
and then I tried this:
x = comb %>%
group_by(id) %>%
arrange(id, datetime) %>%
mutate(time.land=ifelse(land==0 | is.na(lag(land)) | lag(land)==0 | flag==0,
0,
difftime(datetime, lag(datetime), units="days")))
But I still can't quite wrap my head around what to do to make it so that I can figure out when the animal has been on land at least once for three days straight, and then flag that first point on land. Thanks so much for any help you can provide!
Create a date column from the timestamp. Summarise the data and keep only 1 row for each id and date which shows whether the animal was on land even once in the entire day.
Use zoo's rollapply function to mark the first day as TRUE if the next 3 days the animal was on land.
library(dplyr)
library(zoo)
dat <- dat %>% mutate(date = as.Date(datetime))
dat %>%
group_by(id, date) %>%
summarise(on_land = any(land == 1)) %>%
mutate(consec_three = rollapply(on_land, 3,all, align = 'left', fill = NA)) %>%
ungroup %>%
#If you want all the rows of the data
left_join(dat, by = c('id', 'date'))
I have 2 codes that manipulate and filter (by date) my data.frame and that work perfectly. Now I want to run the code for not only one day, but for every day in vector:
seq(from=as.Date('2020-03-02'), to=Sys.Date(),by='days')` #.... 538 days
The code I want to run for all the days between 2020-03-02 and today is:
KOKOKO <- data.frame %>%
filter(DATE < '2020-03-02')%>%
summarize(DATE = '2020-03-02', CZK = sum(Objem.v.CZK,na.rm = T)
STAVPTF <- data.frame %>%
filter (DATE < '2020-03-02')%>%
group_by(CP) %>%
summarize(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), DATE = '2020-03-02') %>%
select(DATE,CP,mnozstvi) %>%
rbind(KOKOKO)%>%
drop_na() %>%
So instead of '2020-03-02' I want to fill in all days since '2020-03-02' one after another. And each of the KOKOKO and STAVPTF created for the unique day like this I want to save as a separate data.frame and all of them store in a list.
We could use map to loop over the sequence and apply the code
library(dplyr)
library(purrr)
out <- map(s1, ~ data.frame %>%
filter(DATE < .x)%>%
summarize(DATE = .x, CZK = sum(Objem.v.CZK,na.rm = TRUE))
As this is repeated cycle, a function would make it cleaner
f1 <- function(dat, date_col, group_col, Objem_col, aktualni_col, date_val) {
filtered <- dat %>%
filter({{date_col}} < date_val)
KOKOKO <- filtered %>%
summarize({{date_col}} := date_val,
CZK = sum({{Objem_col}}, na.rm = TRUE)
STAVPTF <- filtered %>%
group_by({{group_col}}) %>%
summarize(mnozstvi = last({{aktualni_col}}),
{{date_col}} := date_val) %>%
select({{date_col}}, {{group_col}}, mnozstvi) %>%
bind_rows(KOKOKO)%>%
drop_na()
return(STAVPTF)
}
and call as
map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, !!.x))
where
s1 <- seq(from=as.Date('2020-03-02'), to=Sys.Date(), by='days')
It would be easier to answer your question, if you would provide a minimal reproducible example. It's easy done with tidyverses reprex packages
However, your KOKOKO code can be rewritten as simple cumulative sum:
KOKOKO =
data.frame %>%
arrange(DATE) %>% # if necessary
group_by(DATE) %>%
summarise(CZK = sum(Objem.v.CZK), .groups = 'drop') %>% # summarise per DATE (if necessary)
mutate(CZK = cumsum(CZK) - CZK) # cumulative sum excluding current row (current DATE)
Even STAVPTF code can probably be rewritten without iterations. First find the last value of AKTUALNI_MNOZSTVI_AKCIE per CP and DATE. Then this value is assigned to the next DATE:
STAVPTF <-
data.frame %>%
group_by(CP, DATE) %>%
summarise(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), .groups='drop_last') %>%
arrange(DATE) %>% # if necessary
mutate(DATE = lead(DATE))
I have some time series data where I use the rolling_origin function to apply different time series splits to the data which generates a number of lists. The time series starts from 2020-03-01 until 2020-10-30.
I want to start from 2020-04-15 such that I have 1 month before (2020-03-15) and 1 month after (2020-05-15). I can use an ifelse statement to add a 1 for observations after and a 0 for observations before.
rolledData %>%
map(., ~mutate(.x,
treatment_control = ifelse(date >= as.Date("2020-04-15"), 1, 0)
))
But what I want to do is to increment the ifelse date when mapped over the list. So the first one might start on the 2020-04-15 but in the next list in the sequence it would be changed to 2020-04-16, and the next list 2020-04-17, .... , until the end.
I could manually write out the results:
lst1 <- rolledData[[12]] %>%
mutate(
treatment_control = ifelse(date >= as.Date("2020-04-15"), 1, 0)
)
lst2 <- rolledData[[13]] %>%
mutate(
treatment_control = ifelse(date >= as.Date("2020-04-16"), 1, 0)
)
lst3 <- rolledData[[14]] %>%
mutate(
treatment_control = ifelse(date >= as.Date("2020-04-17"), 1, 0)
)
How can I map over the list and increment the treatment_control mutate?
Note: Because I am using financial data (which was just the easiest to obtain for a reproducible example) the weekends are removed (in my data I have a full week)
Data:
library(tidyquant)
library(rsample)
data <- tq_get(c("AAPL"),
get = "stock.prices",
from = "2020-03-01",
to = "2020-10-30")
rolledData <- data %>%
rolling_origin(
data = .,
initial = 60, # 2 months of data
assess = 0,
cumulative = FALSE,
skip = 0
)
rolledData <- rolledData$splits %>%
map(., ~analysis(.x))
If the dates are different, we can pass a vector of custom dates that have the same length as the rolledData in map2
library(dplyr)
library(purrr)
rolleData2 <- rolledData %>%
map2(., newdates,
~ .x %>%
mutate(treatment_control = +(date >= .y)
))
where
newdates <- seq(as.Date("2020-03-15"), length.out = length(rolledData), by = "1 day")
If it is based on the next month from the first 'date' value
library(lubridate)
rolledData2 <- rolledData %>%
map(~ .x %>%
mutate(treatment_control =
+(date >= (first(date) %m+% months(1)))))
I have a data frame with COVID data and I'm trying to make a column calculating the number of recovered people based off of the number of positive tests.
My data has a location, a date, and the number of tests administered/positive results/negative results each day. Here's a few lines using one location as an example (the real data has several months worth of dates):
loc date tests pos neg active
spot1 2020-04-10 1 1 0 5
spot1 2020-04-11 2 1 1 6
spot1 2020-04-12 0 0 0 6
spot1 2020-04-13 11 1 10 7
I want to make a new column that cumulatively counts each positive test in each location 14 days after it is recorded. On 2020-04-24, the 5 active classes are not active anymore, so I want a recovered column with 5. For each date I want the newly nonactive cases to be added.
My first thought was to try it in a loop:
df1 <- df %>%
mutate(date = as.Date(date)) %>%
group_by(loc) %>%
mutate(rec = for (i in 1:nrow(df)) {
#getting number of new cases
x <- df$pos[i]
#add 14 days to the date
d <- df$date + 14
df$rec <- sum(x)
})
As you can see, I'm not the best at writing for loops. That gives me a bunch of numbers, but bear very little meaningful relationship to the data.
Also tried it with map_dbl:
df1 <- df %>%
mutate(date = as.Date(date)) %>%
group_by(loc) %>%
mutate(rec = map_dbl(date, ~sum(pos[(date <= . + 14) & date >= .])))
Which resulted in the same number printed on the entire rec column.
Any suggestions? (Sorry for the lengthy explanation, just want to make sure this all makes sense)
Your sample data shows that -
you have all continuous dates despite 0 tests (12 April)
Active column seems like already a cumsum
Therefore I think you can simply use lag function with argument 14
example code
df %>% group_by(loc) %>% mutate(recovered = lag(active, 14)) %>% ungroup()
You could use aggregate to sum the specific column and then applying
cut in order to set a 14 day time frame for each sum:
df <- data.frame(loc = rep("spot1", 30),
date = seq(as.Date('2020-04-01'), as.Date('2020-04-30'),by = 1),
test = seq(1:30),
positive = seq(1:30),
active = seq(1:30))
output <- aggregate(positive ~ cut(date, "14 days"), df, sum)
output
Console output:
cut(date, "14 days") positive
1 2020-04-01 105
2 2020-04-15 301
3 2020-04-29 59
my solution:
library(dplyr)
date_seq <- seq(as.Date("2020/04/01"), by = "day", length.out = 30)
pos <- rpois(n = 60, lambda = 10)
mydf <-
data.frame(loc = c(rep('loc1', 30), rep('loc2', 30)),
date = date_seq,
pos = pos)
head(mydf)
getPosSum <- function(max, tbl, myloc, daysBack = 14) {
max.Date <- as.Date(max)
sum(tbl %>%
filter(date >= max.Date - (daysBack - 1) &
date <= max.Date & loc == myloc) %>%
select(pos))
}
result <-
mydf %>%
group_by(date, loc) %>%
mutate(rec = getPosSum(max = date, tbl = mydf, myloc = loc))
library(tidyverse)
library(lubridate)
data %>%
mutate(date = as_date(date),
cut = cut(date, '14 days') %>%
group_by(loc) %>%
arrange(cut) %>%
mutate(cum_pos = accumulate(pos, `+`)) # accumulate(pos, sum) should also work
As a general rule of thumb, avoid loops, especially within mutate - that won't work. Instead of map_dbl you should check out purrr::accumulate. There's specialized functions for this in R's base library such as cumsum and cummin but their behavior is a lot less predictable in relation to purrr's.