Select Data - First entry + set time period (1 year) R - r

I have a dataset on a group of individuals that was collected starting at different times for each individual.
I need to subset the data from 1 year since their first entry, like so: myData[myDate >= "first entry" & myDate += "1 year"]
Example data:
df_date <- data.frame( Name = c("Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim","Jim",
"Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue","Sue"),
Dates = c("2010-1-1", "2010-2-2", "2010-3-5","2010-4-17","2010-5-20",
"2010-6-29","2010-7-6","2010-8-9","2010-9-16","2010-10-28","2010-11-16","2010-12-28","2011-1-16","2011-2-28",
"2010-4-1", "2010-5-2", "2010-6-5","2010-7-17","2010-8-20",
"2010-9-29","2010-10-6","2010-11-9","2012-12-16","2011-1-28","2011-2-28","2011-3-28","2011-2-28","2011-3-28"),
Event = c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1) )
The desired output would be Jim would have data from 1/1/2010 - 12/28/2010 and Sue from 4/4/2010 - 3/28/2011 and so on. The actual dataset had > 20 samples, all starting at different times.

Use a combination of tidyverse and lubridate functions:
library(tidyverse)
library(lubridate)
df_date %>%
mutate(Dates = as_datetime(Dates)) %>%
group_by(Name) %>%
arrange(Dates, .by_group = T) %>%
filter(Dates <= first(Dates) + duration(1, units = "year"))

Similar to Martin C. Arnold's answer, I got another answer based on dplyr and lubridate. min(Dates) + years(1) means add one year to the minimum date.
library(dplyr)
library(lubridate)
df_date2 <- df_date %>%
mutate(Dates = ymd(Dates)) %>%
group_by(Name) %>%
filter(Dates <= min(Dates) + years(1)) %>%
ungroup()

Related

Finding the first row after which x rows meet some criterium in R

A data wrangling question:
I have a dataframe of hourly animal tracking points with columns for id, time, and whether the animal is on land or in water (0 = water; 1 = land). It looks something like this:
set.seed(13)
n <- 100
dat <- data.frame(id = rep(1:5, each = 10),
datetime=seq(as.POSIXct("2020-12-26 00:00:00"), as.POSIXct("2020-12-30 3:00:00"), by = "hour"),
land = sample(0:1, n, replace = TRUE))
What I need to do is flag the first row after which the animal uses land at least once for 3 straight days. I tried doing something like this:
dat$ymd <- ymd(dat$datetime[1]) # make column for year-month-day
# add land points within each id group
land.pts <- dat %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
drop_na(land) %>%
mutate(all.land = cumsum(land))
#flag days that have any land points
flag <- land.pts %>%
group_by(id, ymd) %>%
arrange(id, datetime) %>%
slice(n()) %>%
mutate(flag = if_else(all.land == 0,0,1))
# Combine flagged dataframe with full dataframe
comb <- left_join(land.pts, flag)
comb[is.na(comb)] <- 1
and then I tried this:
x = comb %>%
group_by(id) %>%
arrange(id, datetime) %>%
mutate(time.land=ifelse(land==0 | is.na(lag(land)) | lag(land)==0 | flag==0,
0,
difftime(datetime, lag(datetime), units="days")))
But I still can't quite wrap my head around what to do to make it so that I can figure out when the animal has been on land at least once for three days straight, and then flag that first point on land. Thanks so much for any help you can provide!
Create a date column from the timestamp. Summarise the data and keep only 1 row for each id and date which shows whether the animal was on land even once in the entire day.
Use zoo's rollapply function to mark the first day as TRUE if the next 3 days the animal was on land.
library(dplyr)
library(zoo)
dat <- dat %>% mutate(date = as.Date(datetime))
dat %>%
group_by(id, date) %>%
summarise(on_land = any(land == 1)) %>%
mutate(consec_three = rollapply(on_land, 3,all, align = 'left', fill = NA)) %>%
ungroup %>%
#If you want all the rows of the data
left_join(dat, by = c('id', 'date'))

Manipulating data.frame while using cycles and storing values in a list

I have 2 codes that manipulate and filter (by date) my data.frame and that work perfectly. Now I want to run the code for not only one day, but for every day in vector:
seq(from=as.Date('2020-03-02'), to=Sys.Date(),by='days')` #.... 538 days
The code I want to run for all the days between 2020-03-02 and today is:
KOKOKO <- data.frame %>%
filter(DATE < '2020-03-02')%>%
summarize(DATE = '2020-03-02', CZK = sum(Objem.v.CZK,na.rm = T)
STAVPTF <- data.frame %>%
filter (DATE < '2020-03-02')%>%
group_by(CP) %>%
summarize(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), DATE = '2020-03-02') %>%
select(DATE,CP,mnozstvi) %>%
rbind(KOKOKO)%>%
drop_na() %>%
So instead of '2020-03-02' I want to fill in all days since '2020-03-02' one after another. And each of the KOKOKO and STAVPTF created for the unique day like this I want to save as a separate data.frame and all of them store in a list.
We could use map to loop over the sequence and apply the code
library(dplyr)
library(purrr)
out <- map(s1, ~ data.frame %>%
filter(DATE < .x)%>%
summarize(DATE = .x, CZK = sum(Objem.v.CZK,na.rm = TRUE))
As this is repeated cycle, a function would make it cleaner
f1 <- function(dat, date_col, group_col, Objem_col, aktualni_col, date_val) {
filtered <- dat %>%
filter({{date_col}} < date_val)
KOKOKO <- filtered %>%
summarize({{date_col}} := date_val,
CZK = sum({{Objem_col}}, na.rm = TRUE)
STAVPTF <- filtered %>%
group_by({{group_col}}) %>%
summarize(mnozstvi = last({{aktualni_col}}),
{{date_col}} := date_val) %>%
select({{date_col}}, {{group_col}}, mnozstvi) %>%
bind_rows(KOKOKO)%>%
drop_na()
return(STAVPTF)
}
and call as
map(s1, ~ f1(data.frame, DATE, CP, Objem.v.CZK, AKTUALNI_MNOZSTVI_AKCIE, !!.x))
where
s1 <- seq(from=as.Date('2020-03-02'), to=Sys.Date(), by='days')
It would be easier to answer your question, if you would provide a minimal reproducible example. It's easy done with tidyverses reprex packages
However, your KOKOKO code can be rewritten as simple cumulative sum:
KOKOKO =
data.frame %>%
arrange(DATE) %>% # if necessary
group_by(DATE) %>%
summarise(CZK = sum(Objem.v.CZK), .groups = 'drop') %>% # summarise per DATE (if necessary)
mutate(CZK = cumsum(CZK) - CZK) # cumulative sum excluding current row (current DATE)
Even STAVPTF code can probably be rewritten without iterations. First find the last value of AKTUALNI_MNOZSTVI_AKCIE per CP and DATE. Then this value is assigned to the next DATE:
STAVPTF <-
data.frame %>%
group_by(CP, DATE) %>%
summarise(mnozstvi = last(AKTUALNI_MNOZSTVI_AKCIE), .groups='drop_last') %>%
arrange(DATE) %>% # if necessary
mutate(DATE = lead(DATE))

Calculate average of variable between two dates based on a different date column

I have a data set similar to the xdata tibble below. I'd like to calculate the average of the values variable when the month of the start date is different than the month of date, between dates defined by start date and start date five days after.
[EDITED FOR CLARITY]
library(lubridate)
library(tibble)
xdata <- tibble(date=ymd('2015-01-01')+days(seq(1:(365*3))), values=seq(1,365*3))
xdata <- xdata %>% mutate(start_date = case_when(wday(date)==2 ~ date+days(14))) %>%
fill(start_date)
xdata %>% mutate(avg = case_when(month(start_date) != month(date) ~ mean(values[between(date, start_date, start_date+days(5))])))
I've also tried
xdata %>% group_by(start_date) %>% mutate( . . .
but that does not solve the problem.
What I'm expecting is the following:
The first non-NA start_date occurs in row 4 and is start_date='2015-01-19'
I'd like to calculate the mean of the values (the column of data) for date='2015-01-19' and the next 4 days. Those values are 18,19,20,21 and 22. The mean should be 20.
What is the best way to calculate the average of values between dates defined by a range based on start_date?
Thanks,
jfd118
ANSWER -
Jon Spring led me to this being the solution:
xdata %>%
filter(!is.na(start_date)) %>%
mutate(avg = slide_index_dbl(values, date, mean, .before=1, .after = 5)) %>%
select(start_date = date, avg) %>%
right_join(xdata ) %>%
select(date, values, everything()) %>%
arrange(date) %>% View()
Here's an approach using slider, which is great for windowed calcs like this. I couldn't figure out how to skip the slider calc for NA start_date rows, so this filters them out and then brings them back in with a join:
library(slider); library(lubridate); library(dplyr)
xdata %>%
filter(!is.na(start_date), month(start_date) != month(date)) %>%
mutate(avg = slide_index_dbl(values, start_date, .after = 4, mean)) %>%
right_join(xdata) %>% arrange(date)
EDIT:
I think I understand now. Here I'm calculating the "current + next 4 days avg" using date, and then renaming that start_date to join to the original data.
library(dplyr); library(slider)
xdata %>%
filter(!is.na(start_date)) %>%
mutate(avg = slide_index_dbl(values, date, .after = 4, mean)) %>%
select(start_date = date, avg) %>%
right_join(xdata ) %>%
select(date, values, everything()) %>%
arrange(date) %>% View()
This places a NA in the rows with the month of the start date being the same as the month of date. Note that in your toy dataset the days are always spaced 7 days apart, so the code doesn't actually capture any days in the next five days.
val=numeric()
for (i in 1:nrow(xdata)) {
if (is.na(xdata$start_date[i]) | month((xdata$date)[i])==month((xdata$start_date)[i])) {
val[i]=NA
} else {
dat=filter(xdata, start_date >= (xdata$start_date)[i] & start_date<= (xdata$start_date)[i]+5)
val[i]=mean(dat$values)
}
}
xdata2=mutate(xdata, val)

summarize weekly average using daily data in R

How to add one column price.wk.average to the data such that price.wk.average is equal to the average price of last week, and also add one column price.mo.average to the data such that it equals to the average price of last month? The price.wk.average will be the same for the entire week.
Dates Price Demand Price.wk.average Price.mo.average
2010-1-1 x x
2010-1-2 x x
......
2015-1-1 x x
jkl,
try to post reproducible examples. It will make it easier to help you. you can use dplyr:
library(dplyr)
df <- data.frame(date = seq(as.Date("2017-1-1"),by="day",length.out = 100), price = round(runif(100)*100+50,0))
df <- df %>%
group_by(week = week(date)) %>%
mutate(Price.wk.average = mean(price)) %>%
ungroup() %>%
group_by(month = month(date)) %>%
mutate(Price.mo.average = mean(price))
(Since I don't have enough points to comment)
I wanted to point out that Eric's answer will not distinguish average weekly price by year. Therefore, if you are interested in unique weeks (Week 1 of 2012 != Week 1 of 2015 ), you will need to do extra work to group by unique weeks.
df <- data.frame( Dates = c("2010-1-1", "2010-1-2", "2015-01-3"),
Price = c(50, 20, 40) )
Dates Price
1 2010-1-1 50
2 2010-1-2 20
3 2015-01-3 40
Just to keep your data frame tidy, I suggest converting dates to POSIX format then sorting the data frame:
library(lubridate)
df <- df %>%
mutate(Dates = lubridate::parse_date_time(Dates,"ymd")) %>%
arrange( Dates )
To group by unique weeks:
df <- df %>%
group_by( yw = paste( year(Dates), week(Dates)))
Then mutate and ungroup.
To group by unique months:
df <- df %>%
group_by( ym = paste( year(Dates), month(Dates)))
and mutate and ungroup.

lubridate - select first non-Monday of every week.

Having a tibble of financial data, I would like to filter it by only selecting the first non-Monday of every week. Usually it will be a Tuesday, but sometimes it can be a Wednesday if Tuesday is a Holiday.
Here is my code that works in most cases
XLF <- quantmod::getSymbols("XLF", from = "2000-01-01", auto.assign = FALSE)
library(tibble)
library(lubridate)
library(dplyr)
xlf <- as_tibble(XLF) %>% rownames_to_column(var = "date") %>%
select(date, XLF.Adjusted)
xlf$date <- ymd(xlf$date)
# We create Month, Week number and Days of the week columns
# Then we remove all the Mondays
xlf <- xlf %>% mutate(Year = year(date), Month = month(date),
IsoWeek = isoweek(date), WDay = wday(date)) %>%
filter(WDay != 2)
# Creating another tibble just for ease of comparison
xlf2 <- xlf %>%
group_by(Year, IsoWeek) %>%
filter(row_number() == 1) %>%
ungroup()
That said, there are some issues that I have not been able to solve so far.
The issue is for instance that it is skipping "2002-12-31" which is a Tuesday because it is considered as part of the first ISO week of 2003.
There are a few similar issues.
My question is how could I select of the first non-Monday of every week without such issues while staying in the tidyverse (ie. not having to use xts / zoo class)?
You can create a consistently increasing week number yourself. Perhaps not the most elegant solution but it works fine for me.
as_tibble(XLF) %>%
rownames_to_column(var = "date")%>%
select(date, XLF.Adjusted)%>%
mutate(date = ymd(date),
Year = year(date),
Month = month(date),
WDay = wday(date),
WDay_label = wday(date, label = T))%>%
# if the weekday number is higher in the line above or
# if the date in the previous line is more than 6 days ago
# the week number should be incremented
mutate(week_increment = (WDay < lag(WDay) | difftime(date, lag(date), unit = 'days') > 6))%>%
# the previous line causes the first element to be NA due to
# the fact that the lag function can't find a line above
# we correct this here by setting the first element to TRUE
mutate(week_increment = ifelse(row_number() == 1,
TRUE,
week_increment))%>%
# we can sum the boolean elements in a cumulative way to get a week number
mutate(week_number = cumsum(week_increment))%>%
filter(WDay != 2)%>%
group_by(Year, week_number) %>%
filter(row_number() == 1)

Resources