I'm looking for a way to tramfor the df into the dfres.
Dfres is
obj <- date #where type == I5,
min <- min(date) #where type == I6,
max <- max(date) #where type == I6,
all of this grouped by year.
year <- c('2014','2015','2016','2017','2014','2015','2016','2017','2016','2014','2015')
type <- c('I6','I6','I6','I6','I6','I6','I6','I6','I5','I5','I5')
date <- c('2014-06-03','2015-08-01','2016-06-01','2017-05-15',
'2014-04-11','2015-03-14','2016-03-17','2017-03-08','2016-11-05',
'2014-09-04','2015-05-01')
df <- data.frame(year,type,date)
year <- c('2014','2015','2016','2017')
obj <- c('2014-09-04','2015-05-01','2016-11-05',NA)
min <- c('2014-04-11','2015-03-14','2016-03-17','2017-03-08')
max <- c('2014-06-03', '2015-08-01','2016-06-01','2017-05-15')
dfres <- data.frame(year,obj,min,max)
If anyone can help me, not to prepare the data in order to solve this one way around but an "easy" way throwing a sentence, I would be graceful.
An idea using dplyr would be,
library(dplyr)
df %>%
filter(type == 'I6') %>%
group_by(year) %>%
summarise(min_d = min(date), max_d = max(date)) %>%
full_join(df[df$type == 'I5',], ., by = 'year') %>%
select(-type) %>%
arrange(year)
# year date min_d max_d
#1 2014 2014-09-04 2014-04-11 2014-06-03
#2 2015 2015-05-01 2015-03-14 2015-08-01
#3 2016 2016-11-05 2016-03-17 2016-06-01
#4 2017 <NA> 2017-03-08 2017-05-15
A data.table approach would be:
library(data.table)
setDT(df)
i5 <- df[type == 'I5', .(obj = date), by = year]
i6 <- df[type == 'I6', .(min = min(as.Date(date)), max = max(as.Date(date))), by = year]
dfres <- merge(i5, i6, by = 'year', all = TRUE)
Related
Apologies I am not sure this warrants another question. But am basing this off of another questions (Group by ID and Outcome and take the earliest earliest Dates of specific outcomes and assign numbers (i.e outcome1, outcome2)).
mydata = data.frame (Id =c (1,1,1,1,1,1,1,1,2,2,2,2),
Date = c("2001-01-31", "2001-02-13","2001-05-31",
"2001-06-02","2018-01-31","2018-03-31","2018-07-31",
"2019-04-04","2014-01-31","2014-02-02","2014-04-31",
"2014-05-18"),Outcome = c("Relapse","CR","Relapse","Relapse",
"CR","CR","CR","Relapse","CR", "CR","Relapse","CR"))
Which outputs the below.
Id Date Outcome
1 2001-01-31 Relapse
1 2001-02-13 CR
1 2001-05-31 Relapse
1 2001-06-02 Relapse
1 2018-01-31 CR
1 2018-03-31 CR
1 2018-07-31 CR
1 2019-04-04 Relapse
2 2014-01-31 CR
2 2014-02-02 CR
2 2014-04-31 Relapse
2 2014-05-18 CR
As you can see each patient is in certain phases at different times and I would like to capture the earliest dates of when each new phase starts per patient. I would like then to rename these phases to CR1, Relapse1, CR2, Relapse2 and so forth. But I would only like to start naming AFTER patient achieves their first CR1. So if a patient has a relapse before CR, those would be ignored.
Im hoping to get an output like this:
Id CR1 Relapse1 CR2 Relapse2
1 2001-02-13 2001-05-31 2018-01-31 2019-04-04
2 2014-01-31 2014-04-31 2014-05-18 NA
As you see patient 1 had a relapse at 2001-01-31, but was ignored because it occured before the first CR1.
The following code was so helpful to the previous question, but was wondering if I can modify it so that it will start the counting after the first CR? Thank you!!
mydata %>%
group_by(Id) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
slice(1) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
Using data.table package
library(data.table)
mydata <- setDT(mydata)
# identify consecutive same Outcome for each Id
mydata[, flag:=rleid(Id, Outcome)]
# collapse identified consecutive values (keep min date)
tmp <- mydata[, by="flag,Id,Outcome", .(Date=min(Date))]
# mark relapse if first
tmp <- tmp[, by=Id,
flag := (Outcome[1]=="Relapse") & (seq_len(.N)==1)
]
# remove marked relapse
tmp <- tmp[(!flag)]
# Outcome numbering
tmp[, by="Id,Outcome", n:=seq_len(.N)]
# cast (widen)
dcast(tmp, Id~paste0(Outcome, n), value.var = "Date")
Got
Id CR1 CR2 Relapse1 Relapse2
1: 1 2001-02-13 2018-01-31 2001-05-31 2019-04-04
2: 2 2014-01-31 2014-05-18 2014-04-31 <NA>
using %>% notation
mydata[, flag:=rleid(Id,Outcome)] %>%
.[, by="flag,Id,Outcome", .( Date=min(Date) )] %>%
.[, by="Id", flag := (Outcome[1]=="Relapse") & (seq_len(.N)==1)] %>%
.[(!flag)] %>%
.[, by="Id,Outcome", n := seq_len(.N)] %>%
dcast(., Id ~ paste0(Outcome, n), value.var = "Date")
That is my approach :
library(data.table)
my_df <- data.frame (Id =c (1,1,1,1,1,1,1,1,2,2,2,2),
Date = c("2001-01-31", "2001-02-13","2001-05-31",
"2001-06-02","2018-01-31","2018-03-31","2018-07-31",
"2019-04-04","2014-01-31","2014-02-02","2014-04-31",
"2014-05-18"),
Outcome = c("Relapse","Relapse","Relapse","Relapse",
"CR","CR","CR","Relapse","CR", "CR","Relapse","CR"),
stringsAsFactors = FALSE)
my_df <- my_df %>% group_by(Id) %>% arrange(Id, Date)
my_df <- my_df %>%
group_by(Id) %>%
mutate(Value = seq_along(Outcome)) %>%
mutate(first_v = Outcome == "CR" & !duplicated(Outcome == "CR")) %>%
mutate(first_a = first_v == "FALSE" & Value > Value[Outcome == "CR" & !duplicated(Outcome == "CR")] | Outcome == "CR") %>%
filter(first_a == "TRUE")
my_df %>%
group_by(Id) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
You could try adding:
filter(Date >= first(Date[Outcome == "CR"]))
to filter out rows before the first "CR". This assumes your Date is sorted/arranged first.
library(tidyverse)
library(data.table)
mydata %>%
group_by(Id) %>%
filter(Date >= first(Date[Outcome == "CR"])) %>%
mutate(Grp = rleid(Outcome)) %>%
group_by(Grp, .add = T) %>%
slice(1) %>%
group_by(Id, Outcome) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = Id, names_from = c(Outcome, n), values_from = Date)
Output
Id CR_1 Relapse_1 CR_2 Relapse_2
<dbl> <chr> <chr> <chr> <chr>
1 1 2001-02-13 2001-05-31 2018-01-31 2019-04-04
2 2 2014-01-31 2014-04-31 2014-05-18 NA
I have a continuous list of dates (yyyy-mm-dd) from 1985 to 2018 in one column (Colname = date). What I wish to do is generate another column which outputs a water season and year given the date.
To make it clearer I have two water season:
Summer = yyyy-04-01 to yyyy-09-31;
Winter = yyyy-10-01 to yyyy(+1)-03-31.
So for 2018 - Summer = 2018-04-01 to 2018-09-31; Winter 2018-10-01 to 2019-03-31.
What I would like to output is something like the following:
Many thanks.
A tidy verse approach
library(tidyverse)
df <-tibble(date = seq(from = as.Date('2000-01-01'), to = as.Date('2001-12-31'), by = '1 month'))
df
df %>%
mutate(water_season_year = case_when(
lubridate::month(date) %in% c(4:9) ~str_c('Su_', lubridate::year(date)),
lubridate::month(date) %in% c(10:12) ~str_c('Wi_', lubridate::year(date)),
lubridate::month(date) %in% c(1:3)~str_c('Wi_', lubridate::year(date) -1),
TRUE ~ 'Error'))
You can compare just the month part of the data to get the season, in base R consider doing
month <- as.integer(format(df$date, "%m"))
year <- format(df$date, "%Y")
inds <- month >= 4 & month <= 9
df$water_season_year <- NA
df$water_season_year[inds] <- paste("Su", year[inds], sep = "_")
df$water_season_year[!inds] <- paste("Wi", year[!inds], sep = "_")
#To add previous year for month <= 3 do
df$water_season_year[month <= 3] <- paste("Wi",
as.integer(year[month <= 3]) - 1, sep = "_")
df
# date water_season_year
#1 2019-01-03 Wi_2019
#2 2000-06-01 Su_2000
Make sure that date variable is of "Date" class.
data
df <-data.frame(date = as.Date(c("2019-01-03", "2000-06-01")))
I want to get the missing part of several date's intervals in 2017.
here for example, each "id" of following dataframe:
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
id start end
a 2017-01-01 2017-01-15
a 2017-01-10 2017-01-20
a 2017-02-10 2017-02-20
b 2017-03-01 2017-03-28
b 2017-04-20 2017-04-29
I want to get:
df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))
id start end
a 2017-01-21 2017-02-09
a 2017-02-21 2017-12-31
b 2017-01-01 2017-02-28
b 2017-03-29 2017-04-19
b 2017-04-30 2017-12-31
Thank you!
First, confirm whether start and end are Date class.
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
Use by() to split the data into a list of two data frames according to the ids.
library(purrr)
by(df, df$id, function(x){
year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
ind <- map2(x$start, x$end, function(start, end){
which(year < start | year > end)
}) %>% reduce(intersect)
gap <- which(diff(ind) > 1)
head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)
Description:
year : All days in 2017.
ind : Get rid of the dates between start and end along the rows and the outcome represents the indices of missing dates.
gap : The discontinuous indices.
Output:
# id start end
# 1 a 2017-01-21 2017-02-09
# 2 a 2017-02-21 2017-12-31
# 3 b 2017-01-01 2017-02-28
# 4 b 2017-03-29 2017-04-19
# 5 b 2017-04-30 2017-12-31
I think my solution is still cumbersome. Hope to help you.
I encountered a similar problem recently, and I found that expanding the table to get one row for each relevant date, and then collapsing back down to ranges, was easier than trying to work out the correct logic from the range endpoints alone.
Here's how that approach would work. Alternatively, it might be possible to do something like this or this, but those approaches don't have the "not in range" issue you're dealing with.
library(dplyr)
library(fuzzyjoin)
library(lubridate)
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
# Convert the strings to real dates.
mutate(start.date = as.Date(start),
end.date = as.Date(end)) %>%
# Left join to 2017 dates on dates that are in the range of this record.
fuzzy_left_join(all.2017.dates,
by = c("start.date" = "date", "end.date" = "date"),
match_fun = list(`<=`, `>=`)) %>%
# Filter to distinct ids/dates.
select(id, date) %>%
distinct()
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
date = all.2017.dates$date) %>%
# Anti-join on id and date.
anti_join(df.expanded,
by = c("id", "date")) %>%
# Sort by id, then date, so that the lead/lag functions behave as expected.
arrange(id, date) %>%
# Check whether this record is an endpoint (i.e., is it adjacent to the
# previous/next record?).
mutate(prev.day.included = coalesce(date == lag(date) + 1 &
id == lag(id), F),
next.day.included = coalesce(date == lead(date) - 1 &
id == lag(id), F)) %>%
# Filter to just endpoint records.
filter(!prev.day.included | !next.day.included) %>%
# Fill in both start and end dates on "start" records. The start date is the
# date in the record; the end date is the date of the next record.
mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
origin = lubridate::origin),
end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
origin = lubridate::origin)) %>%
filter(!is.na(start.date))
Here's my solution:
library(tidyverse)
library(lubridate)
library(wrapr)
df %>%
mutate_at(2:3, ymd) %>%
group_by(id) %>%
gather('start_end', 'date', start:end) %>%
mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
unique() %>%
mutate(
start = if_else(
start_end == 'start',
date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
date
),
end = if_else(
start_end == 'end',
date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
date
)) %>%
filter(start != end) %>%
select(id, start, end) %>%
mutate(supp = TRUE) %>%
bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
arrange(id, start) %>%
mutate(rn = row_number()) %.>%
left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
na.omit() %>%
mutate(
start = case_when(
(start.y >= end.x) & !supp.x ~ end.x + 1,
(start.y >= end.x) & supp.x ~ start.x,
TRUE ~ as.Date(NA)
),
end = case_when(
(start.y >= end.x) & supp.y ~ end.y,
(start.y >= end.x) & !supp.y ~ start.y - 1,
TRUE ~ as.Date(NA)
)
) %>%
select(id, start, end) %>%
na.omit()
I'm trying to group ids with date in this dataset, but I want to summarise based on one of the features outside of the group.
library(dplyr)
library(lubridate)
set.seed(100)
df <- data.frame(ids = sample(c('436247', '2465347', '346654645'), 10000, replace=TRUE),
date = sample(seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1), 10000, replace=TRUE))
new_df <- df %>%
group_by(ids, date) %>%
summarise(events = length(ids[date >= date - 30 & date <= date]))
I'm trying to take this dataframe and answer the question - "for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date". Unfortunately, when I group_by both the ids and date, it only looks within the grouped date. I've created the solution below, but not sure if there is a better one with dplyr?
groupby_function <- function(df, spec_date){
result <- df %>%
group_by(ids) %>%
summarise(events = length(ids[date >= spec_date - 30 & date <= spec_date])) %>%
mutate(date = spec_date)
return(result)
}
date_vector <- seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1)
list_results <- lapply(date_vector, groupby_function, df=df)
x <- do.call(rbind, list_results)
"for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date"
For that, a "join by" condition makes sense, but isn't yet included in dplyr. Until it is, you could use data.table inside your dplyr chain:
# enumerate id-date combos of interest
grid_df = expand.grid(
id = unique(df$ids),
d = seq(min(df$date), max(df$date), by="day")
)
# helper function
library(data.table)
count_matches = function(DF, targetDF, ...){
onexpr = substitute(list(...))
data.table(targetDF)[DF, on=eval(onexpr), .N, by=.EACHI]$N
}
# use a non-equi join to count matching rows
res = grid_df %>%
mutate(d_dn = d - 30) %>%
mutate(n = count_matches(., df, ids = id, date >= d_dn, date <= d)) %>%
as.tibble
# A tibble: 186 x 4
id d d_dn n
<fctr> <date> <date> <int>
1 436247 2018-03-01 2018-01-30 72
2 2465347 2018-03-01 2018-01-30 69
3 346654645 2018-03-01 2018-01-30 51
4 436247 2018-03-02 2018-01-31 123
5 2465347 2018-03-02 2018-01-31 120
6 346654645 2018-03-02 2018-01-31 100
7 436247 2018-03-03 2018-02-01 170
8 2465347 2018-03-03 2018-02-01 166
9 346654645 2018-03-03 2018-02-01 154
10 436247 2018-03-04 2018-02-02 228
# ... with 176 more rows
It should work fine for equality conditions to write either ids = id or ids == id, I think.
If you're interested, the syntax is x[i, on=, j, by=.EACHI] where x and i are tables. For each row of i, we look up rows of x based on the on= criteria (left-hand side refers to columns in x; right-hand to columns in i); then we do j for each ("by each row of i" so by=.EACHI). In this case, j = .N means that we count matched rows of x, returned as a column of counts N.
You can look at the "ungrouped" data by just going back to the original data frame(calling df$date or df$ids). So I think what you are after is
test_df <- df %>%
group_by(ids, date) %>%
summarise(events = length(df$ids[df$date >= date[1] - 30 & df$date <= date[1] & df$ids == ids[1]]))
Also, I ran your proposed function, but I did not see any difference in the result from your original group_by solution, so I don't think that is what you want.
If a 'non dplyr' solution is acceptable, this gives you what you want.
df$diff <- as.vector(
sapply(unique(df$ids), function(x)
sapply(df$date[df$ids == x], function(y)
sum(abs(y - df$date[df$ids == x]) >= 30)
)
)
)
Alternatively, in dplyr, you can get a result like the above using:
f <- function(x) {
sapply(x, function(y) sum(abs(y - x) >= 30))
}
df$diff <- unlist(
df %>%
group_by(ids) %>%
do(diff = f(.$date)) %>%
.$diff
)
Here's an answer. But it assumes there's a continuous sequence of dates in each id.
df %>%
group_by(ids, date) %>%
count() %>%
arrange(ids, date) %>%
group_by(ids) %>%
mutate(
events = cumsum(n) - cumsum(lag(n, 30, 0))
)
I have a dataframe (lets call it df1) that looks something like this...
Date Price
2014-08-06 22
2014-08-06 89
2014-09-15 56
2014-06-04 41
2015-01-19 11
2015-05-23 5
2014-07-21 108
There are other variables in the dataframe but we will ignore them for now, as I do not require them.
I have previously ordered it using
df2 <- df1[order(as.Date(df1$Date, format="%Y/%m/%d")),]
And then created a dataframe containing the values for just one month, for example, just September 2015 dates...
september2015 <- df2[df2$Date >= "2015-09-01" & df2$Date <= "2015-09-30",]
I have done this for all the months in 2015 and 2014.
Then I need to create an average of prices within each given month. I have done this by...
mean(september2015$Price, na.rm = TRUE)
Obviously, this is very long and tedious and involves many lines of code. I am trying to make my code more efficient through using the dplyr package.
So far I have...
datesandprices <- select(df2, Date, Price)
datesandprices <- arrange(datesandprices, Date)
summarise(datesandprices, avg = mean(Price, na.rm = TRUE))
Or in a simpler form...
df1 %>%
select(Date, Price) %>%
arrange(Date) %>%
filter(Date >= 2014-08-06 & Date =< 2014-08-30)
summarise(mean(Price, na.rm = TRUE))
The filter line is not working for me and I can't figure out how to filter by dates using this method. I would like to get the mean for each month without having to calculate it one by one - and ideally extract the monthly means into a new dataframe or column that looks like...
Month Average
Jan 2014 x
Feb 2014 y
...
Nov 2015 z
Dec 2015 a
I hope this makes sense. I can't find anything on stackoverflow that works with dates, attempting to do something similar to this (unless I am searching for the wrong functions). Many thanks!
I made a separate column in your data set that contains only year and month. Then, I did a group_by on that column to get the means for each month.
Date <- c("2014-08-06", "2014-08-06", "2014-09-15", "2014-06-04", "2015-01-19", "2015-05-23", "2014-07-21")
Price <- c(22,89,56,41,11,5,108)
Date <- as.Date(Date, format="%Y-%m-%d")
df <- data.frame(Date, Price)
df$Month_Year <- substr(df$Date, 1,7)
library(dplyr)
df %>%
#select(Date, Price) %>%
group_by(Month_Year) %>%
summarise(mean(Price, na.rm = TRUE))
For the sake of completeness, here is also a data.table solution:
library(data.table)
# in case Date is of type character
setDT(df1)[, .(Average = mean(Price, na.rm = TRUE)), keyby = .(Yr.Mon = substr(Date, 1,7))]
# in case Date is of class Date or POSIXct
setDT(df2)[, .(Average = mean(Price, na.rm = TRUE)), keyby = .(Yr.Mon = format(Date, "%Y-%m"))]
Yr.Mon Average
1: 2014-06 41.0
2: 2014-07 108.0
3: 2014-08 55.5
4: 2014-09 56.0
5: 2015-01 11.0
6: 2015-05 5.0
Note that the grouping variable Yr.Mon is created "on-the-fly" in the keyby clause.
Data
library(data.table)
df1 <- fread(
"Date Price
2014-08-06 22
2014-08-06 89
2014-09-15 56
2014-06-04 41
2015-01-19 11
2015-05-23 5
2014-07-21 108")
df2 <- df1[, Date := as.Date(Date)]
I managed to do it using all dplyr functions, with help from #user108636
df %>%
select(Date, Price) %>%
arrange(Date) %>%
mutate(Month_Year = substr(Date, 1,7)) %>%
group_by(Month_Year) %>%
summarise(mean(Price, na.rm = TRUE))
The select function selects the date and price columns.
The arrange function arranges my dataframe according to the date - with the earliest date first. The mutate function adds another column which excludes the day and leaves us with, for example...
Month_Year
2015-10
2015-10
2015-11
2015-12
2015-12
The group by function groups all the months together and the summarise function calculates the mean of the price of each month.
This should mean your price data by month-year.
library(zoo)
#Pull out columns
Price<-df1["Price"]
Date<-df1["Date"]
#Put in Zoo
zooPrice <- zoo(Price,Date)
#Monthly mean with year (vector)
monthly.avg <- apply.monthly(zooPrice, mean)
#function to change back to DF
zooToDf <- function(z) {
df <- as.data.frame(z)
df$Date <- time(z) #create a Date column
rownames(df) <- NULL #so row names not filled with dates
df <- df[,c(ncol(df), 1:(ncol(df)-1))] #reorder columns so Date first
return(df)
}
#Apply function to create new Df with data!
MonthYearAvg<-zooToDf(monthly.avg)
Convert your column to a Date object and use format
df <- data.frame(
Date = c("2014-08-06", "2014-08-06", "2014-09-15", "2014-06-04", "2015-01-19", "2015-05-23", "2014-07-21"),
Price = c(22, 89, 56, 41, 11, 5, 108))
library(dplyr)
df %>%
group_by(Month_Year = as.Date(Date) %>% format("%b %Y")) %>%
summarise(avg = mean(Price, na.rm = TRUE))
# A tibble: 6 x 2
Month_Year avg
<chr> <dbl>
1 août 2014 55.5
2 janv. 2015 11
3 juil. 2014 108
4 juin 2014 41
5 mai 2015 5
6 sept. 2014 56