R Programming: Creating rows on basis of date difference

R Programming: Creating rows on basis of date difference - r

I am trying to create a dataset which is based on the difference in the days of start & end date. as an example
Name Start_Date End_Date
Alice 1-1-2017 3-1-2017
John 4-3-2017 5-3-2017
Peter 12-3-2017 12-3-2017
So, the final dataset will be inclusive of the start, end date and also the difference. And eventually it should look something like
Name Date
Alice 1-1-2017
Alice 2-1-2017
Alice 3-1-2017
John 4-3-2017
John 5-3-2017
Peter 12-3-2017
Every help is Great Help. Thanks !

We can use Map to get the sequence and melt the list to data.frame`
df1[-1] <- lapply(df1[-1], as.Date, format = "%d-%m-%Y")
lst <- setNames(Map(function(x, y) seq(x, y, by = "1 day"),
df1$Start_Date, df1$End_Date), df1$Name)
library(reshape2)
melt(lst)[2:1]
data
df1 <- structure(list(Name = c("Alice", "John", "Peter"), Start_Date = structure(c(17167,
17229, 17237), class = "Date"), End_Date = structure(c(17169,
17230, 17237), class = "Date")), .Names = c("Name", "Start_Date",
"End_Date"), row.names = c(NA, -3L), class = "data.frame")

This uses the expandRows function from the package splitstackshape:
df = df %>%
mutate(days_between = as.numeric(End_Date - Start_Date),
id = row_number(Name)) %>%
expandRows("days_between") %>%
group_by(id) %>%
mutate(Date = seq(first(Start_Date),
first(End_Date) - 1,
by = 1)) %>%
ungroup()

using a for loop:
library(data.table)
library(foreach)
library(lubridate)
setDT(df)
names = df[, unique(Name)]
l = foreach(i = 1:length(names)) %do% {
# make a date sequence per name
d = df[Name == names[i], ]
s = seq(from = dmy(d$Start_Date), to = dmy(d$End_Date), by = "days")
# bind the results in a data.table
dx = data.table(name = rep(names[i], length(s)))
dx = cbind(dx, date = s)
}
rbindlist(l)

Related

Trying to pivot a table in R

I am trying to pivot a table in R for example:
To
Code to create the starting table:
df <- data.frame (ID = c("A","A","A","B","B","C"),
Dates = c("01/01/2021", "10/02/2021", "30/03/2021","04/04/2021","06/05/2021","20/06/2021"))
Assume there is a max of three dates for the above example.

You are missing a column with the identifier "Date1", "Date2", "Date3". You can create it with mutate(), then use pivot_wider() from the tidyverse library.
dt <- data.frame (ID = c("A","A","A","B","B","C"),
Dates = c("01/01/2021", "10/02/2021", "30/03/2021","04/04/2021","06/05/2021","20/06/2021"))
library(tidyverse)
dt %>% group_by(ID) %>%
mutate(col = paste0("Date",row_number())) %>%
pivot_wider(id_cols = ID, names_from = col, values_from = Dates)

This is my approach :
my_df <- data.frame (ID = c("A","A","A","B","B","C"),
Dates = c("01/01/2021", "10/02/2021", "30/03/2021","04/04/2021","06/05/2021","20/06/2021"),
stringsAsFactors = FALSE)
my_df <- my_df %>% group_by(ID) %>% mutate(value = paste("Date", seq_along(ID), sep = ""))
my_df <- dcast(my_df, ID ~ value, value.var = "Dates")

Here's an approach similar to what you're requesting.
library("maditr")
df <- dcast(df, Dates ~ ID,fun.aggregate = length)

Another solution, using data.table
df <- data.frame(
ID = c("A","A","A","B","B","C"),
Dates = c("01/01/2021", "10/02/2021", "30/03/2021","04/04/2021","06/05/2021","20/06/2021")
)
library(data.table)
setDT(df)
df <- df[, .(dates = lapply(.SD, function(x) paste(x, collapse = ", "))), by = ID, .SDcols = c("Dates")]
df[, c("Date1", "Date2", "Date3") := tstrsplit(dates, ", ")]
df[, dates := NULL]
df
# ID Date1 Date2 Date3
# 1: A 01/01/2021 10/02/2021 30/03/2021
# 2: B 04/04/2021 06/05/2021 <NA>
# 3: C 20/06/2021 <NA> <NA>

A base R option using reshape
reshape(
transform(
df,
q = ave(seq_along(ID), ID, FUN = seq_along)
),
direction = "wide",
idvar = "ID",
timevar = "q"
)
gives
ID Dates.1 Dates.2 Dates.3
1 A 01/01/2021 10/02/2021 30/03/2021
4 B 04/04/2021 06/05/2021 <NA>
6 C 20/06/2021 <NA> <NA>

The union of all intervals for each group

I have a following example dataset:
df <- data.frame("id" = c(1,2,3,3,4),
"start" = c(01-01-2018,01-06-2018,01-05-2018,01-05-2018,01-05-2018, 01-10-2018),
"end" = c(01-03-2018,01-07-2018,01-09-2018,01-06-2018,01-06-2018,01-11-2018))
df$start <- as.Date(df$start, "%d-%m-%Y")
df$end <- as.Date(df$end, "%d-%m-%Y")
What I want to do with it is for each group to get a union of all date intervals), i.e.
01-01-2018 - 01-03-2018 for group 1
01-06-2018 - 01-06-2018 for group 2
01-05-2018 - 01-09-2018 for group 3
01-05-2018 - 01-06-2018 and 01-10-2018 - 01-11-2018 for group 4
The purpose of this is to have an interval as an output, because I need it to determine whether certain observation dates for the group fall in the intervals or not.

We convert the 'start', 'end' to Date class, then grouped by'id', created an interval column in summarise based on the min and max of the 'start', and 'end' columns respectively
library(dplyr)
library(stringr)
library(lubridate)
df %>%
mutate(across(c(start, end), mdy)) %>%
group_by(id) %>%
summarise(interval = interval(min(start), max(end)), .groups = 'drop')
data
df <- structure(list(id = c(1, 2, 3, 3, 4), start = c("01-01-2018",
"01-06-2018", "01-05-2018", "01-05-2018", "01-10-2018"), end = c("01-03-2018",
"01-07-2018", "01-09-2018", "01-06-2018", "01-11-2018")),
class = "data.frame", row.names = c(NA,
-5L))

Creating all possible variable combinations in R

I am having a daily dataset of 4 parameters which I have converted into monthly data using following code
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I have reshaped it to represent each column as month using following code
#Function to convert month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
dplyr::as_tibble(rownames = "date") %>%
separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
mutate(Month = mo2Num(Month))%>%
tidyr::pivot_longer(cols = -c(Month, Year)) %>%
pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
names_sep = "_") %>%
arrange(name)
Now, I want to create parameter combinations like Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, Parameter3 * Parameter4 which will be added to the pivoted monthly data as rbind. The new dataframe Parameter1 * Parameter2 means to multiply their monthly values and then rbind to the above result. Likewise for all other above said combinations. How can I achieve this?

You can use this base R approach using combn assuming data is present for all the years for all parameters where df1 is the dataframe from the above output ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
t1 <- subset(df1, name == x[1])
t2 <- subset(df1, name == x[2])
t3 <- t1[-(1:2)] * t2[-(1:2)]
t3$name <- paste0(x, collapse = "_")
cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to original data.
new_data <- rbind(df1, do.call(rbind, data))

replace historical data of a data.frame with the most recent year data in R?

I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))

Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)

If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)

Time data manipulation

I have my time and date as
tt <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 3L, 3L), .Label = c("2018-10-27",
"2018-10-28", "2018-11-15"), class = "factor"), Time = c("21:07:30.004",
"21:07:31.000", "21:07:32.998", "21:07:32.000", "21:07:33.989",
"21:08:33.989")), row.names = c(NA, 6L), class = "data.frame")
I would like to get the smallest to largest time range for each date as:
Date Time
2018-10-27 21:07:30 to 21:07:32
2018-11-15 21:07:33 to 21:08:33
Code I tried:
time.range <- function(field_or_seed){
paste0(gsub(".* ", "", range(strptime(field_or_seed$Time, format = "%H:%M:%S", tz = 'UTC'))[[1]]), " to ",
gsub(".* ", "", range(strptime(field_or_seed$Time, format = "%H:%M:%S", tz = 'UTC'))[[2]]))
}
aggregate(Time~Date, data = tt, FUN = time.range)
I would really appreciate any suggestion I could use to fix this. Thanks!

A more robust approach is to use appropriate objects, e.g. Date and hms:
library(tidyverse)
tt <- tibble(
Date = c("2018-10-27", "2018-10-27", "2018-10-27", "2018-10-27", "2018-11-15", "2018-11-15"),
Time = c("21:07:30.004", "21:07:31.000", "21:07:32.998", "21:07:32.000", "21:07:33.989", "21:08:33.989")
)
tt %>%
mutate(Date = as.Date(Date),
Time = hms::as_hms(Time)) %>%
group_by(Date) %>%
summarise(start = Time[which.min(Time)], # which.min instead of min to avoid dropping class
end = Time[which.max(Time)])
#> # A tibble: 2 x 3
#> Date start end
#> <date> <time> <time>
#> 1 2018-10-27 21:07:30.004 21:07:32.998
#> 2 2018-11-15 21:07:33.989 21:08:33.989
If you like you could use lubridate::interval to represent the interval, but you'd have to use POSIXct.

Try:
tt$date_time <- as.POSIXct(paste(tt$Date, tt$Time))
do.call(rbind, by(
tt,
tt$Date,
FUN = function(x) {
within(x, Time <-
paste(strftime(min(date_time), "%H:%M:%S"),
"to",
strftime(max(date_time), "%H:%M:%S")))[1, -3]
}
))

Also:
aggregate(
data.frame(Time = strptime(do.call(paste, tt), '%F %R:%OS', tz = 'GMT')),
by = list(Date = tt$Date),
function(Time){
paste(
format(min(Time), '%T'),
format(max(Time), '%T'),
sep = ' to '
)
}
)
# Date Time
#1 2018-10-27 21:07:30 to 21:07:32
#2 2018-11-15 21:07:33 to 21:08:33