Calculate Mean for Each Unique Value up to a certain date - r

Data for my example.
date1 = seq(as.Date("2019/01/01"), by = "month", length.out = 48)
date2 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date3 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date4 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date = c(date1,date2,date3,date4)
subproducts1=rep("1",48)
subproducts2=rep("2",48)
subproductsx=rep("x",48)
subproductsy=rep("y",48)
b1 <- c(rnorm(48,5))
b2 <- c(rnorm(48,5))
b3 <-c(rnorm(48,5) )
b4 <- c(rnorm(48,5))
dfone <- data.frame(
"date"= date,
"subproduct"=
c(subproducts1,subproducts2,subproductsx,subproductsy),
"actuals"= c(b1,b2,b3,b4))
This creates Jan 2019 for date2,3,4 with value 0.
dfone <-dfone %>%
complete(date = seq.Date(from = min(date), to = as.Date('2021-06-01'), by = 'month'),
nesting(subproduct), fill = list(actuals = 0))
QUESTION: This calculates the mean for each unique sub product and replaces 0's with the mean of each, but how do I have a hard cutoff so the mean is only based off Jan-2019 to Dec-2020 and not Jan 2019 to Dec 2022?
library(dplyr)
dfone_new <- dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0], na.rm = TRUE))) %>%
ungroup

We may need one more logical expression while subsetting the 'actuals' i.e. the 'date' should be between the 2019 Jan and 2020 Dec while calculating the mean
library(dplyr)
library(tidyr)
dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0 &
between(date, as.Date("2019-01-01"), as.Date("2020-12-31"))],
na.rm = TRUE)))

Related

Creating list with the same number of values

I have a data set with a date, ID, and coordinates that I would like to split into seasonal months. For example for winter I have January to winter1, February to winter2, and March to winter3. I have done the same for the summer months.
I would like to filter out the IDs that have all of these months, so that when I split the data by ID and year, I would have identical list lengths.
I wasn't sure how to simulate uneven values for each ID in the sample code below, but in my actual data some IDs only have summer1 and not winter1, while it could be flipped around for summer2 and winter2`.
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),1000)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
df$month <- month(df$date)
df$year <- year(df$date)
df1 <- df %>%
mutate(season_categ = case_when(month %in% 6 ~ 'summer1',
month %in% 7 ~ 'summer2',
month %in% 8 ~ 'summer3',
month %in% 1 ~ 'winter1',
month %in% 2 ~ 'winter2',
month %in% 3 ~ 'winter3')) %>%
group_by(year, ID )%>%
filter(any(month %in% 6:8) &
any(month %in% 1:3))
summer_list <- df1 %>%
filter(season_categ == "summer1") %>%
group_split(year, ID)
# Renames the names in the list to AnimalID and year
names(summer_list) <- sapply(summer_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
# Creates a list for each year and by ID
winter_list <- df1 %>%
filter(season_categ == "winter1") %>%
group_split(year, ID)
names(winter_list) <- sapply(winter_list,
function(x) paste(x$ID[1],
x$year[1], sep = '_'))
Not sure if that is what you want, but I understood that you would want to get rid of IDs that have less than the 6 months of Q1 and Q3 in any of the years, but you could modify the filter or grouping if that assumption was wrong.
Here is one approach:
library(lubridate)
library(dplyr)
set.seed(12345)
# random sampling of dates with this seed gives no July date for ID 2 in 2010
df <- tibble(
date = sample(seq(dmy("01-01-2010"), dmy("31-12-2013"), by = "days"),
1000, replace = TRUE),
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID = rep(1:5, 200),
month = month(date),
year =year(date)) %>%
arrange(ID, date)
df %>%
filter(month %in% c(1:3, 6:8)) %>%
group_by(ID, year) %>%
mutate(complete = length(unique(month)) == 6) %>%
group_by(ID) %>%
filter(all(complete)) %>%
group_by(ID, year) %>%
group_split()
To me it is not really clear as to what your are looking for. Before you split the data into a list sort the rows by columns
df1<-df1[order(ID,season_categ),]
### Determine which ID's have uneven numbers ###
df1 %>%
group_by(ID) %>%
summarize(month_seq = paste(season_categ , collapse = "_"),
number_of_months = n(season_categ))
#### Remove odd numbers###

Creating all possible variable combinations in R

I am having a daily dataset of 4 parameters which I have converted into monthly data using following code
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I have reshaped it to represent each column as month using following code
#Function to convert month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
dplyr::as_tibble(rownames = "date") %>%
separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
mutate(Month = mo2Num(Month))%>%
tidyr::pivot_longer(cols = -c(Month, Year)) %>%
pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
names_sep = "_") %>%
arrange(name)
Now, I want to create parameter combinations like Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, Parameter3 * Parameter4 which will be added to the pivoted monthly data as rbind. The new dataframe Parameter1 * Parameter2 means to multiply their monthly values and then rbind to the above result. Likewise for all other above said combinations. How can I achieve this?
You can use this base R approach using combn assuming data is present for all the years for all parameters where df1 is the dataframe from the above output ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
t1 <- subset(df1, name == x[1])
t2 <- subset(df1, name == x[2])
t3 <- t1[-(1:2)] * t2[-(1:2)]
t3$name <- paste0(x, collapse = "_")
cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to original data.
new_data <- rbind(df1, do.call(rbind, data))

replace historical data of a data.frame with the most recent year data in R?

I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))
Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)
If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)

how to sum every row within date range in variables

df1:
library(tidyverse)
library(lubridate)
ex1 <- tibble(date = seq.Date(from = ymd('20200101'), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
df2:
cal_c <- tibble(variable = c('a', 'b', 'c','d'),
start = c(ymd('20200101', '20200103', '20200203', '20200103')),
end = c(ymd('20200204', '20200405', '20200301', '20200401')),
total = c('NA', 'NA', 'NA', 'NA'))
I want to calc every row in df2 within the date range in the start and end based on df1, say a$toal between '2020-1-1' to '2020-2-4', b$total between '2020-1-3' to '2020-4-5', any help, thanks a lot.
We can create a sequence of start and end dates for cal_c data, get ex1 in long format and join. We can then sum value for each variable.
library(tidyverse)
cal_c %>%
mutate(date = map2(start, end, seq, by = 'day')) %>%
unnest(date) %>%
left_join(ex1 %>% pivot_longer(cols = -date, names_to = 'variable'),
by = c('variable', 'date')) %>%
group_by(variable, start, end) %>%
summarise(value = sum(value, na.rm = TRUE))
# variable start end value
# <chr> <date> <date> <dbl>
#1 a 2020-01-01 2020-02-04 34.3
#2 b 2020-01-03 2020-04-05 136.
#3 c 2020-02-03 2020-03-01 79.5
#4 d 2020-01-03 2020-04-01 4909.
Base R Solution:
cal_c$total <- sapply(split(cal_c, rownames(cal_c)), function(x){
sum(ex1[((ex1$date >= x$start) & (ex1$date <= x$end)), match(x$variable, names(ex1))])})
An option using data.table:
cal_c[, total :=
ex1[cal_c, on=.(date>=start, date<=end), by=.EACHI,
sum(.SD[[variable]])]$V1
]
output:
variable start end total
1: a 2020-01-01 2020-02-04 34.04780
2: b 2020-01-03 2020-04-05 135.40290
3: c 2020-02-03 2020-03-01 91.10271
4: d 2020-01-03 2020-04-01 4978.59884
data:
set.seed(0L)
library(data.table)
ex1 <- data.table(date = seq.Date(from = as.IDate('20200101', format="%Y%m%d"), length.out = 100, by = 'day'),
a = rnorm(100, mean = 1, sd = 2),
b = runif(100, min = 1, max = 2),
c = rnorm(100, mean = 3, sd = 1),
d = runif(100, min = 50, max = 60))
cal_c <- data.table(variable = c('a', 'b', 'c','d'),
start = as.IDate(c('20200101', '20200103', '20200203', '20200103'), format="%Y%m%d"),
end = as.IDate(c('20200204', '20200405', '20200301', '20200401'), format="%Y%m%d"))

summarise data for multiple variables of a data.frame in r?

I am trying to compute the upper and lower quartile of the two variables in my data.frame across the time period of my interest. The code below gave me single digit for upper and lower value.
set.seed(50)
FakeData <- data.frame(seq(as.Date("2001-01-01"), to= as.Date("2003-12-31"), by="day"),
A = runif(1095, 0,10),
D = runif(1095,5,15))
colnames(FakeData) <- c("Date", "A","D")
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value") %>%
mutate(Year = year(Date), Month = month(Date)) %>%
filter(between(Month,3,5)) %>%
mutate(NewDate = ymd(paste("2020", Month,day(Date), sep = "-"))) %>%
group_by(Variable, NewDate) %>%
summarise(Upper = quantile(Value,0.75, na.rm = T),
Lower = quantile(Value, 0.25, na.rm = T))
I would want an output like below (the Final_output is what i am interested)
Output1 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 0,10), lower = runif(92,5,15), Variable = rep("A",92))
colnames(Output1)[1] <- "Date"
Output2 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 2,10), lower = runif(92,5,15), Variable = rep("D",92))
colnames(Output2)[1] <- "Date"
Final_Output<- bind_rows(Output1,Output2)
I can propose you a data.table solution. In fact there are several ways to do that.
The final steps (apply quartile by group on the Value variable) could be translated into (if you want, as in your example, two columns):
statistics[,.('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
by = c("Variable", "NewDate")]
If you prefer long-formatted output:
library(data.table)
setDT(statistics)
statistics[,.(lapply(get('Value'), quantile, probs = .25,.75)) ,
by = c("Variable", "NewDate")]
All steps together
It's probably better if you chose to use data.table to do all steps using data.table verbs. I will assume your data have the structure similar to the dataframe you generated and arranged, i.e.
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value")
In that case, mutate and filter steps would become
statistics[,`:=`(Year = year(Date), Month = month(Date))]
statistics <- statistics[Month %between% c(3,5)]
statistics[, NewDate = :ymd(paste("2020", Month,day(Date), sep = "-"))]
And choose the final step you prefer, e.g.
statistics[,.('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
by = c("Variable", "NewDate")]

Resources