Merge / Join data.tables per row - r

I have the following data tables and I would like to make a single data table out of all three.
library(dplyr)
set.seed(123)
dt.Ger <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
dt.Aut <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Austria = rnorm(365, 4, 2), check.names = FALSE)
dt.Den <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Denmark = rnorm(365, 3, 1), check.names = FALSE)
dt.Ger <- dt.Ger %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Germany)
dt.Aut <- dt.Aut %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Austria)
dt.Den <- dt.Den %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Denmark)
Now I would like to link all tables together, i.e. first dt.Ger, then possibly add two empty lines and then append dt.Aut, now add again two empty lines and finally add dt.Den. Ideally, it would be great if Germany were the first headline, then Austria (in the second empty line before dt.Aut) and then Denmark (in the second empty line before dt.Den).
So that I only have a single table as a return. This table should look something like this (I only did it with SnippingTool, so it only serves to explain):
EDIT:
Using
l <- list(dt.Ger, dt.Aut, dt.Den)
l.result <- rbindlist(l)
yields to:
And I want to get an extra space/line/row (at the red parts) where Germany, Austria and Denmark is written.

I'm still not sure, what you are trying to achive - for me it seems you are better of working with a list of data.tables.
Furthermore, I switched to using dcast instead of pivot_wider so you can drop tidyr / dplyr.
However, here is an approach inserting NAs inbetween the different data.tables using rbindlist:
library(data.table)
set.seed(123)
dt.Ger <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
dt.Aut <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Austria = rnorm(365, 4, 2), check.names = FALSE)
dt.Den <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Denmark = rnorm(365, 3, 1), check.names = FALSE)
# or rather date ~ month?
dt.Ger[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Ger <- dcast(dt.Ger, month ~ date, value.var = "Germany")
dt.Aut[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Aut <- dcast(dt.Aut, month ~ date, value.var = "Austria")
dt.Den[, c("month", "date") := list(format(date, '%b'), format(date, '%d'))]
dt.Den <- dcast(dt.Den, month ~ date, value.var = "Denmark")
# use a list of data.tables:
recommended <- list(Germany = dt.Ger, Austria = dt.Aut, Denmark = dt.Den)
DT <- rbindlist(list(data.table(month = c("", "Germany")), dt.Ger, data.table(month = c("", "Austria")), dt.Aut, data.table(month = c("", "Denmark")), dt.Den), fill = TRUE) # [, V1 := NULL]
DT[,(names(DT)):= lapply(.SD, as.character), .SDcols = names(DT)]
for (j in seq_len(ncol(DT))){
set(DT, which(is.na(DT[[j]])), j, "")
}
print(DT)

Related

Calculate Mean for Each Unique Value up to a certain date

Data for my example.
date1 = seq(as.Date("2019/01/01"), by = "month", length.out = 48)
date2 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date3 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date4 = seq(as.Date("2019/02/01"), by = "month", length.out = 48)
date = c(date1,date2,date3,date4)
subproducts1=rep("1",48)
subproducts2=rep("2",48)
subproductsx=rep("x",48)
subproductsy=rep("y",48)
b1 <- c(rnorm(48,5))
b2 <- c(rnorm(48,5))
b3 <-c(rnorm(48,5) )
b4 <- c(rnorm(48,5))
dfone <- data.frame(
"date"= date,
"subproduct"=
c(subproducts1,subproducts2,subproductsx,subproductsy),
"actuals"= c(b1,b2,b3,b4))
This creates Jan 2019 for date2,3,4 with value 0.
dfone <-dfone %>%
complete(date = seq.Date(from = min(date), to = as.Date('2021-06-01'), by = 'month'),
nesting(subproduct), fill = list(actuals = 0))
QUESTION: This calculates the mean for each unique sub product and replaces 0's with the mean of each, but how do I have a hard cutoff so the mean is only based off Jan-2019 to Dec-2020 and not Jan 2019 to Dec 2022?
library(dplyr)
dfone_new <- dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0], na.rm = TRUE))) %>%
ungroup
We may need one more logical expression while subsetting the 'actuals' i.e. the 'date' should be between the 2019 Jan and 2020 Dec while calculating the mean
library(dplyr)
library(tidyr)
dfone %>%
group_by(subproduct) %>%
mutate(actuals = replace(actuals, actuals == 0,
mean(actuals[actuals != 0 &
between(date, as.Date("2019-01-01"), as.Date("2020-12-31"))],
na.rm = TRUE)))

Calculating row means and saving them in a new column in R (data table)

I have the following data table:
library(dplyr)
set.seed(123)
dt <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
dt <- dt %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Germany)
I would like to add two new columns (monthlyAverage, quarterlyAverage), one containing the monthly averages and the other column the quarterly averages.
For monthly average you can take rowwise mean, for quaterly average you can create groups of 3 rows and take mean of every 3 months.
library(dplyr)
dt %>%
mutate(monthlyaverage = rowMeans(.[-1], na.rm = TRUE)) %>%
group_by(grp = ceiling(row_number()/3)) %>%
mutate(quaterlyaverage = mean(monthlyaverage)) %>%
select(month, grp, monthlyaverage, quaterlyaverage, everything())
If you want to do this using data.table :
library(data.table)
setDT(dt)[, monthlyaverage := rowMeans(.SD, na.rm = TRUE), .SDcols = -1]
dt[, quaterlyaverage := mean(monthlyaverage), ceiling(seq_len(nrow(dt))/3)]

Creating all possible variable combinations in R

I am having a daily dataset of 4 parameters which I have converted into monthly data using following code
library(zoo)
library(hydroTSM)
library(lubridate)
library(tidyverse)
set.seed(123)
df <- data.frame("date"= seq(from = as.Date("1983-1-1"), to = as.Date("2018-12-31"), by = "day"),
"Parameter1" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 15, 35),
"Parameter2" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 11, 29),
"Parameter3" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 50, 90),
"Parameter4" = runif(length(seq.Date(as.Date("1983-1-1"), as.Date("2018-12-31"), "days")), 0, 27))
Monthly_data <- daily2monthly(df, FUN=mean, na.rm=TRUE)
After that, I have reshaped it to represent each column as month using following code
#Function to convert month abbreviation to a numeric month
mo2Num <- function(x) match(tolower(x), tolower(month.abb))
Monthly_data %>%
dplyr::as_tibble(rownames = "date") %>%
separate("date", c("Month", "Year"), sep = "-", convert = T) %>%
mutate(Month = mo2Num(Month))%>%
tidyr::pivot_longer(cols = -c(Month, Year)) %>%
pivot_wider(names_from = Month, values_from = value, names_prefix = "Mon",
names_sep = "_") %>%
arrange(name)
Now, I want to create parameter combinations like Parameter1 * Parameter2, Parameter1 * Parameter3, Parameter1 * Parameter4, Parameter2 * Parameter3, Parameter2 * Parameter4, Parameter3 * Parameter4 which will be added to the pivoted monthly data as rbind. The new dataframe Parameter1 * Parameter2 means to multiply their monthly values and then rbind to the above result. Likewise for all other above said combinations. How can I achieve this?
You can use this base R approach using combn assuming data is present for all the years for all parameters where df1 is the dataframe from the above output ending with arrange(name).
data <- combn(unique(df1$name), 2, function(x) {
t1 <- subset(df1, name == x[1])
t2 <- subset(df1, name == x[2])
t3 <- t1[-(1:2)] * t2[-(1:2)]
t3$name <- paste0(x, collapse = "_")
cbind(t3, t1[1])
}, simplify = FALSE)
You can then rbind it to original data.
new_data <- rbind(df1, do.call(rbind, data))

replace historical data of a data.frame with the most recent year data in R?

I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))
Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)
If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)

summarise data for multiple variables of a data.frame in r?

I am trying to compute the upper and lower quartile of the two variables in my data.frame across the time period of my interest. The code below gave me single digit for upper and lower value.
set.seed(50)
FakeData <- data.frame(seq(as.Date("2001-01-01"), to= as.Date("2003-12-31"), by="day"),
A = runif(1095, 0,10),
D = runif(1095,5,15))
colnames(FakeData) <- c("Date", "A","D")
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value") %>%
mutate(Year = year(Date), Month = month(Date)) %>%
filter(between(Month,3,5)) %>%
mutate(NewDate = ymd(paste("2020", Month,day(Date), sep = "-"))) %>%
group_by(Variable, NewDate) %>%
summarise(Upper = quantile(Value,0.75, na.rm = T),
Lower = quantile(Value, 0.25, na.rm = T))
I would want an output like below (the Final_output is what i am interested)
Output1 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 0,10), lower = runif(92,5,15), Variable = rep("A",92))
colnames(Output1)[1] <- "Date"
Output2 <- data.frame(seq(as.Date("2000-03-01"), to= as.Date("2000-05-31"), by="day"),
Upper = runif(92, 2,10), lower = runif(92,5,15), Variable = rep("D",92))
colnames(Output2)[1] <- "Date"
Final_Output<- bind_rows(Output1,Output2)
I can propose you a data.table solution. In fact there are several ways to do that.
The final steps (apply quartile by group on the Value variable) could be translated into (if you want, as in your example, two columns):
statistics[,.('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
by = c("Variable", "NewDate")]
If you prefer long-formatted output:
library(data.table)
setDT(statistics)
statistics[,.(lapply(get('Value'), quantile, probs = .25,.75)) ,
by = c("Variable", "NewDate")]
All steps together
It's probably better if you chose to use data.table to do all steps using data.table verbs. I will assume your data have the structure similar to the dataframe you generated and arranged, i.e.
statistics <- FakeData %>%
gather(-Date, key = "Variable", value = "Value")
In that case, mutate and filter steps would become
statistics[,`:=`(Year = year(Date), Month = month(Date))]
statistics <- statistics[Month %between% c(3,5)]
statistics[, NewDate = :ymd(paste("2020", Month,day(Date), sep = "-"))]
And choose the final step you prefer, e.g.
statistics[,.('p25' = quantile(get('Value'), probs = 0.25), 'p75' = quantile(get('Value'), probs = 0.75)),
by = c("Variable", "NewDate")]

Resources