I have a dataframe that looks like follows:
head(df_HPI)
The HPI is a quarterly index I want to convert to annual. I have 17 regions (i.e. CCAA), so I want to aggregate the HPI to contain annual data per region. I made some changes but the code is not working.
# Convert series to annual data
df_HPI <- df_HPI_original
# Replace period format
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T1","-01-01",x)})) # Q1
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T2","-04-01",x)})) # Q2
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T3","-07-01",x)})) # Q3
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T4","-10-01",x)})) # Q4
# Convert column into a date
df_HPI$Periodo <- as.Date(df_HPI$Periodo)
# Aggregate to annual data
df_HPI %>%
mutate(Year=year(Periodo),
Quarter=quarter(Periodo),
Finyear = ifelse(Quarter <= 2, Year, Year+1)) %>%
group_by(Finyear, CCAA) %>%
summarise(HPIy=mean(HPI))
In the last step, the programme says the argument is not logical and returns NAs.
The problem is that the HPI column was converted to a factor when you replaced the period format via gsub. You therefore have to convert it back to numeric. Try this:
library(dplyr)
library(lubridate)
set.seed(42)
# Example data
quarters <- paste0("T", c(1:4))
years <- c("2019", "2020")
dates <- c(paste0(years[[1]], quarters), paste0(years[[2]], quarters))
df_HPI <- data.frame(
Periodo = rep(dates, 2),
CCAA = c(rep("Region1", 8), rep("Region2", 8)),
HPI = runif(16)
)
head(df_HPI)
#> Periodo CCAA HPI
#> 1 2019T1 Region1 0.9148060
#> 2 2019T2 Region1 0.9370754
#> 3 2019T3 Region1 0.2861395
#> 4 2019T4 Region1 0.8304476
#> 5 2020T1 Region1 0.6417455
#> 6 2020T2 Region1 0.5190959
# Replace period format
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T1","-01-01",x)})) # Q1
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T2","-04-01",x)})) # Q2
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T3","-07-01",x)})) # Q3
df_HPI <- data.frame(sapply(df_HPI, function(x) {gsub("T4","-10-01",x)})) # Q4
# Convert column into a date
df_HPI$Periodo <- as.Date(df_HPI$Periodo)
# Problem: HPI was converted to a factor
class(df_HPI$HPI)
#> [1] "factor"
# Solution: Convert back to numeric
df_HPI$HPI <- as.numeric(as.character(df_HPI$HPI))
# Aggregate to annual data
df_HPI %>%
mutate(Year=year(Periodo),
Quarter=quarter(Periodo),
Finyear = ifelse(Quarter <= 2, Year, Year+1)) %>%
group_by(Finyear, CCAA) %>%
summarise(HPIy=mean(HPI))
#> # A tibble: 6 x 3
#> # Groups: Finyear [3]
#> Finyear CCAA HPIy
#> <dbl> <fct> <dbl>
#> 1 2019 Region1 0.926
#> 2 2019 Region2 0.681
#> 3 2020 Region1 0.569
#> 4 2020 Region2 0.592
#> 5 2021 Region1 0.436
#> 6 2021 Region2 0.701
Created on 2020-04-04 by the reprex package (v0.3.0)
Related
In some cases, a "year" doesn't necessarily cycle from January 1st. For example, academic year starts at the end of August in the US. Another example is the NBA season.
My question: given data containing a date column, I want to create another column that refers to which period it falls in. For example, consider that we are given the following tib:
library(lubridate, warn.conflicts = FALSE)
library(tibble)
tib <- tibble(my_dates = as_date(c("1999-01-01", "2010-08-09", "2010-09-02", "1995-03-02")))
tib
#> # A tibble: 4 x 1
#> my_dates
#> <date>
#> 1 1999-01-01
#> 2 2010-08-09
#> 3 2010-09-02
#> 4 1995-03-02
and we want to mutate a column that refers to the academic year each date belongs to, provided that the academic year starts on August 31st:
desired_output <-
tib %>%
add_column(belongs_to_school_year = c("1998-1999", "2009-2010", "2010-2011", "1994-1995"))
desired_output
#> # A tibble: 4 x 2
#> my_dates belongs_to_school_year
#> <date> <chr>
#> 1 1999-01-01 1998-1999
#> 2 2010-08-09 2009-2010
#> 3 2010-09-02 2010-2011
#> 4 1995-03-02 1994-1995
How can I create the column belongs_to_school_year using mutate(), based on my_dates?
You can use dplyr and lubridate for this:
desired_output <- tib %>%
mutate(school_year = case_when(month(my_dates) <= 8 ~ paste(year(my_dates)-1, year(my_dates), sep = "-"),
month(my_dates) > 8 ~ paste(year(my_dates), year(my_dates)+1, sep = "-")))
or:
desired_output <- tib %>%
mutate(school_year = if_else(month(my_dates) <= 8,
paste(year(my_dates)-1, year(my_dates), sep = "-"),
paste(year(my_dates), year(my_dates)+1, sep = "-")))
This question already has an answer here:
How to use Pivot_longer to reshape from wide-type data to long-type data with multiple variables
(1 answer)
Closed 1 year ago.
I have a data frame which each row indicates a unique id.
ID <- 1:12
Date1 <- seq(as.Date("2000/1/1"), length.out = 12, by = "months")
Date2 <- seq(as.Date("2001/1/1"), length.out = 12, by = "months")
Date3 <- seq(as.Date("2002/1/1"), length.out = 12, by = "months")
Fcast1 <- rnorm(12)
Fcast2 <- rnorm(12)
Fcast3 <- rnorm(12)
df <- data.frame(ID, Date1, Fcast1, Date2, Fcast2, Date3, Fcast3)
I would like to gather Date1 to Date3 and Fcast1 to Fcast3 columns in two columns of Date and Fcast and repeat IDs 3 times. basically creating long-view of data or rbind-ing each pair of Date and Fcast.
Desired Output shape:
ID <- rep(ID, 3)
Date = c(Date1, Date2, Date3)
Fcast = c(Fcast1, Fcast2, Fcast3)
df <- data.frame(ID, Date, Fcast)
Using tidyr::pivot_longer -
tidyr::pivot_longer(df,
cols = -ID,
names_to = '.value',
names_pattern = '(.*)\\d+')
# A tibble: 36 x 3
# ID Date Fcast
# <int> <date> <dbl>
# 1 1 2000-01-01 0.452
# 2 1 2001-01-01 0.242
# 3 1 2002-01-01 -0.540
# 4 2 2000-02-01 1.54
# 5 2 2001-02-01 0.178
# 6 2 2002-02-01 0.883
# 7 3 2000-03-01 -0.987
# 8 3 2001-03-01 1.40
# 9 3 2002-03-01 0.675
#10 4 2000-04-01 -0.632
# … with 26 more rows
You can do something like:
library(data.table)
setDT(df)
melt(df, measure.vars=patterns("^Date", "^Fcast"), value.name=c("Date", "Fcast"))[,
variable := NULL][]
I am sorry if the title is incomprehensible. I have a data as shown below; 1, 2, 3.. are months of various years. And I want to gather months separately for a and l.
a l
1-2006 3.498939 0.8523857
1-2007 14.801777 0.2457656
1-2008 6.893728 0.5381691
2-2006 2.090962 0.6764694
2-2007 9.192913 0.8740950
2-2016 5.059505 1.1761113
Structure of data is;
data<-structure(list(a = c(3.49893890760882, 14.8017770056402, 6.89372828391484,
2.0909624091048, 9.19291324208917, 5.05950526612261, 13.1570625271881,
14.9570662205959, 7.72453112976811, 12.9331892673657
), l = c(0.852385662732809,
0.245765570168399, 0.538169092055646, 0.676469362818052, 0.874095005203713,
1.17611132212132, 0.76857056091243, 0.622533767341579, 0.9562200838363,
1.10064589903771, 0.85863722854391
)), class = "data.frame", row.names = c("1-2006",
"1-2007", "1-2008",
"2-2006", "2-2007",
"2-2016",
"3-2015", "3-2016", "3-2017", "3-2018"
))
For example; I want to gather all january (1-2005, 1-2006..) and march data(3-2012, 3-2015..) data for a and also for l. Like this one:
january_a
1-2006 3.498939
1-2007 14.801777
1-2008 6.893728
january_l
1-2006 0.8523857
1-2007 0.2457656
1-2008 0.5381691
march_a
3-2012 9.192913
3-2015 5.059505
march_l
3-2012 0.8740950
3-2015 1.1761113
You could add a column which contains only the numerical prefix, and then split on that:
data$prefix <- sub("^(\\d+).*$", "\\1", row.names(data))
data_a <- split(data[,"a"], data$prefix)
data_a
$`1`
[1] 3.498939 14.801777 6.893728
$`2`
[1] 2.090962 9.192913 5.059505
Data:
data <- data.frame(a=c(3.498939, 14.801777, 6.893728, 2.090962, 9.192913, 5.059505),
l=c(0.8523857, 0.2457656, 0.5381691, 0.6764694, 0.8740950, 1.1761113))
row.names(data) <- c("1-2006", "1-2007", "1-2008", "2-2006", "2-2007", "2-2016")
This is another variation that you can try using tidyverse which returns a list of dataframes, where every element has a combination of month and "a" or "l".
library(tidyverse)
data %>%
rownames_to_column('date') %>%
pivot_longer(cols = -date) %>%
separate(date, c('month', 'year'), sep = "-", remove = FALSE) %>%
group_split(month, name)
#[[1]]
# A tibble: 3 x 5
# date month year name value
# <chr> <chr> <chr> <chr> <dbl>
#1 1-2006 1 2006 a 3.50
#2 1-2007 1 2007 a 14.8
#3 1-2008 1 2008 a 6.89
#[[2]]
# A tibble: 3 x 5
# date month year name value
# <chr> <chr> <chr> <chr> <dbl>
#1 1-2006 1 2006 l 0.852
#2 1-2007 1 2007 l 0.246
#3 1-2008 1 2008 l 0.538
#...
#...
This has some additional columns to uniquely identify values which you can remove if not needed.
Another option is group_split
library(purrr)
library(dplyr)
library(stringr)
data %>%
rownames_to_column('rn') %>%
select(rn, a) %>%
group_split(rn = str_remove(rn, '-.*'), keep = FALSE) %>%
map(flatten_dbl)
#[[1]]
#[1] 3.498939 14.801777 6.893728
#[[2]]
#[1] 2.090962 9.192913 5.059505
data
data <- data.frame(a=c(3.498939, 14.801777, 6.893728, 2.090962, 9.192913, 5.059505),
l=c(0.8523857, 0.2457656, 0.5381691, 0.6764694, 0.8740950, 1.1761113))
row.names(data) <- c("1-2006", "1-2007", "1-2008", "2-2006", "2-2007", "2-2016")
I have a series of large matrices and I am just getting used to navigating them in this format and working with functions.
I have minute data for a number of parameters which i have been able to reduce to daily averages - i would like to align each mean output with a date sequence and from there extract the daily average for each year.
In the singular form i have done it like this
A <- matrix(c(1:3285),nrow=3)
AA <- sapply(1:1095, function(x) mean(A [,x], na.rm = TRUE))
D <- seq(from = as.Date("2013-01-01"), to = as.Date("2015-12-31"), by= 1)
df <- cbind.data.frame(D,AA)
Which gets me the means per column aligned to a date for 2013-2015
library(lubridate)
years <- year(as.Date(df$D, "%d-%b-%y"))
day <- yday(as.Date(df$D, "%d-%b-%y"))
#to get the average of DOY over three years
avg <- as.data.frame(tapply(df$AA,day, mean, na.rm=T)) #gives average value on day of year
#Average for specific DOY for each year
av <- as.data.frame(tapply(df$AA,list(day,years), mean, na.rm=T)) #gets the DOY average per year
#bind to get yearly averages and overall average in a data frame format
DF <- cbind(av,avg)
head(DF)
colnames(DF)[4] <- "avg" #rename ts average column
Now say i have multiple matrices (all the same dimension just different parameters) that i want to do this for... is there an efficient way to loop through this so i get a data frame (DF) output for each A-C?
#extra matrices to play with:
B <- matrix(c(3285:6570),nrow=3)
C <- matrix(c(6570:9855),nrow=3)
I have gotten thus far with some initial help on stackoverflow:
#column means for each matrices
vapply(list(A, B, C), colMeans, numeric(1095))
Here's a tidyverse solution. Let
dates <- seq(from = as.Date("2013-01-01"), to = as.Date("2015-12-31"), by = 1)
A <- data.frame(matrix(c(1:3285), ncol = 3, byrow = TRUE))
since I understand that dates are the same to all the matrices. Also, I made A long rather than wide, that's better when working with tidyverse. Then perhaps you would prefer the output in the form of
A %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg))
# A tibble: 1,095 x 4
# Groups: day [365]
# year day dayYearAvg dayAvg
# <dbl> <dbl> <dbl> <dbl>
# 1 2013 1 2 1097
# 2 2013 2 5 1100
# 3 2013 3 8 1103
# ...
If not, we get the same as in your example with
A %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg)) %>%
spread(year, dayYearAvg) %>% ungroup %>% select(-day)
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 1097 2 1097 2192
# 2 1100 5 1100 2195
# 3 1103 8 1103 2198
# 4 1106 11 1106 2201
# ...
Now let also
B <- data.frame(matrix(c(3285:6569), ncol = 3, byrow = TRUE))
C <- data.frame(matrix(c(6570:9854), ncol = 3, byrow = TRUE))
l <- list(A, B, C)
This gives
map(l, . %>% group_by(year = year(dates), day = yday(dates)) %>%
summarise(dayYearAvg = mean(c(X1, X2, X3))) %>%
group_by(day) %>% mutate(dayAvg = mean(dayYearAvg)) %>%
spread(year, dayYearAvg) %>% ungroup %>% select(-day))
# [[1]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 1097 2 1097 2192
# 2 1100 5 1100 2195
# ...
# [[2]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 4381 3286 4381 5476
# 2 4384 3289 4384 5479
# ...
# [[3]]
# A tibble: 365 x 4
# dayAvg `2013` `2014` `2015`
# <dbl> <dbl> <dbl> <dbl>
# 1 7666 6571 7666 8761
# 2 7669 6574 7669 8764
# ...
Here's a tinyverse solution (i.e., no third-party packages) that wraps your process in a function to receive a matrix as input and return data frame as output. Then run lapply on a list of matrices.
df_process <- function(mat) {
# CREATE DF AND ADD NEW COLUMNS
df <- within(data.frame(D=seq(from = as.Date("2013-01-01"),
to = as.Date("2015-12-31"), by= 1),
AA=sapply(1:1095, function(x) mean(mat[,x], na.rm=TRUE))),
{
year <- format(as.Date(df$D, origin="1970-01-01"), "%Y")
day <- format(as.Date(df$D, origin="1970-01-01"), "%d")
})
# CREATE DF WITH TAPPLY CALLS, RENAME COLUMNS
df <- setNames(data.frame(tapply(df$AA,list(day,years), mean, na.rm=T),
avg = c(tapply(df$AA, day, mean, na.rm=T))),
c("2013", "2014", "2015", "avg"))
}
A <- matrix(c(1:3285),nrow=3)
B <- matrix(c(3286:6570),nrow=3)
C <- matrix(c(6571:9855),nrow=3)
# NAMED LIST OF DATA FRAMES
DF_list <- setNames(lapply(list(A, B, C), df_process), c("A", "B", "C"))
all.equal(DF, DF_list$A)
# [1] TRUE
identical(DF, DF_list$A)
# [1] TRUE
Output
lapply(DF_list, head)
# $A
# 2013 2014 2015 avg
# 01 501.5 1596.5 2691.5 1596.5
# 02 504.5 1599.5 2694.5 1599.5
# 03 507.5 1602.5 2697.5 1602.5
# 04 510.5 1605.5 2700.5 1605.5
# 05 513.5 1608.5 2703.5 1608.5
# 06 516.5 1611.5 2706.5 1611.5
# $B
# 2013 2014 2015 avg
# 01 3786.5 4881.5 5976.5 4881.5
# 02 3789.5 4884.5 5979.5 4884.5
# 03 3792.5 4887.5 5982.5 4887.5
# 04 3795.5 4890.5 5985.5 4890.5
# 05 3798.5 4893.5 5988.5 4893.5
# 06 3801.5 4896.5 5991.5 4896.5
# $C
# 2013 2014 2015 avg
# 01 7071.5 8166.5 9261.5 8166.5
# 02 7074.5 8169.5 9264.5 8169.5
# 03 7077.5 8172.5 9267.5 8172.5
# 04 7080.5 8175.5 9270.5 8175.5
# 05 7083.5 8178.5 9273.5 8178.5
# 06 7086.5 8181.5 9276.5 8181.5
I'm looking to remove all dates that have any NAs/missing data for any observation type, from a data.frame that has duplicate dates. For example, here I'd like to end up with a data.frame of just three rows with 2018-12-05 data. My non-example data.frame is quite long, so I'm leaning toward run time over readability, if there are a few ways, and I'm open to tidyverse and non-tidyverse ways.
date_time <- rep(seq(from=as.POSIXct("2018-12-01", tz="GMT"),
to=as.POSIXct("2018-12-05", tz="GMT"), by="1 day"),3)
value <- c(1,2,NA,NA,5,NA,NA,NA,4,5,7,NA,NA,NA,8)
class <- c(rep("a", 5), rep("b", 5), rep("c", 5))
df <- data.frame(date_time, value, class)
from:
filtered to:
If you need speed, I don't know. But the following function keeps only the rows where the dates have all values of class.
fun <- function(DF){
DF2 <- DF[!is.na(DF$value), ]
u <- unique(DF2$class)
sp <- split(DF2, DF2$date_time)
inx <- sapply(sp, function(d){
all(u %in% d$class)
})
DF2 <- do.call(rbind, sp[inx])
row.names(DF2) <- NULL
DF2
}
fun(df)
# date_time value class
#1 2018-12-05 5 a
#2 2018-12-05 5 b
#3 2018-12-05 8 c
Edit.
Here is a comparative speed test. Camille's answer is faster for larger dataframes, where speed is more important. And is nicer.
library(microbenchmark)
library(ggplot2)
library(dplyr)
fun2 <- function(DF){
DF %>%
arrange(date_time, class) %>%
group_by(date_time) %>%
mutate(all_vals = all(!is.na(value))) %>%
filter(all_vals)
}
mb <- microbenchmark(
rui = fun(df),
camille = fun2(df)
)
mb1 <- microbenchmark(
rui = fun(df1),
camille = fun2(df1)
)
ap <- autoplot(mb)
ap1 <- autoplot(mb1)
cowplot::plot_grid(ap, ap1)
Within some dplyr functions, you can use the base all. Group by date, then find whether all values for each group are non-NA.
With some extra steps to illustrate:
library(dplyr)
df %>%
arrange(date_time, class) %>%
group_by(date_time) %>%
mutate(all_vals = all(!is.na(value))) %>%
filter(all_vals)
#> # A tibble: 3 x 4
#> # Groups: date_time [1]
#> date_time value class all_vals
#> <dttm> <dbl> <fct> <lgl>
#> 1 2018-12-05 00:00:00 5 a TRUE
#> 2 2018-12-05 00:00:00 5 b TRUE
#> 3 2018-12-05 00:00:00 8 c TRUE
Or more briefly, find the non-NAs within filter:
df %>%
group_by(date_time) %>%
filter(all(!is.na(value)))
#> # A tibble: 3 x 3
#> # Groups: date_time [1]
#> date_time value class
#> <dttm> <dbl> <fct>
#> 1 2018-12-05 00:00:00 5 a
#> 2 2018-12-05 00:00:00 5 b
#> 3 2018-12-05 00:00:00 8 c
Created on 2018-12-01 by the reprex package (v0.2.1)