Select specific date / hour range from list elements and create dataframe - r

I have a list with approximately 150 elements (data frames) of weather data (ID,date,time,temperature). I want to select specific date range and time from each list element (df) and create a data frame (or multiple) with these selected rows. Given the fact I can't provide real data I've created a reproducible example:
library(lubridate)
library(dplyr)
library(tidyr)
library(purrr)
z1 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z2 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z3 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
z4 <- seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date()+780), by = "10 min")
temperature1 <- runif(112321, min = -5, max = 45)
temperature2 <- runif(112321, min = -5, max = 45)
temperature3 <- runif(112321, min = -5, max = 45)
temperature4 <- runif(112321, min = -5, max = 45)
station1 <- data.frame(date = z1, temp = temperature1)
station2 <- data.frame(date = z2, temp = temperature2)
station3 <- data.frame(date = z3, temp = temperature3)
station4 <- data.frame(date = z4, temp = temperature4)
##isolate date from time
station1 <- separate(station1, date, c("date", "time"), sep = " ")
station2 <- separate(station2, date, c("date", "time"), sep = " ")
station3 <- separate(station3, date, c("date", "time"), sep = " ")
station4 <- separate(station4, date, c("date", "time"), sep = " ")
## list of all stations
stations_list <- list(station1,station2,station3,station4)
#create a column with station ID (name) ##
ID_names <- c("station1","station2","station3","station4")
stations_list <- mapply(cbind,stations_list, "ID" = ID_names, SIMPLIFY = F)
Now in this list I want to select specific date and time range so I used the following script:
selected_date_time <- map_dfr(stations_list,
~ filter(.x, date >= "2021-06-01" &
date <= "2021-10-15" & time >= "18:00" & time <= "10:00" |
date > "2022-08-18" & date <= "2022-10-05" & time >= "09:00"
& time <= "17:00"))
In this case, I got a data frame with only 2022 year and no selection fro 2021. I changed slightly the code and I selected different hour range :
selected_date_time <- map_dfr(stations_list,
~ filter(.x, date >= "2021-06-01" &
date <= "2021-10-15" & time >= "18:00" & time <= "10:00" |
date > "2022-08-18" & date <= "2022-10-05" & time <= "09:00"
& time >= "17:00"))
In the last case I got a data frame with zero observations. What am I doing wrong ?!

As mentioned by #AntoniosK, your filter logic was off so I made a few amendments but most importantly, for this filter to work, we need to make sure the date and time are "date" and "time" class respectively.
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following objects are masked from 'package:base':
#>
#> date, intersect, setdiff, union
library(tidyverse)
library(hms)
#>
#> Attaching package: 'hms'
#> The following object is masked from 'package:lubridate':
#>
#> hms
z1 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z2 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z3 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
z4 <-seq.POSIXt(as.POSIXct(Sys.Date()), as.POSIXct(Sys.Date() + 780), by = "10 min")
temperature1 <- runif(112321, min = -5, max = 45)
temperature2 <- runif(112321, min = -5, max = 45)
temperature3 <- runif(112321, min = -5, max = 45)
temperature4 <- runif(112321, min = -5, max = 45)
station1 <- tibble(date = z1, temp = temperature1)
station2 <- tibble(date = z2, temp = temperature2)
station3 <- tibble(date = z3, temp = temperature3)
station4 <- tibble(date = z4, temp = temperature4)
station1 <- station1 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station2 <- station2 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station3 <- station3 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
station4 <- station4 %>%
mutate(time = hms::as_hms(date),
date = as_date(date)) %>%
relocate(date, time)
## list of all stations
stations_list <- list(station1, station2, station3, station4)
#create a column with station ID (name) ##
ID_names <- c("station1", "station2", "station3", "station4")
stations_list <-
mapply(cbind, stations_list, "ID" = ID_names, SIMPLIFY = F)
stations_list %>%
map_dfr(~ filter(
.x,
(
between(date, as.Date("2021-06-01"), as.Date("2021-10-15")) &
(time >= as_hms("18:00:00") | time <= as_hms("10:00:00"))
) |
(date > as.Date("2022-08-18") &
date <= as.Date("2022-10-05")) &
(time <= as_hms("09:00:00") | time >= as_hms("17:00:00"))
)) %>%
arrange(date) %>%
head()
#> date time temp ID
#> 1 2021-06-01 00:00:00 20.259581 station1
#> 2 2021-06-01 00:10:00 37.558833 station1
#> 3 2021-06-01 00:20:00 18.729679 station1
#> 4 2021-06-01 00:30:00 5.880394 station1
#> 5 2021-06-01 00:40:00 2.393515 station1
#> 6 2021-06-01 00:50:00 36.030296 station1
Created on 2021-05-26 by the reprex package (v2.0.0)

Related

Find time interval where value rapidly decrease

I have timeseries with datetime and value, sometimes I see value is rapidly goes down. I need to find in what time did it happen.
I was thinking I need a kind of rolling window (5 mins for example) and comparing min and max Values in this window. If max/min > 2 - that means Value decrease is 50% - bingo. But how to build and analyze that windows? Or maybe there is better idea?
My example:
library(ggplot2)
set.seed(900)
data1 <-
data.frame(
datetime = seq.POSIXt(
as.POSIXct("2020-12-26 10:00:00"),
as.POSIXct("2020-12-26 10:00:00") + 15 * 500,
"15 sec"
),
Value = sample(140000:150000, 501, replace = T)
)
data2 <- data.frame(
datetime = seq.POSIXt(
as.POSIXct("2020-12-26 12:05:15"),
as.POSIXct("2020-12-26 12:05:15") + 15 * 100,
"15 sec"
),
Value = sample(100000:110000, 101, replace = T)
)
data3 <- data.frame(
datetime = seq.POSIXt(
as.POSIXct("2020-12-26 12:30:30"),
as.POSIXct("2020-12-26 12:30:30") + 15 * 299,
"15 sec"
),
Value = seq(100100, 130000, by = 100)
)
data4 <- data.frame(
datetime = seq.POSIXt(
as.POSIXct("2020-12-26 13:45:30"),
as.POSIXct("2020-12-26 13:45:30") + 15 * 2398,
"15 sec"
),
Value = seq(100, 120000, by = 50)
)
data <- do.call("rbind", list(data1, data2, data3, data4))
ggplot (data, aes(x = datetime, y = Value)) + theme_bw() + geom_line()
I am not sure is there a more elegant way to obtain the "rapidly decrease" but I would think that way;
library(dplyr)
data %>%
mutate(Value2=lag(Value),diff=abs(Value2-Value)) %>%
arrange(-diff) %>%
head(2)
output;
datetime Value Value2 diff
<dttm> <dbl> <dbl> <dbl>
1 2020-12-26 13:45:30 100 130000 129900
2 2020-12-26 12:05:15 102133 142219 40086
OR to obtain extreme decreases, I would calculate z scores;
data %>%
mutate(Value2=lag(Value),diff=abs(Value2-Value)) %>%
mutate(z_score=(diff-mean(diff,na.rm=T))/sd(diff,na.rm=T)) %>%
filter(z_score>3)
output;
datetime Value Value2 diff z_score
<dttm> <dbl> <dbl> <dbl> <dbl>
1 2020-12-26 11:15:45 140014 149687 9673 3.11
2 2020-12-26 12:05:15 102133 142219 40086 13.7
3 2020-12-26 12:14:15 109525 100112 9413 3.02
4 2020-12-26 13:45:30 100 130000 129900 44.9

How to create a date with "as.Date" from 3 different vectors (day, month, years)?

I have 3 different vectors including data of days, months and years. I would like to merge these 3 into one and add it to a new column of my data frame. I tried to use "as.Date" to merge these 3 vectors but it won't work...
Could you help me? :)
Here is my code:
Day<- substr(x = my_meteo_charleroi$Local.Time, start = 1, stop =2 )
Month<- substr(x = my_meteo_charleroi$Local.Time, start = 4, stop =5 )
Year<- substr(x = my_meteo_charleroi$Local.Time, start = 7, stop =10 )
my_date<- as.Date(c(Day, Month, Year), format = c("%d, %m, %y"))
Does this work:
Day <- c(10,11,12)
Month <- c(11,11,12)
Year <- c(2019,2020,2020)
library(tibble)
library(dplyr)
tibble(Day, Month, Year) %>%
mutate(my_date = paste(Day, Month, Year, sep = '-')) %>%
mutate(my_date = as.Date(my_date, format = '%d-%m-%Y', origin = '1970-01-01')) %>%
pull(my_date)
[1] "2019-11-10" "2020-11-11" "2020-12-12"
Dataframe with my_date column looks like this:
tibble(Day, Month, Year) %>% mutate(my_date = paste(Day, Month, Year, sep = '-')) %>%
mutate(my_date = as.Date(my_date, format = '%d-%m-%Y', origin = '1970-01-01'))
# A tibble: 3 x 4
Day Month Year my_date
<dbl> <dbl> <dbl> <date>
1 10 11 2019 2019-11-10
2 11 11 2020 2020-11-11
3 12 12 2020 2020-12-12
You are pretty close already, just two modifications needed.
Both arguments to as.Date() should be strings here, not vectors.
as.Date(paste(Day, Month, Year, sep = "-"), format = '%d-%m-%y')

How to put/save all elements of a List into one Excel sheet in R?

I have a list (bbb) with 5 elements in it, i.e., each element for a year, like 2010, 2011, ... , 2014:
The first one in the list is this:
> bbb[1]
$`2010`
Date Average
X2010.01.01 2010-01-01 2.079090e-03
X2010.01.02 2010-01-02 5.147627e-04
X2010.01.03 2010-01-03 2.997464e-04
X2010.01.04 2010-01-04 1.375538e-04
X2010.01.05 2010-01-05 1.332109e-04
The second one in the list is this:
> bbb[2]
$`2011`
Date Average
X2011.01.01 2011-01-01 1.546253e-03
X2011.01.02 2011-01-02 1.152864e-03
X2011.01.03 2011-01-03 1.752446e-03
X2011.01.04 2011-01-04 2.639658e-03
X2011.01.05 2011-01-05 5.231150e-03
X2011.01.06 2011-01-06 8.909878e-04
And so on.
Here is my question:
How can I save all of these list's elements in 1 sheet of an Excel file to have something like this:
Your help would be highly appreciated.
You can do this using dcast.
bbb <- list(`2010` = data.frame(date = as.Date("2010-01-01") + 0:4,
avg = 1:5),
`2011` = data.frame(date = as.Date("2011-01-01") + 0:5,
avg = 11:16),
`2012` = data.frame(date = as.Date("2012-01-01") + 0:9,
avg = 21:30),
`2013` = data.frame(date = as.Date("2013-01-01") + 0:7,
avg = 21:28))
df <- do.call("rbind", bbb)
df$year <- format(df$date, format = "%Y")
df$month_date <- format(df$date, format = "%b-%d")
library(data.table)
library(openxlsx)
df_dcast <- dcast(df, month_date~year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")
Or using spread
library(dplyr)
library(tidyr)
df2 <- df %>%
select(-date) %>%
spread(key = year, value = avg)
write.xlsx(df2, "example2.xlsx")
This isn't very pretty, but it's the best I could think of right now. But you could take the dataframes and loop through the list, joining them by date like this:
library(tidyverse)
library(lubridate)
bbb <- list(`2010` = tibble(date = c('01-01-2010', '01-02-2010', '01-03-2010', '01-04-2010', '01-05-2010'),
average = 11:15),
`2011` = tibble(date = c('01-01-2011', '01-02-2011', '01-03-2011', '01-04-2011', '01-05-2011'),
average = 1:5),
`2012` = tibble(date = c('01-01-2012', '01-02-2012', '01-03-2012', '01-04-2012', '01-05-2012'),
average = 6:10))
for (i in seq_along(bbb)) {
if(i == 1){
df <- bbb[[i]] %>%
mutate(
date = paste(day(as.Date(date, format = '%m-%d-%Y')),
month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
sep = '-')
)
colnames(df) <- c('date', names(bbb[i])) # Assuming your list of dataframes has just 2 columns: date and average
} else {
join_df <- bbb[[i]] %>%
mutate(
date = paste(day(as.Date(date, format = '%m-%d-%Y')),
month(as.Date(date, format = '%m-%d-%Y'), label = TRUE),
sep = '-')
)
colnames(join_df) <- c('date', names(bbb[i]))
df <- full_join(df, join_df, by = 'date')
}
}
This loops through the list of dataframes and reformats the dates to Day-Month.
# A tibble: 5 x 4
date `2010` `2011` `2012`
<chr> <int> <int> <int>
1 1-Jan 11 1 6
2 2-Jan 12 2 7
3 3-Jan 13 3 8
4 4-Jan 14 4 9
5 5-Jan 15 5 10
You could then write that out with the writexl package function write_xlsx

Using the pipe in selfmade function with tidyeval (quo_name)

I have two functions: date_diff and group_stat. So I have read this article tidyverse and I try so create simple functions and use the pipe.
The first function creates a difftime and names them timex_minus_timey but when I pipe this result into the next function I have to look at the name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9

Compute daily, month and annual average of several data sets

I have a data frame:
MS_NR SS_NR DATE HOUR VALUE
1 13095010 68 1/01/2014 0:00:00 9,8
2 13095010 68 1/01/2014 1:00:00 8,0
3 13095010 68 1/01/2014 2:00:00 NA
4 13095010 68 1/01/2014 3:00:00 7,5
5 13095010 68 1/01/2014 4:00:00 7,0
6 13095010 68 1/01/2014 5:00:00 8,5
are temperature observations of a weather station taken every hour, I want to calculate the daily, weekly, monthly and annual averages of several data frames of different weather stations. How can I do this within a loop, so that the process is not repetitive?
When working with hydro-meteorological data, I usually use xts and hydroTSM packages as they have many functions for data aggregation.
You didn't provide any data so I created one for demonstration purpose
library(xts)
library(hydroTSM)
# Generate random data
set.seed(2018)
date = seq(from = as.Date("2016-01-01"), to = as.Date("2018-12-31"),
by = "days")
temperature = runif(length(date), -15, 35)
dat <- data.frame(date, temperature)
# Convert to xts object for xts & hydroTSM functions
dat_xts <- xts(dat[, -1], order.by = dat$date)
# All daily, monthly & annual series in one plot
hydroplot(dat_xts, pfreq = "dma", var.type = "Temperature")
# Weekly average
dat_weekly <- apply.weekly(dat_xts, FUN = mean)
plot(dat_weekly)
# Monthly average
dat_monthly <- daily2monthly(dat_xts, FUN = mean, na.rm = TRUE)
plot.zoo(dat_monthly, xaxt = "n", xlab = "")
axis.Date(1, at = pretty(index(dat_monthly)),
labels = format(pretty(index(dat_monthly)), format = "%b-%Y"),
las = 1, cex.axis = 1.1)
# Seasonal average: need to specify the months
dat_seasonal <- dm2seasonal(dat_xts, season = "DJF", FUN = mean, na.rm = TRUE)
plot(dat_seasonal)
# Annual average
dat_annual <- daily2annual(dat_xts, FUN = mean, na.rm = TRUE)
plot(dat_annual)
Edit: using OP's data
df <- readr::read_csv2("Temp_2014_Hour.csv")
str(df)
# Convert DATE to Date object & put in a new column
df$date <- as.Date(df$DATE, format = "%d/%m/%Y")
dat <- df[, c("date", "VALUE")]
str(dat)
dat_xts <- xts(dat[, -1], order.by = dat$date)
Created on 2018-02-28 by the reprex package (v0.2.0).
I try this
first using read.table load the file
library(openair)
Temp <- read.table (file, header=TRUE, sep=";",stringsAsFactors = FALSE, dec = ",", na.strings = "NA")
tiempos <- Temp$HOUR
timestamps <- as.POSIXlt(as.POSIXct('1900-1-1', tz='UTC')
+ as.difftime(as.character(tiempos))
time <- format(timestamps, format='%H:%M:%S')
date<-paste(Temp[,3], time, sep=" ")
date
Temp_met <- cbind(date, CovTemp[-c(3,4)])
Temp_met$date <- as.POSIXct(strptime(Met_CovTemp$date,
format = "%d/%m/%Y %H:%M", "GMT"))
## daily mean
Temp_daily <- timeAverage(Met_CovTemp, avg.time = "day")
## weekly mean
Temp_week <- timeAverage(Met_CovTemp, avg.time = "week")
## monthly mean
Temp_month <- timeAverage(Met_CovTemp, avg.time = "month")
## annual mean
Temp_annual <- timeAverage(Met_CovTemp, avg.time = "year")

Resources