require(dplyr)
df <- data.frame(Date.time = c("2015-01-01 00:00:00", "2015-01-01 00:30:00", "2015-01-01 01:00:00", "2015-01-01 01:30:00", "2015-01-01 02:00:00"),
RH33HMP = c(99.6,99.6,99.5,99.3,98.63),
RH33HMP_f = c(9,9,92,93,9),
RH38HMP = c(99.6,99.6,99.5,99.3,98.63),
RH38HMP_f = c(9,902,9,9,91))
Here is some example data.frame.
I'd like to set every value to NA where the corresponding quality column (_f) contains something else than 9. First, I grep the column number with the actual measurements:
col_var <- grep("^Date.|_f$", names(df), invert = T)
Then I use dplyr and mutate_at with an if_else function. My problem is, that mutate_at iterates through all the columns of col_val, but the function itself does not. I tried several examples that I found on stackoverflow, but none of them seem to work.
# does not work
df_qc <- df %>%
mutate_at(.vars = col_var,
.funs = list(~ ifelse(df[, col_var+1] == 9, ., NA)))
i=1
df_qc <- df %>%
mutate_at(.vars = col_var,
.funs = list(~ ifelse(df[, i+1] == 9, ., NA)))
I think I am quite close, any help appreciated.
We can use Map :
df[col_var] <- Map(function(x, y) {y[x != 9] <- NA;y},df[col_var + 1],df[col_var])
df
# Date.time RH33HMP RH33HMP_f RH38HMP RH38HMP_f
#1 2015-01-01 00:00:00 99.60 9 99.6 9
#2 2015-01-01 00:30:00 99.60 9 NA 902
#3 2015-01-01 01:00:00 NA 92 99.5 9
#4 2015-01-01 01:30:00 NA 93 99.3 9
#5 2015-01-01 02:00:00 98.63 9 NA 91
Similarly, you can use map2 in purrr if you prefer tidyverse.
df[col_var] <- purrr::map2(df[col_var + 1],df[col_var], ~{.y[.x != 9] <- NA;.y})
One dplyr and purrr option could be:
map2_dfr(.x = df %>%
select(ends_with("HMP")),
.y = df %>%
select(ends_with("_f")),
~ replace(.x, .y != 9, NA)) %>%
bind_cols(df %>%
select(-ends_with("HMP")))
RH33HMP RH38HMP Date.time RH33HMP_f RH38HMP_f
<dbl> <dbl> <fct> <dbl> <dbl>
1 99.6 99.6 2015-01-01 00:00:00 9 9
2 99.6 NA 2015-01-01 00:30:00 9 902
3 NA 99.5 2015-01-01 01:00:00 92 9
4 NA 99.3 2015-01-01 01:30:00 93 9
5 98.6 NA 2015-01-01 02:00:00 9 91
Related
I am basic R user.However I cannot create the plot which shows the daily minimum temperature (in F) of the weather dataset, which is given in nycflights13. I need tocreate a new column called date with str_c, such that date should be given as follows "YYYY-MM-DD".I use the lubridate package but it gives error.
Someone has explanation for that?
lubridate::make_date can solve the problem.
Below is the documentation.
make_date(year = 1970L, month = 1L, day = 1L)
library(tidyverse)
library(nycflights13)
library(lubridate)
df <- weather
df <- df %>%
mutate(date = make_date(year, month, day)) %>%
group_by(date) %>%
summarize(min_temp = min(temp)*1.8 + 32)
> df
# A tibble: 364 x 2
date min_temp
<date> <dbl>
1 2013-01-01 80.5
2 2013-01-02 73.4
3 2013-01-03 78.9
4 2013-01-04 84.1
5 2013-01-05 89.6
6 2013-01-06 91.5
7 2013-01-07 89.6
8 2013-01-08 84.1
9 2013-01-09 93.2
10 2013-01-10 102.
# ... with 354 more rows
ggplot(df) + geom_point(aes(x = date, y = min_temp))
Also this might solve your code.
as.Date(str_c(weather$year, "-", weather$month, "-", weather$day), format = "%Y-%m-%d")
I have dataframe loaded by a csv which is like:
A tibble: 5 x 8
ID EventType EventDate EventValue EventValueExt1 EventValueExt2 EventValueExt3 EventValueExt4
<dbl> <chr> <dttm> <chr> <dbl> <dbl> <dbl> <dbl>
1 12340 steps 2019-11-26 21:18:00 3017 NA NA NA NA
2 12339 steps 2019-11-25 14:23:00 3016 NA NA NA NA
3 12338 steps 2019-11-25 14:00:00 3015 NA NA NA NA
4 12337 geo_logging 2019-11-22 19:10:00 40.748498,-73.9933~ 16.4 0 16.8 0
5 12336 geo_logging 2019-11-22 19:09:00 40.7484843,-73.993~ 22.2 0 16.8 0
Then I try to create a DT::datatable with: datatable(device1_report1583417393205)
and I get:
As you can see my datetime has a strange format. I try
class(device1_report1583417393205$EventDate)
[1] "POSIXct" "POSIXt"
and then I try:
library(lubridate)
library(chron)
device1_report1583417393205$EventDate <- ymd_hms(device1_report1583417393205$EventDate)
chron(dates = format(device1_report1583417393205$EventDate, '%Y-%m-%d'), time = format(device1_report1583417393205$EventDate, "%H:%M:%S"),
format = c('y-m-d', 'h:m:s'))
but still get the same result in my DT. I also tried to make them factors but still the same. Any ideas?
EDIT:REPRODUCIBLE EXAMPLE
require(lubridate)
require(dplyr)
df = data.frame(timestring = c("2015-12-12 13:34:56", "2015-12-14 16:23:32"),
localzone = c("America/Los_Angeles", "America/New_York"), stringsAsFactors = F)
df$moment = as.POSIXct(df$timestring, format="%Y-%m-%d %H:%M:%S", tz="UTC")
df = df %>% rowwise() %>% mutate(localtime = force_tz(moment, localzone))
df
DT::datatable(df)
I have a dataset that contains start and end time stamps, as well as a performance percentage. I'd like to calculate group statistics over hourly blocks, e.g. "the average performance for the midnight hour was x%."
My question is if there is a more efficient way to do this than a series of ifelse() statements.
# some sample data
pre.starting <- data.frame(starting = format(seq.POSIXt(from =
as.POSIXct(Sys.Date()), to = as.POSIXct(Sys.Date()+1), by = "5 min"),
"%H:%M", tz="GMT"))
pre.ending <- data.frame(ending = pre.starting[seq(1, nrow(pre.starting),
2), ])
ending2 <- pre.ending[-c(1), ]
starting2 <- data.frame(pre.starting = pre.starting[!(pre.starting$starting
%in% pre.ending$ending),])
dataset <- data.frame(starting = starting2
, ending = ending2
, perct = rnorm(nrow(starting2), 0.5, 0.2))
For example, I could create hour blocks with code along the lines of the following:
dataset2 <- dataset %>%
mutate(hour = ifelse(starting >= 00:00 & ending < 01:00, 12
, ifelse(starting >= 01:00 & ending < 02:00, 1
, ifelse(starting >= 02:00 & ending < 03:00, 13)))
) %>%
group_by(hour) %>%
summarise(mean.perct = mean(perct, na.rm=T))
Is there a way to make this code more efficient, or improve beyond ifelse()?
We can use cut ending hour based on hourly interval after converting timestamps into POSIXct and then take mean for each hour.
library(dplyr)
dataset %>%
mutate_at(vars(pre.starting, ending), as.POSIXct, format = "%H:%M") %>%
group_by(ending_hour = cut(ending, breaks = "1 hour")) %>%
summarise(mean.perct = mean(perct, na.rm = TRUE))
# ending_hour mean.perct
# <fct> <dbl>
# 1 2019-09-30 00:00:00 0.540
# 2 2019-09-30 01:00:00 0.450
# 3 2019-09-30 02:00:00 0.612
# 4 2019-09-30 03:00:00 0.470
# 5 2019-09-30 04:00:00 0.564
# 6 2019-09-30 05:00:00 0.437
# 7 2019-09-30 06:00:00 0.413
# 8 2019-09-30 07:00:00 0.397
# 9 2019-09-30 08:00:00 0.492
#10 2019-09-30 09:00:00 0.613
# … with 14 more rows
I have the following data frame:
Date_from <- c("2013-02-01","2013-05-10","2013-08-13","2013-02-01","2013-05-10","2013-08-13","2013-02-01","2013-05-10","2013-08-13")
Date_to <- c("2013-05-07","2013-08-12","2013-11-18","2013-05-07","2013-08-12","2013-11-18","2013-05-07","2013-08-12","2013-11-18")
y <- data.frame(Date_from,Date_to)
y$concentration <- c("1.5","2.5","1.5","3.5","1.5","2.5","1.5","3.5","3")
y$Parameter<-c("A","A","A","B","B","B","C","C","C")
y$Date_from <- as.Date(y$Date_from)
y$Date_to <- as.Date(y$Date_to)
y$concentration <- as.numeric(y$concentration)
I will need to check the data frame if for EACH Parameter the date range begins at the first day of the year (2013-01-01) and ends at the last day of the year (2013-12-31). If not I will need to add an extra row at the beginning and at the end for each of the parameters to complete the date range to a full year for each parameter. The result should look like this:
Date_from Date_to concentration Parameter
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 1.5 A
2013-05-10 2013-08-12 2.5 A
2013-08-13 2013-11-18 1.5 A
2013-11-19 2013-12-31 NA NA
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 3.5 B
2013-05-10 2013-08-12 1.5 B
2013-08-13 2013-11-18 2.5 B
2013-11-19 2013-12-31 NA NA
2013-01-01 2013-01-31 NA NA
2013-02-01 2013-05-07 1.5 C
2013-05-10 2013-08-12 3.5 C
2013-08-13 2013-11-18 3.0 C
2013-11-19 2013-12-31 NA NA
Please note: The date ranges are only equal in this example for simplification.
UPDATE: This is my original data snippet and code:
sm<-read.csv("https://www.dropbox.com/s/tft6inwcrjqujgt/Test_data.csv?dl=1",sep=";",header=TRUE)
cleaned_sm<-sm[,c(4,5,11,14)] ##Delete obsolete columns
colnames(cleaned_sm)<-c("Parameter","Concentration","Date_from","Date_to")
cleaned_sm$Date_from<-as.Date(cleaned_sm$Date_from, format ="%d.%m.%Y")
cleaned_sm$Date_to<-as.Date(cleaned_sm$Date_to, format ="%d.%m.%Y")
#detect comma decimal separator and replace with dot decimal separater as comma is not recognised as a number
cleaned_sm=lapply(cleaned_sm, function(x) gsub(",", ".", x))
cleaned_sm<-data.frame(cleaned_sm)
cleaned_sm$Concentration <- as.numeric(cleaned_sm$Concentration)
cleaned_sm$Date_from <- as.Date(cleaned_sm$Date_from)
cleaned_sm$Date_to <- as.Date(cleaned_sm$Date_to)
Added code based on #jasbner:
cleaned_sm %>%
group_by(Parameter) %>%
do(add_row(.,
Date_from = ymd(max(Date_to))+1 ,
Date_to = ymd(paste(year(max(Date_to)),"1231")),
Parameter = .$Parameter[1])) %>%
do(add_row(.,
Date_to = ymd(min(Date_from))-1,
Date_from = ymd(paste(year(min(Date_from)),"0101")) ,
Parameter = .$Parameter[1],
.before = 0)) %>%
filter(!duplicated(Date_from,fromLast = T),!duplicated(Date_to))
My attempt with dplyr and lubridate. Hacked together but I think it should work. Note this does not look for any gaps in the middle of the date ranges. Basically, for each group, you add a row before and after that particular group. Then if there are any cases where the date range starts at the beginning of the year or ends at the end of the year the added rows are filtered out.
library(dplyr)
library(lubridate)
cleaned_sm %>%
group_by(Parameter) %>%
do(add_row(.,
Date_from = ymd(max(.$Date_to))+1 ,
Date_to = ymd(paste(year(max(.$Date_to)),"1231")),
Parameter = .$Parameter[1])) %>%
do(add_row(.,
Date_to = ymd(min(.$Date_from))-1,
Date_from = ymd(paste(year(min(.$Date_from)),"0101")) ,
Parameter = .$Parameter[1],
.before = 0)) %>%
filter(!duplicated(Date_from,fromLast = T),!duplicated(Date_to))
# A tibble: 15 x 4
# Groups: Parameter [3]
# Date_from Date_to concentration Parameter
# <date> <date> <dbl> <chr>
# 1 2013-01-01 2013-01-31 NA A
# 2 2013-02-01 2013-05-07 1.50 A
# 3 2013-05-10 2013-08-12 2.50 A
# 4 2013-08-13 2013-11-18 1.50 A
# 5 2013-11-19 2013-12-31 NA A
# 6 2013-01-01 2013-01-31 NA B
# 7 2013-02-01 2013-05-07 3.50 B
# 8 2013-05-10 2013-08-12 1.50 B
# 9 2013-08-13 2013-11-18 2.50 B
# 10 2013-11-19 2013-12-31 NA B
# 11 2013-01-01 2013-01-31 NA C
# 12 2013-02-01 2013-05-07 1.50 C
# 13 2013-05-10 2013-08-12 3.50 C
# 14 2013-08-13 2013-11-18 3.00 C
# 15 2013-11-19 2013-12-31 NA C
This seems like it requires a combination of different packages to attack it. I am using tidyr, data.table, and I used lubridate.
date.start <- seq.Date(as.Date("2013-01-01"), as.Date("2013-12-31"), by = "day")
Date.Int <- data.frame(Date_from = date.start, Date_to = date.start)
y_wide <- y %>% spread(Parameter, concentration)
y_wide <- as.data.table(setkey(as.data.table(y_wide), Date_from, Date_to))
Date.Int <- as.data.table(setkey(as.data.table(Date.Int), Date_from, Date_to))
dats <- foverlaps(Date.Int, y_wide, nomatch = NA)
fin.dat <- dats %>%
mutate(A = ifelse(is.na(A), -5, A),
seqs = cumsum(!is.na(A) & A != lag(A, default = -5))) %>%
group_by(seqs) %>%
summarise(Date_from = first(i.Date_from),
Date_to = last(i.Date_to) ,
A = first(A),
B = first(B),
C = first(C)) %>%
mutate(A = ifelse(A == -5, NA, A)) %>%
ungroup()%>%
gather(Concentration, Parameter, A:C) %>%
mutate(Concentration = ifelse(is.na(Parameter), NA, Concentration))
Okay, so I created a vector of dates from a start point to an end point (date.start); then I turned into a data.frame with the same interval names and interval dates for Date.Int. This is because foverlaps needs to compare two intervals (same date start and end dates in Date.Int are now officially intervals). I then took your data you provided and spread, turning it from long format data to wide format data and turned that into a data.table. keying a data.table sets up how it should be arranged, and when using foverlaps you have to key the start dates and end dates (in that order). foverlaps determines if an interval falls within another interval of dates. If you print out dats, you will see a bunch of lines with NA for everything because they did not fall within an interval. So now we have to group these in some manner. I picked grouping by values of "A" in dats. The grouping variable is called seqs. But then I summarised the data, and then switched it back from wide format to long format and replaced the appropriate NA values.
I have the following data frame:
time <- c("2004-01-01 01:30:00","2004-01-01 04:30:00","2004-01-01 07:30:00",
"2004-01-01 10:30:00","2004-01-01 13:30:00","2004-01-01 16:30:00",
"2004-01-01 19:30:00","2004-01-01 22:30:00","2004-01-02 01:30:00",
"2004-01-02 04:30:00","2004-01-02 07:30:00","2004-01-02 10:30:00",
"2004-01-02 13:30:00","2004-01-02 16:30:00","2004-01-02 19:30:00",
"2004-01-02 22:30:00","2004-01-03 01:30:00","2004-01-03 04:30:00",
"2004-01-03 07:30:00","2004-01-03 10:30:00")
d <- c(0.00, 0.00,152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
0.00, 267024.71,1247414.7, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
296650.81,1355783.85)
dat <- data.frame(time = time, dat = d)
which demonstrate the accumulation (per day) of solar radiation from a forecast model for 3 days.
To convert the units of solar radiation from J/m2 to W/m2, I need to calculate the difference between the different forecast times per day and divide by 10800 (the forecast time). Here is my attempt:
itime <- as.numeric(as.Date(dat$time))
utime <- unique(itime)
l <- list()
for(i in 1:length(utime)){
idx <- itime == utime[i]
dat2 <- dat[idx,]
dat3 <- dat2[1,2]/10800
for(ii in 2:nrow(dat2)){
dat3[ii] <- (abs(dat2[ii,2] - dat2[ii-1,2]))/10800
}
df <- data.frame(dateTime = dat2$time,
dd = dat3)
l[[i]] <- df
}
df1 <- do.call(rbind.data.frame, l)
df1[,1] <- as.POSIXct(df1[,1])
which performs as expected. However, the actual data on which I intend to use this code has a length of >100 days. Thus, it is not optimal to run a loop.
Is there another method I can use instead of a loop?
I have tried:
dat2 <- c(dat[1,2]/10800,rev(abs(diff(rev(dat[,2])))/10800))
df2 <- data.frame(time = as.POSIXct(dat[,1]), dd = dat2)
which gives nearly the same answer (as the loop), but it also calculates the difference between time steps in different days, instead of isolating the calculation to individual days.
plot(df1, type = 'l')
lines(df2, col = 'red')
As you can see, there is a mismatch during the early hours.
Can anyone suggest another method?
For your list l you can have the same result by
dat <- data.frame(
time = c("2004-01-01 01:30:00","2004-01-01 04:30:00","2004-01-01 07:30:00",
"2004-01-01 10:30:00","2004-01-01 13:30:00","2004-01-01 16:30:00",
"2004-01-01 19:30:00","2004-01-01 22:30:00","2004-01-02 01:30:00",
"2004-01-02 04:30:00","2004-01-02 07:30:00","2004-01-02 10:30:00",
"2004-01-02 13:30:00","2004-01-02 16:30:00","2004-01-02 19:30:00",
"2004-01-02 22:30:00","2004-01-03 01:30:00","2004-01-03 04:30:00",
"2004-01-03 07:30:00","2004-01-03 10:30:00"),
dat = c(0.00, 0.00,152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
0.00, 267024.71,1247414.7, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
296650.81,1355783.85)
)
dat$itime <- as.numeric(as.Date(dat$time))
utime <- unique(dat$itime)
daydat <- function(u) {
dat2 <- dat[dat$itime==u,]
data.frame(dateTime = dat2$time, dd = c(dat2$dat[1], abs(diff(dat2$dat)))/10800)
}
l <- lapply(utime, daydat)
Here is a version with split():
dat$itime <- as.numeric(as.Date(dat$time))
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
L <- split(dat, dat$itime)
l <- lapply(L, daydat)
or without creating dat$itime:
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
l <- lapply(split(dat, as.Date(dat$time)), FUN=daydat)
or using by()
l2 <- unclass(by(dat, as.Date(dat$time), FUN=daydat))
If you want to have the result in the original dataframe you can use ave()
dat$dd <- ave(dat$dat, as.Date(dat$time), FUN=function(x) c(x[1], abs(diff(x)))/10800)
Use can use lag() from dplyr with group_by()
library(dplyr)
df <- dat %>%
mutate(date = as.Date(time)) %>%
group_by(date) %>%
mutate(before.dat = lag(dat, order_by=date)) %>%
mutate(diff = abs(dat - before.dat)/10800) %>%
select(time, date, dat, before.dat, diff)
df
#Source: local data frame [20 x 5]
#Groups: date [3]
# time date dat before.dat diff
# <fctr> <date> <dbl> <dbl> <dbl>
#1 2004-01-01 01:30:00 2004-01-01 0.00 NA NA
#2 2004-01-01 04:30:00 2004-01-01 0.00 0.00 0.00000000
#3 2004-01-01 07:30:00 2004-01-01 152808.30 0.00 14.14891667
#4 2004-01-01 10:30:00 2004-01-01 739872.84 152808.30 54.35782778
#5 2004-01-01 13:30:00 2004-01-01 82641.22 739872.84 60.85477963
#6 2004-01-01 16:30:00 2004-01-01 83031.04 82641.22 0.03609444
#7 2004-01-01 19:30:00 2004-01-01 83031.04 83031.04 0.00000000
#8 2004-01-01 22:30:00 2004-01-01 82641.22 83031.04 0.03609444
#9 2004-01-02 01:30:00 2004-01-02 0.00 NA NA
#10 2004-01-02 04:30:00 2004-01-02 0.00 0.00 0.00000000
#11 2004-01-02 07:30:00 2004-01-02 267024.71 0.00 24.72451019
#12 2004-01-02 10:30:00 2004-01-02 1247414.70 267024.71 90.77685093
#13 2004-01-02 13:30:00 2004-01-02 151638.85 1247414.70 101.46072685
#14 2004-01-02 16:30:00 2004-01-02 151249.03 151638.85 0.03609444
#15 2004-01-02 19:30:00 2004-01-02 151249.03 151249.03 0.00000000
#16 2004-01-02 22:30:00 2004-01-02 152028.67 151249.03 0.07218889
#17 2004-01-03 01:30:00 2004-01-03 0.00 NA NA
#18 2004-01-03 04:30:00 2004-01-03 0.00 0.00 0.00000000
#19 2004-01-03 07:30:00 2004-01-03 296650.81 0.00 27.46766759
#20 2004-01-03 10:30:00 2004-01-03 1355783.85 296650.81 98.06787407
Simplified code based on GGamba's comment
dat %>%
mutate(time = as.Date(time)) %>%
group_by(time) %>%
mutate(diff = (dat-lag(dat)) / 10800)