I have a dataset similar to the format of "my_data" below, where each line is a single count of an event. I want to obtain a summary of how many events happen in every hour. I would like to have every hour with no events be included with a 0 for its "hourly_total" value.
I can achieve this with dplyr as shown, but the empty hours are dropped instead of being set to 0.
Thank you!
set.seed(123)
library(dplyr)
library(lubridate)
latemail <- function(N, st="2012/01/01", et="2012/1/31") {
st <- as.POSIXct(as.Date(st))
et <- as.POSIXct(as.Date(et))
dt <- as.numeric(difftime(et,st,unit="sec"))
ev <- sort(runif(N, 0, dt))
rt <- st + ev
}
my_data <- data_frame( fake_times = latemail(25),
count = 1)
my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Assign your counts to an object
counts <- my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Create a data frame with all the necessary hours
complete_data = data.frame(hour = seq(floor_date(min(my_data$fake_times), unit = "hour"),
floor_date(max(my_data$fake_times), unit = "hour"),
by = "hour"))
Join to it and fill in the NAs.
complete_data %>% group_by( rounded_hour = floor_date(hour, unit = "hour")) %>%
left_join(counts) %>%
mutate(hourly_total = ifelse(is.na(hourly_total), 0, hourly_total))
Related
I want to replace Jan 01 to Jun 25 of all the years in FakeData with data from Ob2020 for the two variables (Level & Flow) of my data.frame. Here is what i have started and am looking for suggestions to achieving my goal.
library(tidyverse)
library(lubridate)
set.seed(1500)
FakeData <- data.frame(Date = seq(as.Date("2010-01-01"), to = as.Date("2018-12-31"), by = "days"),
Level = runif(3287, 0, 30), Flow = runif(3287, 1,10))
Ob2020 <- data.frame(Date = seq(as.Date("2020-01-01"), to = as.Date("2020-06-25"), by = "days"),
Level = runif(177, 0, 30), Flow = runif(177, 1,10))
Here's a way using dplyr and lubridate :
library(dplyr)
library(lubridate)
FakeData %>%
mutate(day = day(Date), month = month(Date)) %>%
left_join(Ob2020 %>%
mutate(day = day(Date), month = month(Date)),
by = c('day', 'month')) %>%
mutate(Level = coalesce(Level.y, Level.x),
Flow = coalesce(Flow.y, Flow.x)) %>%
select(Date = Date.x, Level, Flow)
If you dont mind a data.table solution, here is an update join:
library(data.table)
#extract year and month of the date
setDT(FakeData)[, c("day", "mth") := .(mday(Date), month(Date))]
setDT(Ob2020)[, c("day", "mth") := .(mday(Date), month(Date))]
#print to console to show old values
head(FakeData)
head(Ob2020)
cols <- c("Level", "Flow")
FakeData[Ob2020[mth<=6L & day<=25], on=.(day, mth),
(cols) := mget(paste0("i.", cols))]
#print to console to show new values
head(FakeData)
I want to create a data table in R with some data that I had already obtained. However, I'm not sure how to put those data into a table form because that required some skill to put he return data, monthlyRet, into the table according to their month respectively. The diagram attached below is the table format that I want, the data inside also need to be included.
Please note that the data for No.of.Positive and No.of.Negative are started from Aug instead of Jan due to the starting date in getSymbols. Hence, I wish the No.of.Positive and No.of.Negative can be arranged in the table created from Jan to Dec as shown in the diagram below.
The code below is how I obtained my data.
library(quantmod)
prices <-
getSymbols("^NDX", src = 'yahoo', from = "2009-07-01", to = "2019-08-01",
periodicity = "monthly", auto.assign = FALSE, warnings = FALSE)[,4]
return <- diff(log(prices))
r <- na.omit(exp(return)-1)
monthlyRet <- as.numeric(r)
meanMonthlyRet <- c()
No.of.Positive <- c()
No.of.Negative <- c()
for (j in 1:12){
Group <- c()
count_pos=0
count_neg=0
for (i in seq(j,length(monthlyRet),12)){
Group[i] <- monthlyRet[i]
if(monthlyRet[i]>0){
count_pos <- count_pos+1
}
else if(monthlyRet[i]<0){
count_neg <- count_neg+1
}
}
meanMonthlyRet[j] <- mean(Group, na.rm=TRUE)
Positive=0
Negative=0
if(meanMonthlyRet[j]>0){
Positive=count_pos
Negative=10-Positive
}
else if (meanMonthlyRet[j]<0){
Negative=count_neg
Positive=10-Negative
}
No.of.Positive[j] <- Positive
No.of.Negative[j] <- Negative
}
# My data required in table #--------------------------------------------------
Year <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019)
Month <- c("Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul")
r
No.of.Positive
No.of.Negative
I hope I can obtain exactly the same table format and content as the diagram below (I manually created in excel). Further, if the start and end date in getSymbols are changed, I hope the data in the table will still be correct.
Here is a tidyverse solution for your problem.
library(quantmod)
library(tidyverse)
prices <- getSymbols("^NDX", src = 'yahoo', from = "2009-07-01",
to = "2019-08-01", periodicity = "monthly",
auto.assign = FALSE, warnings = FALSE)[,4]
r <- prices %>%
log %>%
diff %>%
exp %>%
{. - 1}
table_out <- r %>%
as.data.frame() %>%
rownames_to_column() %>%
set_names(c("date", "variable")) %>%
mutate(variable = (variable * 100) %>% round(2)) %>%
separate(date, c("year", "month", "day")) %>%
select(-day) %>%
spread(month, variable)
n_pos <- map_dbl(table_out, ~sum(. > 0, na.rm = T))
n_neg <- map_dbl(table_out, ~sum(. < 0, na.rm = T))
table_out <- table_out %>%
mutate_if(is.double, ~str_c(., "%")) %>%
rbind(n_pos, n_neg)
x <- nrow(table_out)
table_out[(x-1):x, "year"] <- c("No. of Positive","No. of Negative")
table_out
How can I set the paramenter units ="mins" in the following function as a parameter?
This is just the data frame:
library(tidyverse)
u <- runif(10, 0, 60)
w <- runif(10, 0, 60)
df <- tibble(time_1 = as.POSIXct(u, origin = "2019-02-03 08:00:00"),
time_2 = as.POSIXct(w, origin = "2019-02-03 08:30:00"))
This is my funtion. I would like to be able to change the paramenter for difftime and set it as a parameter, e.g. units = "months".
time_diff <- function(df, stamp1, stamp2){
stamp1 <- enquo(stamp1)
stamp2 <- enquo(stamp2)
name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
df %>%
mutate(!!name := difftime(!!stamp1, !!stamp2, units="mins"))
}
df %>%
time_diff(time_2, time_1)
But I would like something like this:
df %>%
time_diff(time_2, time_1, mins)
What about just adding the units parameter? e.g.
time_diff <- function(df, stamp1, stamp2, units="mins"){
stamp1 <- enquo(stamp1)
stamp2 <- enquo(stamp2)
name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
df %>%
mutate(!!name := difftime(!!stamp1, !!stamp2, units=units))
}
Then you can do df %>% time_diff(time_2, time_1, "mins").
I have a data set that looks like this:
library(dplyr)
library(lubridate)
s <- c(1,1,1)
r <- c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")
t <- c(1,1,1)
g <- as.data.frame(matrix(c(s, as.POSIXct(r), t), nrow = 3, ncol = 3))
names(g) <- c("DeviceId", "Time", "Success/Fail")
g$Time <- as.POSIXct(g$Time, origin = '1970-01-01')
I am trying to write a function that loops through the data set and checks to see if the row and its successor's Time are more than 15 minutes apart. Then, the loop would add a row to the data set with the same DeviceId, the row's time plus 15 minutes, and 0 in the Success/Fail column. Here's what I've come up with:
f <- function(g) {
for(i in 2:nrow(g)) {
if(g$Time[i] - g$Time[i-1] >= 15) {
q <- list(g$DeviceId[i-1], g$Time[i-1] + minutes(15), 0)
y <- data.frame()
y <- rbind(g, q)
arrange(y, Time)
} else NULL
}
}
f(g)
I think this might be what you are after. I am sort of unclear about the success/fail indicator (-1 assigned to cases where the times are less than 15 minutes apart). It avoids the loop by using the lag() function in dplyr. Presumably, your data has more than one device so I added group_by(DeviceId)
x <- g %>%
group_by(DeviceId) %>%
mutate(
lTime = lag(Time, order_by = Time),
dTime = Time - lTime,
`Success/Fail` = if_else(dTime >= 15, 0, -1),
newTime = Time + minutes(15)
)
y <- x %>%
select(DeviceId, newTime, `Success/Fail`) %>%
rename(Time = newTime) %>%
ungroup() %>%
rbind(g, .)
Here is another option. I think the previous example is removing the first row? when the last row should be dropped (no time period after it).
g <- data.frame(DeviceId = rep(1,3),
Time = ymd_hms(c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")),
Success_Fail = rep(1,3))
g %>%
transmute(DeviceId = DeviceId,
Time = Time,
t = lead(Time)) %>%
drop_na %>%
rowwise() %>%
mutate(t2 = if((t - Time) > 15) {Time + minutes(15)} else {NA},
Success_Fail = 0) %>%
dplyr::select(DeviceId, Time = t2, Success_Fail) %>%
bind_rows(g) %>%
arrange(Time)
I'd like to create a function that can calculate the moving mean for a variable number of last observations and different variables. Take this as mock data:
df = expand.grid(site = factor(seq(10)),
year = 2000:2004,
day = 1:50)
df$temp = rpois(dim(df)[1], 5)
Calculating for 1 variable and a fixed number of last observations works. E.g. this calculates the average of the temperature of the last 5 days:
library(dplyr)
library(zoo)
df <- df %>%
group_by(site, year) %>%
arrange(site, year, day) %>%
mutate(almost_avg = rollmean(x = temp, 5, align = "right", fill = NA)) %>%
mutate(avg = lag(almost_avg, 1))
So far so good. Now trying to functionalize fails.
avg_last_x <- function(dataframe, column, last_x) {
dataframe <- dataframe %>%
group_by(site, year) %>%
arrange(site, year, day) %>%
mutate(almost_avg = rollmean(x = column, k = last_x, align = "right", fill = NA)) %>%
mutate(avg = lag(almost_avg, 1))
return(dataframe) }
avg_last_x(dataframe = df, column = "temp", last_x = 10)
I get this error:
Error in mutate_impl(.data, dots) : k <= n is not TRUE
I understand this is probably related to the evaluation mechanism in dplyr, but I don't get it fixed.
Thanks in advance for your help.
This should fix it.
library(lazyeval)
avg_last_x <- function(dataframe, column, last_x) {
dataframe %>%
group_by(site, year) %>%
arrange(site, year, day) %>%
mutate_(almost_avg = interp(~rollmean(x = c, k = last_x, align = "right",
fill = NA), c = as.name(column)),
avg = ~lag(almost_avg, 1))
}