Set paramenter of a function using difftime and dplyr - r

How can I set the paramenter units ="mins" in the following function as a parameter?
This is just the data frame:
library(tidyverse)
u <- runif(10, 0, 60)
w <- runif(10, 0, 60)
df <- tibble(time_1 = as.POSIXct(u, origin = "2019-02-03 08:00:00"),
time_2 = as.POSIXct(w, origin = "2019-02-03 08:30:00"))
This is my funtion. I would like to be able to change the paramenter for difftime and set it as a parameter, e.g. units = "months".
time_diff <- function(df, stamp1, stamp2){
stamp1 <- enquo(stamp1)
stamp2 <- enquo(stamp2)
name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
df %>%
mutate(!!name := difftime(!!stamp1, !!stamp2, units="mins"))
}
df %>%
time_diff(time_2, time_1)
But I would like something like this:
df %>%
time_diff(time_2, time_1, mins)

What about just adding the units parameter? e.g.
time_diff <- function(df, stamp1, stamp2, units="mins"){
stamp1 <- enquo(stamp1)
stamp2 <- enquo(stamp2)
name <- paste0(quo_name(stamp1), "_", quo_name(stamp2))
df %>%
mutate(!!name := difftime(!!stamp1, !!stamp2, units=units))
}
Then you can do df %>% time_diff(time_2, time_1, "mins").

Related

How to create a table with 1st row and 1st column as the header?

I want to create a data table in R with some data that I had already obtained. However, I'm not sure how to put those data into a table form because that required some skill to put he return data, monthlyRet, into the table according to their month respectively. The diagram attached below is the table format that I want, the data inside also need to be included.
Please note that the data for No.of.Positive and No.of.Negative are started from Aug instead of Jan due to the starting date in getSymbols. Hence, I wish the No.of.Positive and No.of.Negative can be arranged in the table created from Jan to Dec as shown in the diagram below.
The code below is how I obtained my data.
library(quantmod)
prices <-
getSymbols("^NDX", src = 'yahoo', from = "2009-07-01", to = "2019-08-01",
periodicity = "monthly", auto.assign = FALSE, warnings = FALSE)[,4]
return <- diff(log(prices))
r <- na.omit(exp(return)-1)
monthlyRet <- as.numeric(r)
meanMonthlyRet <- c()
No.of.Positive <- c()
No.of.Negative <- c()
for (j in 1:12){
Group <- c()
count_pos=0
count_neg=0
for (i in seq(j,length(monthlyRet),12)){
Group[i] <- monthlyRet[i]
if(monthlyRet[i]>0){
count_pos <- count_pos+1
}
else if(monthlyRet[i]<0){
count_neg <- count_neg+1
}
}
meanMonthlyRet[j] <- mean(Group, na.rm=TRUE)
Positive=0
Negative=0
if(meanMonthlyRet[j]>0){
Positive=count_pos
Negative=10-Positive
}
else if (meanMonthlyRet[j]<0){
Negative=count_neg
Positive=10-Negative
}
No.of.Positive[j] <- Positive
No.of.Negative[j] <- Negative
}
# My data required in table #--------------------------------------------------
Year <- c(2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019)
Month <- c("Aug","Sep","Oct","Nov","Dec","Jan","Feb","Mar","Apr","May","Jun","Jul")
r
No.of.Positive
No.of.Negative
I hope I can obtain exactly the same table format and content as the diagram below (I manually created in excel). Further, if the start and end date in getSymbols are changed, I hope the data in the table will still be correct.
Here is a tidyverse solution for your problem.
library(quantmod)
library(tidyverse)
prices <- getSymbols("^NDX", src = 'yahoo', from = "2009-07-01",
to = "2019-08-01", periodicity = "monthly",
auto.assign = FALSE, warnings = FALSE)[,4]
r <- prices %>%
log %>%
diff %>%
exp %>%
{. - 1}
table_out <- r %>%
as.data.frame() %>%
rownames_to_column() %>%
set_names(c("date", "variable")) %>%
mutate(variable = (variable * 100) %>% round(2)) %>%
separate(date, c("year", "month", "day")) %>%
select(-day) %>%
spread(month, variable)
n_pos <- map_dbl(table_out, ~sum(. > 0, na.rm = T))
n_neg <- map_dbl(table_out, ~sum(. < 0, na.rm = T))
table_out <- table_out %>%
mutate_if(is.double, ~str_c(., "%")) %>%
rbind(n_pos, n_neg)
x <- nrow(table_out)
table_out[(x-1):x, "year"] <- c("No. of Positive","No. of Negative")
table_out

How to delete specific record of one dataframe according to values in another dataframe in R

I have two dataframe (dat1 & dat2). Some records in dat2 need to be deleted, according to if var1 in dat1 is negative. I use the following codes, but I think they are not the best one, because I use an extra temporary dataframe tmp. Could we have a better method?
library(dplyr)
Date1 <- c("1999-12-17", "2005-1-5", "2003-11-2", "2005-6-12", "2005-8-9")
Date1 <- as.POSIXct(Date1, tz = "UTC")
Date2 <- c("2005-1-5", "2005-6-12", "2005-8-9")
Date2 <- as.POSIXct(Date2, tz = "UTC")
var1 <- c(-3, -10, 9, 5, 8)
var2 <- c(0.2, 0.6, 0.15)
dat1 <- data.frame(Date1, var1)
dat2 <- data.frame(Date2, var2)
#Below is what I did
tmp <- inner_join(dat1, dat2, by = c("Date1" = "Date2"))
tmp <- tmp[-tmp$var1 < 0, ]
dat2 <- tmp[, c(1,3)]
Something like this should work:
dat2 %>%
left_join(dat1, by = c("Date2" = "Date1")) %>%
filter(var1 > 0) %>%
mutate(var1 = NULL)
Given you're already using dplyr, why not make better use of pipes, filter, and select as such
library(dplyr)
dat2 %>%
left_join(dat1, by = c("Date2" = "Date1")) %>%
filter(var1 >= 0) %>%
select(-var1)

Apply different data to a function in R

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.
Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.

Trying to add a new row based on a check in R

I have a data set that looks like this:
library(dplyr)
library(lubridate)
s <- c(1,1,1)
r <- c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")
t <- c(1,1,1)
g <- as.data.frame(matrix(c(s, as.POSIXct(r), t), nrow = 3, ncol = 3))
names(g) <- c("DeviceId", "Time", "Success/Fail")
g$Time <- as.POSIXct(g$Time, origin = '1970-01-01')
I am trying to write a function that loops through the data set and checks to see if the row and its successor's Time are more than 15 minutes apart. Then, the loop would add a row to the data set with the same DeviceId, the row's time plus 15 minutes, and 0 in the Success/Fail column. Here's what I've come up with:
f <- function(g) {
for(i in 2:nrow(g)) {
if(g$Time[i] - g$Time[i-1] >= 15) {
q <- list(g$DeviceId[i-1], g$Time[i-1] + minutes(15), 0)
y <- data.frame()
y <- rbind(g, q)
arrange(y, Time)
} else NULL
}
}
f(g)
I think this might be what you are after. I am sort of unclear about the success/fail indicator (-1 assigned to cases where the times are less than 15 minutes apart). It avoids the loop by using the lag() function in dplyr. Presumably, your data has more than one device so I added group_by(DeviceId)
x <- g %>%
group_by(DeviceId) %>%
mutate(
lTime = lag(Time, order_by = Time),
dTime = Time - lTime,
`Success/Fail` = if_else(dTime >= 15, 0, -1),
newTime = Time + minutes(15)
)
y <- x %>%
select(DeviceId, newTime, `Success/Fail`) %>%
rename(Time = newTime) %>%
ungroup() %>%
rbind(g, .)
Here is another option. I think the previous example is removing the first row? when the last row should be dropped (no time period after it).
g <- data.frame(DeviceId = rep(1,3),
Time = ymd_hms(c("2017-01-01 12:34:17", "2017-01-01 12:52:18", "2017-01-01 13:17:18")),
Success_Fail = rep(1,3))
g %>%
transmute(DeviceId = DeviceId,
Time = Time,
t = lead(Time)) %>%
drop_na %>%
rowwise() %>%
mutate(t2 = if((t - Time) > 15) {Time + minutes(15)} else {NA},
Success_Fail = 0) %>%
dplyr::select(DeviceId, Time = t2, Success_Fail) %>%
bind_rows(g) %>%
arrange(Time)

hourly sums with dplyr with zeros for empty hours

I have a dataset similar to the format of "my_data" below, where each line is a single count of an event. I want to obtain a summary of how many events happen in every hour. I would like to have every hour with no events be included with a 0 for its "hourly_total" value.
I can achieve this with dplyr as shown, but the empty hours are dropped instead of being set to 0.
Thank you!
set.seed(123)
library(dplyr)
library(lubridate)
latemail <- function(N, st="2012/01/01", et="2012/1/31") {
st <- as.POSIXct(as.Date(st))
et <- as.POSIXct(as.Date(et))
dt <- as.numeric(difftime(et,st,unit="sec"))
ev <- sort(runif(N, 0, dt))
rt <- st + ev
}
my_data <- data_frame( fake_times = latemail(25),
count = 1)
my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Assign your counts to an object
counts <- my_data %>% group_by( rounded_hour = floor_date(fake_times, unit = "hour")) %>%
summarise( hourly_total = sum(count))
Create a data frame with all the necessary hours
complete_data = data.frame(hour = seq(floor_date(min(my_data$fake_times), unit = "hour"),
floor_date(max(my_data$fake_times), unit = "hour"),
by = "hour"))
Join to it and fill in the NAs.
complete_data %>% group_by( rounded_hour = floor_date(hour, unit = "hour")) %>%
left_join(counts) %>%
mutate(hourly_total = ifelse(is.na(hourly_total), 0, hourly_total))

Resources