sum of squared difference of multiple rows - r

Made up a data frame. How to calculate the squared difference/error in hourly TMP and DW for 1/1 to 1/9 against 1/10? Need the sum of squared difference between hour1 to hour 24 of each day from 1/1 to 1/9 against 1/10
The output should look like
Date SETmp SEDW
2012/1/1 X1 Y1
......
2012/1/9 X9 Y9
Data:
set.seed(1)
dataset <- data.frame(Date = seq(from = as.POSIXct("2012-1-1 0:00", tz = "UTC"),
to = as.POSIXct("2012-1-10 23:00", tz = "UTC"),
by="hour"),
TMP = rnorm(240),
DW = rnorm(240))

If I understand your question correctly, we can get there using the by and merge functions:
# add day and hour columns (for subsetting and merge)
dataset$day <- lubridate::day(dataset$Date)
dataset$hour <- lubridate::hour(dataset$Date)
# split data apart
data_ten <- subset(dataset, day == 10)
data_one_to_nine <- subset(dataset, day != 10)
# for each date, merge to data_ten using hours
# then calculate sum of squared differences
do.call('rbind.data.frame',
by(data_one_to_nine, data_one_to_nine$day, function(d){
xm <- merge(d, data_ten, by = 'hour')
data.frame(
'Date' = unique(as.Date(d$Date)),
'SE_TMP' = sum((xm$TMP.x - xm$TMP.y)^2),
'SE_DW' = sum((xm$DW.x - xm$DW.y)^2),
stringsAsFactors = FALSE
)
})
)
Date SE_TMP SE_DW
1 2012-01-01 59.33207 63.41261
2 2012-01-02 42.04597 58.90700
3 2012-01-03 66.15492 51.81897
4 2012-01-04 31.83438 40.68851
5 2012-01-05 30.26666 59.30694
6 2012-01-06 45.05186 55.39751
7 2012-01-07 61.93305 39.76287
8 2012-01-08 37.08246 47.81958
9 2012-01-09 58.54562 64.79331

Related

Count the number of rows within a certain time range based on each row in R (tidyverse)

I want to count the number of rows within a certain time range based on each row after grouping by id. For instance, let us say a 1-month window around each datetime entry in the column "cleaned_date".
head(data$cleaned_date)
[1] "2004-10-11 CDT" "2008-09-10 CDT" "2011-10-25 CDT" "2011-12-31 CST"
The dates are in POSIXct format.
For the first entry, I need to count the number of rows within the time from 2004-09-11 to 2004-11-11, for the second entry, count the number of rows within the time from 2008-08-10 to 2008-10-10, so on and so forth.
I used roughly the following code
data %>% group_by(id) %>% filter(cleaned_date %within% interval(cleaned_date - 24 * 60 * 60 * 30, cleaned_date + 24 * 60 * 60 * 30)) %>% mutate(counts = n())
But it does not seem to work and I got counts as an empty column. Any help would be appreciated, thanks!
A reproducible example can be the following:
The input is
cleaned_date id
1 2008-09-11 A
2 2008-09-10 B
3 2008-09-30 B
4 2011-10-25 A
5 2011-11-14 A
And I want the output to be
cleaned_date id counts
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2
4 2011-10-25 A 2
5 2011-11-14 A 2
For the first entry, I want to count the rows in the timeframe 2008-08-11 to 2008-10-11, the second entry seems to satisfy but we need to group by "id", so it does not count. For the second entry I want to count the rows in the timeframe 2008-08-10 to 2008-10-10, rows 2 and 3 satisfy, so the counts is 2. For the third entry I want to count the rows in the timeframe 2008-08-30 to 2008-10-30, rows 2 and 3 satisfy again, so on and so forth.
Note that the actual dataset I would like to operate on has millions of rows, so it might be more efficient to use tidyverse rather than base R.
Perhaps not the most elegant solution.
# input data. Dates as character vector
input = data.frame(
cleaned_date = c("2008-09-11", "2008-09-10", "2008-09-30", "2011-10-25", "2011-11-14"),
id = c("A", "B", "B", "A", "A")
)
# function to create a date window n months around specified date
window <- function(x, n = 1){
x <- rep(as.POSIXlt(x),2)
x[1]$mon <- x[1]$mon - n
x[2]$mon <- x[2]$mon + n
return(format(seq(from = x[1], to = x[2], by = "day"), format="%Y-%m-%d"))
}
# find counts for each row
input$counts <- unlist(lapply(1:nrow(input), function(x){
length(which((input$cleaned_date %in% window(input$cleaned_date[x])) & input$id == input$id[x]))
}))
input
cleaned_date id counts
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2
4 2011-10-25 A 2
5 2011-11-14 A 2
Edit for large datasets:
# dummy dataset with 1,000,000 rows
years <- c(2000:2020)
months <- c(1:12)
days <- c(1:20)
n <- 1000000
dates <- paste(sample(years, size = n, replace = T), sample(months, size = n, replace = T), sample(days, size = n, replace = T), sep = "-")
groups <- sample(c("A","B","C"), size = n, replace = T)
input <- data.frame(
cleaned_date = dates,
id = groups
)
input$cleaned_date <- format(as.POSIXlt(input$cleaned_date), format="%Y-%m-%d")
# optional, sort data by date for small boost in performance
input <- input[order(input$cleaned_date),]
counts <- NULL
#pb <- progress::progress_bar$new(total = length(unique(input$cleaned_date)))
t1 <- Sys.time()
# split up vectorization for each unique date.
for(date in unique(input$cleaned_date)){
#pb$tick()
w <- window(date)
tmp <- input[which(input$cleaned_date %in% w),]
tmp_counts <- unlist(lapply(which(tmp$cleaned_date == date), function(x){
length(which(tmp$id == tmp$id[x]))
}))
counts <- c(counts, tmp_counts)
}
# add counts to dataset
input$counts <- counts
# optional, re-order data to original format
input <- input[order(as.numeric(rownames(input))),]
print(Sys.time() - t1)
Time difference of 3.247204 mins
If you want to go faster, you can run the loop in parallel
library(foreach)
library(doParallel)
cores=detectCores()
cl <- makeCluster(cores[1]-1)
registerDoParallel(cl)
dates = unique(input$cleaned_date)
t1 <- Sys.time()
counts <- foreach(i=1:length(dates), .combine= "c") %dopar% {
w <- window(dates[i])
tmp <- input[which(input$cleaned_date %in% w),]
tmp_counts <- unlist(lapply(which(tmp$cleaned_date == dates[i]), function(x){
length(which(tmp$id == tmp$id[x]))
}))
tmp_counts
}
stopCluster(cl)
input$counts <- counts
input <- input[order(as.numeric(rownames(input))),]
print(Sys.time() - t1)
Time difference of 37.37211 secs
Note, I'm running this on a MacBook Pro with a 2.3 GHz Quad-Core Intel Core i7 and 16 GB of RAM.
It is still hard to determine exactly what you're trying to accomplish, but this will at least get you counts for a specified date range:
df %>%
group_by(id) %>%
filter(cleaned_date >= "2008-08-11" & cleaned_date <= "2008-10-11") %>%
mutate(counts = n())
Will give us:
cleaned_date id counts
<date> <chr> <int>
1 2008-09-11 A 1
2 2008-09-10 B 2
3 2008-09-30 B 2

handling calculations with date changes in xts - R

I have a stock dataset in xts format that has data for one year at a minute by minute level.
I need to calculate the returns, but only for the periods on the same date.
what would be the most efficient way to not calculate the returns while avoiding the returns of todays first observation and yesterdays last observation.
In the same vein, if one wants to calculate - 15 minute returns, then how to avoid calculating the periods that are not in the same day?
I am including a toy dataset with hourly periods (since original dataset is at minute level)
time_index <- seq(from = as.POSIXct("2021-01-01 07:00"), to = as.POSIXct("2021-02-28 18:00"), by = "hour")
set.seed(1)
value <- 100 + rnorm(n = length(time_index))
eventdata <- xts(value, order.by = time_index)
So how to calculate three hourly returns for intraday periods.
It is probably easier to compute all the returns and then drop the overnight-ones. For getting 15-minute returns you can create a time-sequence with these intervals and then run the computations on the reduced data.
library(xts)
library(lubridate)
library(TTR)
time_index <-
seq(
from = as.POSIXct("2021-01-01 07:00"),
to = as.POSIXct("2021-02-28 18:00"),
by = "hour"
)
set.seed(1)
value <- 100 + rnorm(n = length(time_index))
eventdata <- xts(value, order.by = time_index)
# compute hourly returns and drop first observation per day
res <- TTR::ROC(eventdata, type = "discrete")
res2 <- res[-xts:::startof(res, by = "days")]
> head(res2)
[,1]
2021-01-01 08:00:00 0.00815204
2021-01-01 09:00:00 -0.01017404
2021-01-01 10:00:00 0.02451394
2021-01-01 11:00:00 -0.01245897
2021-01-01 12:00:00 -0.01146199
2021-01-01 13:00:00 0.01318717
# compute 15-minute returns
time_index2 <- seq(
from = as.POSIXct("2021-01-01 07:00"),
to = as.POSIXct("2021-01-01 18:00"),
by = "min"
)
length(time_index2)
time_index3 <- seq(ymd_hms(from = '2021-01-01 07:00:00'),
by = '15 min', length.out=(661))
testasset <- xts(rnorm(661, sd = 0.03), order.by = time_index2)
res3 <- TTR::ROC(testasset[time_index3], type = "discrete")
head(res3)
> head(res3)
[,1]
2021-01-01 08:00:00 NA
2021-01-01 08:15:00 -0.6516079
2021-01-01 08:30:00 -5.7101543
2021-01-01 08:45:00 -0.5411609
2021-01-01 09:00:00 -2.2945892
2021-01-01 09:15:00 -2.3038205

time differences between rows without night in R

A subject was measured at several time points over several days. I have a row "resptime_s" (time that the subject was answered the beep on his smartphone). Now I want to know the mean time between those (so between the rows of this column) with the night time taken out (nighttime is always from 22:30 p.m till 7:30 a.m). Take as example:
The R script:
setwd("C:/Users/Hanne/Desktop/")
dat <- read.csv(file="datnew2.csv", sep=";",header=TRUE)
rows <- c(1:388) #time points
columns <- c(2,60) # datum and time
nVariables = 2
newdata<-dat[rows,columns]
head(newdata)
fun2 <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
f <- cumsum(c(FALSE, diff(bt) < 0))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
bt <- as.POSIXct(paste(d, x))
res <- sapply(split(bt, f), function(b) c(0, difftime(b[-1], b[1])))
unname(unlist(res))
}
fun2(newdata$resptime_s)
But the result isn't correct.
And with:
dput(head(newdata, 30))
I obtained this output:
Using the different functions for working with time intervals in lubridate gives the most elegant and easy to understand solution.
library(tidyverse)
library(lubridate)
data <- tribble(
~time_point, ~beeptime,
1, "08:30",
2, "11:13",
3, "12:08",
4, "17:20",
5, "22:47",
6, "7:36",
7, "9:40"
) %>%
mutate(beeptime = as_datetime(hm(beeptime)))
1. Define the daytime interval
day <- interval(
as_datetime(hm("07:30")),
as_datetime(hm("22:30"))
)
2. Keep daytime beeps and estimate the time (interval) between them
# %--% is basically the same as interval() above.
data_interval <-
data %>%
filter(beeptime %within% day) %>%
mutate(beep_interval = lag(beeptime) %--% beeptime)
3. Take the average
# You can use as.numeric() to extract (e.g.) minutes, which you can
# just pass to mean().
data_interval$beep_interval %>%
as.numeric("minutes") %>%
abs() %>%
mean(na.rm = TRUE)
#> [1] 247.6
Try the following. It pastes a date that increments every time the next hour is less than the previous one. Then difftime works as expected.
fun <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
inx <- as.logical(cumsum(c(FALSE, diff(bt) < 0)))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
d[inx] <- d[inx] + 1
beeptime <- as.POSIXct(paste(d, x))
difftime(beeptime[-1], beeptime[1])
}
fun(newdata$beeptime)
#Time differences in hours
#[1] 2.716667 3.633333 8.833333 14.283333 23.100000 25.166667
Data.
newdata <-
structure(list(time_point = 1:7, beeptime = structure(1:7, .Label = c("08:30",
"11:13", "12:08", "17:20", "22:47", "7:36", "9:40"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
Edit.
I believe that I have missunderstood the question. The OP does not want differences between the first hour and all others. What is needed is the differences restarting from zero every night.
If this is the case, the following function will do it.
fun2 <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
f <- cumsum(c(FALSE, diff(bt) < 0))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
bt <- as.POSIXct(paste(d, x))
res <- sapply(split(bt, f), function(b) c(0, difftime(b[-1], b[1])))
unname(unlist(res))
}
fun2(newdata$beeptime)
#[1] 0.000000 2.716667 3.633333 8.833333 14.283333 0.000000 2.066667
Another approach could be to convert beeptime in offset (in seconds) from midnight using lubridate package.
We can then write a function(s) to calculate difference in time excluding night time (22:30 - 7:30).
Before we start solution, lets have a look for offset in seconds from midnight for 7:30 and 22:30.
library(lubridate)
as.numeric(seconds(hm("7:30")))
# [1] 27000
as.numeric(seconds(hm("22:30")))
# [1] 81000
I have written two sets of function to calculate difference between two times:
# Function checks individual time and shifts them to night boundary. So that
# time over night can be excluded.
checkNightBoundry <- function(val){
if(val < 27000){
val = 27000
} else if(val > 81000) {
val = 81000
}
val
}
# Arguments are offset from midnight in seconds
# Calculate difference between two time, excluding midtime
calcDifftime <- function(currVal, prevVal){
diffTime <- 0
currVal = checkNightBoundry(currVal)
prevVal = checkNightBoundry(prevVal)
if(currVal > prevVal){
diffTime = currVal - prevVal
}else if(currVal < prevVal){
diffTime = (81000 - prevVal) + (currVal - 27000)
}
diffTime
}
Now, use above functions:
library(dplyr)
library(lubridate)
df %>% mutate(beeptimeOffset = as.numeric(seconds(hm(beeptime)))) %>%
mutate(diffTime = mapply(calcDifftime,
beeptimeOffset, lag(beeptimeOffset, default = first(beeptimeOffset)))/3600)
# timepoint beeptime beeptimeOffset(sec) diffTime(hrs)
# 1 1 08:30 30600 0.0000000
# 2 2 11:13 40380 2.7166667
# 3 3 12:08 43680 0.9166667
# 4 4 17:20 62400 5.2000000
# 5 5 22:47 82020 5.1666667
# 6 6 7:36 27360 0.1000000
# 7 7 9:40 34800 2.0666667
Data:
df <- read.table(text =
"timepoint beeptime
1 08:30
2 11:13
3 12:08
4 17:20
5 22:47
6 7:36
7 9:40",
header = TRUE, stringsAsFactors = FALSE)

Expanding R Matrix on Date

I have the following R matrix:
Date MyVal
2016 1
2017 2
2018 3
....
2026 10
What I want to do is "blow it up" so that it goes like this (where monthly values are linearly interpolated):
Date MyVal
01/01/2016 1
02/01/2016 ..
....
01/01/2017 2
....
01/01/2026 10
I realize I can easily generate the sequence using:
DateVec <- seq(as.Date(paste(minYear,"/01/01", sep = "")), as.Date(paste(maxYear, "/01/01", sep = "")), by = "month")
And I can use that to make a large matrix and then fill things in using a for loop over the DateVector in but I wonder if there's a more elegant R way to do this?
You can use stats::approx:
library(stats)
ipc <- approx(df$Date, df$MyVal, xout = DateVec,
rule = 1, method = "linear", ties = mean)
You probably need to first convert the data in your original data-frame to have month and day and also be in asPOSIXct or as.Date format.
Based on what you provided, this works:
#Make the reference data-frame for interpolation:
DateVec <- seq(min(df$Date, na.rm=T),
max(df$Date, na.rm=T), by = "month")
#Interpolation:
intrpltd_df <- approx(df$Date, df$MyVal, xout = DateVec,
rule = 1, method = "linear", ties = mean)
# x y
# 1 2016-01-01 1.000000
# 2 2016-02-01 1.084699
# 3 2016-03-01 1.163934
# 4 2016-04-01 1.248634
# 5 2016-05-01 1.330601
# 6 2016-06-01 1.415301
Data:
#reproducing the data-frame:
Date <- seq(2016,2026)
MyVal <- seq(1:11)
Date <- data.frame(as.Date(paste0(Date,"/01/01"))) #yyyy-mm-dd format
df <- cbind(Date, MyVal)
df <- as.data.frame(df)
colnames(df) <- c ("Date", "MyVal") #Changing Column Names

R Increment Dates by Periods across data frame columns

I'm trying to do some tests around measurement periods in time. I'd like to increment the size of the measurement bins (ie 1 month vs 2 months, etc.).
I have a data frame with a date seq() which works fine my problem is with incrementing the date by a month, week, etc.
df1 <- data.frame(id = 1:20, date1 = seq(as.Date('2012-01-01'),by = 'month', len = 20))
df1$date2 <- df1$date1 + 30
This is obviously wrong if I want the 1st of each month or week. Is there a function or package for this type of issue?
EDIT:
This :
seq( x, by = "month", length.out = 1)
seems to work for individual cells, but won't work for a column as it returns a numeric:
df1$date2 <- sapply(df1$date1, function(x) seq( x, by = "month", length.out = 1))
> head(df1)
id date1 date2
1 1 2012-01-01 15340
2 2 2012-02-01 15371
3 3 2012-03-01 15400
4 4 2012-04-01 15431
5 5 2012-05-01 15461
6 6 2012-06-01 15492
It sounds like you're looking for cut:
df1$date2 <- cut(df1$date1 + as.difftime(31, units='days'), breaks='months')
df1$date3 <- cut(df1$date2 + as.difftime(1, units='weeks'), breaks='weeks')
There might be more elegant solutions but this should work -
df1$date2 <- as.Date(
paste(
ifelse(
strftime(df1$date1,'%m') == 12,
as.integer(strftime(df1$date1,'%Y')) + 1,
as.integer(strftime(df1$date1,'%Y'))
),
ifelse(
strftime(df1$date1,'%m') == 12,
1,
as.integer(strftime(df1$date1,'%m')) + 1
),
1,
sep = "-"
),
"%Y-%m-%d"
)

Resources