Interpolation for continuous data in R - r

I have a sample data as follows:
# Sample readings, one per line: date, 12-hour clock time, AM/PM marker, value.
# Fields are whitespace-separated and there is no header row, so read.table()
# names the columns V1..V4.
data1 <- read.table(text="1/1/12 1:48 AM 1.24
1/1/12 8:14 AM 0.26
1/1/12 2:01 PM 1.15
1/1/12 8:25 PM 0.15
1/2/12 2:36 AM 1.23
1/2/12 9:13 AM 0.25
1/2/12 2:54 PM 1.09
1/2/12 9:17 PM 0.16
1/3/12 3:28 AM 1.24
1/3/12 10:06 AM 0.21
1/3/12 3:52 PM 1.07
1/3/12 10:05 PM 0.15
1/4/12 4:21 AM 1.27
1/4/12 10:56 AM 0.16
1/4/12 4:49 PM 1.08
1/4/12 10:52 PM 0.12
1/5/12 5:12 AM 1.32
1/5/12 11:43 AM 0.1
1/5/12 5:41 PM 1.12
1/5/12 11:37 PM 0.08
1/6/12 5:58 AM 1.38
1/6/12 12:28 PM 0.03
1/6/12 6:27 PM 1.17
", sep = "", header = FALSE)
> head(data1)
V1 V2 V3 V4 date
1 1/1/12 1:48 AM 1.24 1/1/12 1:48 AM
2 1/1/12 8:14 AM 0.26 1/1/12 8:14 AM
3 1/1/12 2:01 PM 1.15 1/1/12 2:01 PM
4 1/1/12 8:25 PM 0.15 1/1/12 8:25 PM
5 1/2/12 2:36 AM 1.23 1/2/12 2:36 AM
6 1/2/12 9:13 AM 0.25 1/2/12 9:13 AM
Combine the 3 columns into one to make the date column
# Glue the date, clock-time and AM/PM pieces into one space-separated string
data1$date <- with(data1, paste(V1, V2, V3))
Create a date sequence to do the interpolation
# 1200 evenly spaced instants from midnight on 1 Jan 2012 to midnight on 6 Jan 2012
daterange <- seq(
  as.POSIXct("2012-1-1 00:00"),
  as.POSIXct("2012-1-6 00:00"),
  length.out = 1200
)
I want to find the corresponding V4 values of the daterange specified above. I want to do the linear interpolation.

As others have said, you can use approx(...) to interpolate between successive points, although it's debatable if this is a good idea.
# Parse the 12-hour strings: %I is the 12-hour clock hour, %p the AM/PM marker
data1$posix <- as.POSIXct(data1$date, format = "%m/%d/%y %I:%M %p")

# approx() returns list(x, y); ask for 1200 linearly interpolated points and
# rename the two components to "date" (numeric time) and "V4"
df <- setNames(
  as.data.frame(approx(x = data1$posix, y = data1$V4, n = 1200)),
  c("date", "V4")
)
# Turn the numeric times back into date-times for plotting
df$posix <- as.POSIXct(df$date, origin = "1970-01-01")

library(ggplot2)
# Observed readings as large red points, interpolated values as small blue points
ggplot() +
  geom_point(aes(x = posix, y = V4), data = data1, color = "red", size = 5) +
  geom_point(aes(x = posix, y = V4), data = df, color = "blue", size = 1) +
  labs(x = "Date")
Note the format string in the call to as.POSIXct(...). You have to specify that the times are in 12hr format using %I (not %H), and you have to specify that the string contains AM/PM (using %p), or your character times will not convert correctly. (They will convert, though, without throwing an error - so be careful).

Related

change date to yyyy-mm in R

I have a data set that will be used for time series. the date column is currently structured as follows:
> head(cam_shiller)
div stock dates
1 0.495 7.09 1933m1
2 0.490 6.25 1933m2
3 0.485 6.23 1933m3
4 0.480 6.89 1933m4
5 0.475 8.87 1933m5
6 0.470 10.39 1933m6
If I'm not mistaken, monthly data for time series should look like this: yyyy-mm. So I'm trying to make my date column look like this:
div stock dates
1 0.495 7.09 1933-01
2 0.490 6.25 1933-02
3 0.485 6.23 1933-03
4 0.480 6.89 1933-04
5 0.475 8.87 1933-05
6 0.470 10.39 1933-06
However, using the as.yearmon function produces a column full of NAs. I tried removing the 'm' and replacing it with a dash, and then running as.yearmon again. Now the results look like this:
div stock dates
1 0.495 7.09 Jan 1933
2 0.490 6.25 Feb 1933
3 0.485 6.23 Mar 1933
4 0.480 6.89 Apr 1933
5 0.475 8.87 May 1933
6 0.470 10.39 Jun 1933
How do I change the dates into the yyyy-mm format?
library(zoo)
# Download the data, turn e.g. "1933m1" into "1933-1", then parse as yearmon
cam_shiller <- read.csv("https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv")
cam_shiller$dates <- as.yearmon(gsub("m", "-", cam_shiller$dates))
Actually, in ts you just need to specify start= and frequency.
# Multivariate monthly series built from every column except the third (dates),
# with the first observation placed at the start of 1933
res <- ts(cam_shiller[, -3], start=1933, frequency=12)
res
# div stock
# Jan 1933 0.4950 7.09
# Feb 1933 0.4900 6.25
# Mar 1933 0.4850 6.23
# Apr 1933 0.4800 6.89
# May 1933 0.4750 8.87
# Jun 1933 0.4700 10.39
# Jul 1933 0.4650 11.23
# Aug 1933 0.4600 10.67
# Sep 1933 0.4550 10.58
# Oct 1933 0.4500 9.55
# Nov 1933 0.4450 9.78
# Dec 1933 0.4400 9.97
# Jan 1934 0.4408 10.54
# Feb 1934 0.4417 11.32
# Mar 1934 0.4425 10.74
# Apr 1934 0.4433 10.92
# May 1934 0.4442 9.81
# Jun 1934 0.4450 9.94
# Jul 1934 0.4458 9.47
# Aug 1934 0.4467 9.10
# Sep 1934 0.4475 8.88
# Oct 1934 0.4483 8.95
# Nov 1934 0.4492 9.20
# Dec 1934 0.4500 9.26
# ...
Or
# Univariate alternative: only the stock column, monthly, starting January 1933
ts(cam_shiller$stock, start=c(1933, 1), frequency=12)
# Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# 1933 7.09 6.25 6.23 6.89 8.87 10.39 11.23 10.67 10.58 9.55 9.78 9.97
# 1934 10.54 11.32 10.74 10.92 9.81 9.94 9.47 9.10 8.88 8.95 9.20 9.26
# 1935 9.26 8.98 8.41 9.04 9.75 10.12 10.65 11.37 11.61 11.92 13.04 13.04
# ...
It may be wise to check beforehand that there are no gaps in the data by evaluating the column and row variances of years and month matrices:
# Split "1933m1"-style strings at "m" into a two-column matrix and convert the
# pieces to numbers: column 1 = year, column 2 = month.
# NOTE(review): this needs the original "1933m1" strings in dates, i.e. before
# any gsub()/as.yearmon() conversion of that column.
test <- do.call(rbind, strsplit(cam_shiller$dates, 'm')) |>
type.convert(as.is=TRUE)
# Years are filled column-wise into 12-row columns (one column per 12
# consecutive rows); a variance of 0 means the year is constant in that chunk.
matrixStats::colVars(matrix(test[, 1], 12))
# [1] 0 0 ...
# Same layout for months: matrix row k holds the k-th entry of every chunk, so
# a row variance of 0 means that position always carries the same month.
matrixStats::rowVars(matrix(test[, 2], 12))
# [1] 0 0 0 0 0 0 0 0 0 0 0 0
If you use the xts::xts, it's rather picky since it wants a time-based class such as "Date" or "POSIXct". So you need whole dates, i.e. paste a 01 as pseudo date.
# Append a pseudo day ("01") so strptime() can parse a complete date, then pass
# the result to an anonymous function that builds an xts object from the first
# two columns, indexed by the parsed dates.
res <- transform(cam_shiller, dates=strptime(paste(dates, '01'), format='%Ym%m %d')) |>
{\(.) xts::as.xts(.[1:2], .$dates)}()
head(res)
# div stock
# 1933-01-01 0.495 7.09
# 1933-02-01 0.490 6.25
# 1933-03-01 0.485 6.23
# 1933-04-01 0.480 6.89
# 1933-05-01 0.475 8.87
# 1933-06-01 0.470 10.39
class(res)
# [1] "xts" "zoo"
Data:
# Reproducible sample: 24 monthly rows (1933m1 through 1934m12) with the
# columns div, stock and dates; dates are "<year>m<month>" strings.
cam_shiller <- structure(list(div = c(0.495, 0.49, 0.485, 0.48, 0.475, 0.47,
0.465, 0.46, 0.455, 0.45, 0.445, 0.44, 0.4408, 0.4417, 0.4425,
0.4433, 0.4442, 0.445, 0.4458, 0.4467, 0.4475, 0.4483, 0.4492,
0.45), stock = c(7.09, 6.25, 6.23, 6.89, 8.87, 10.39, 11.23,
10.67, 10.58, 9.55, 9.78, 9.97, 10.54, 11.32, 10.74, 10.92, 9.81,
9.94, 9.47, 9.1, 8.88, 8.95, 9.2, 9.26), dates = c("1933m1",
"1933m2", "1933m3", "1933m4", "1933m5", "1933m6", "1933m7", "1933m8",
"1933m9", "1933m10", "1933m11", "1933m12", "1934m1", "1934m2",
"1934m3", "1934m4", "1934m5", "1934m6", "1934m7", "1934m8", "1934m9",
"1934m10", "1934m11", "1934m12")), row.names = c(NA, 24L), class = "data.frame")
Try lubridate::ym to change dates to yyyy-mm format
library(tidyverse)
cam_shiller <- read.csv("https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv")
# Parse the dates column with lubridate::ym(), then render it as "yyyy-mm" text
cam_shiller %>%
  mutate(date = strftime(lubridate::ym(dates), "%Y-%m")) %>%
  head()
#> div stock dates date
#> 1 0.495 7.09 1933m1 1933-01
#> 2 0.490 6.25 1933m2 1933-02
#> 3 0.485 6.23 1933m3 1933-03
#> 4 0.480 6.89 1933m4 1933-04
#> 5 0.475 8.87 1933m5 1933-05
#> 6 0.470 10.39 1933m6 1933-06
Created on 2022-10-01 with reprex v2.0.2
The form in the question is already correct. It is not true
that you need to change it. It renders as Jan 1933, etc. but internally it is represented as year+(month-1)/12 (where month is a number 1, 2, ..., 12) which is exactly what you need for analysis. You do not want a character string of the form yyyy-mm for analysis.
If by "time series" you mean a zoo series then using u defined in the Note at the end, z below gives that with a yearmon index. The index argument to read.csv.zoo gives the column number or name of the index, the FUN argument tells it how to convert it and the format argument tells it the precise form of the dates.
If what you mean by time series is that you want a ts series then tt below gives that.
If what you mean is a data frame with a yearmon column then DF below gives that.
With either a zoo series or a ts series one could perform a variety of analyses. For example, acf(z) or acf(tt) would give the autocorrelation function.
For more information see ?read.csv.zoo . There is also an entire vignette on read.zoo and its variants. The vignettes are linked to on the CRAN home page for zoo. Also see ?strptime for the percent codes.
library(zoo)
# u is the CSV URL defined in the Note at the end.
# zoo series with a yearmon index: column 3 of the CSV is converted by
# as.yearmon using the "%Ym%m" format (e.g. "1933m1")
z <- read.csv.zoo(u, index = 3, FUN = as.yearmon, format = "%Ym%m")
# ts series converted from the zoo series
tt <- as.ts(z)
# data frame whose dates column is converted to yearmon
DF <- u |>
read.csv() |>
transform(dates = as.yearmon(dates, "%Ym%m"))
A character string of the form yyyy-mm is not a suitable form for most analyses but if you really did want that anyways then
# zoo series with yyyy-mm character string index
z2 <- aggregate(z, format(index(z), "%Y-%m"), c)
# data.frame with yyyy-mm character string column
DF2 <- transform(DF, dates = format(dates, "%Y-%m"))
Note
# Location of the raw CSV that the read.csv.zoo() / read.csv() calls read from
u <- "https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv"

Extract observations based on rownames and range of colnames

I have a two dataframes - one is the base dataframe and the other the query dataframe.
Base Dataframe (base_df):
Mon Tue Wed Thu Fri Sat
A 5.23 0.01 6.81 8.67 0.10 6.21
B 6.26 2.19 4.28 5.57 0.16 2.81
C 7.41 2.63 4.32 6.57 0.20 1.69
D 6.17 1.50 5.30 9.22 2.19 5.47
E 1.23 9.01 8.09 1.29 7.65 4.57
Query Dataframe (query_df):
Person Start End
A Tue Thu
C Mon Wed
D Thu Sat
C Thu Sat
B Wed Fri
I want to extract all the observations for a particular person between the start and end days. The difference between start and end days is always three (inclusive of start and end days).
Hence the output wanted is:
Person Start End D1 D2 D3
A Tue Thu 0.01 6.81 8.67
C Mon Wed 7.41 2.63 4.32
D Thu Sat 9.22 2.19 5.47
C Thu Sat 6.57 0.20 1.69
B Wed Fri 4.28 5.57 0.16
I want to avoid a loop because the actual base_df is more than 35000 rows. Is there a data.table solution? Solutions using other data structures are good too. Thank you!
Another base R solution, using mapply...
# For every query row (p = Person, s = Start, e = End) pull the base_df row
# named p and the run of columns from s to e, then bind the transposed result
# onto query_df as columns 4:6.
# Assumes base_df carries the person labels as row names.
# NOTE(review): each call returns a one-row data frame, and the printed 0.2
# (rather than 0.20) suggests D1..D3 end up as list columns - wrap the
# extraction in unlist() if plain numeric columns are needed; confirm.
query_df <- cbind(query_df,
t(mapply(function(p,s,e) {
base_df[p, match(s, names(base_df)):match(e, names(base_df))]},
query_df$Person,
query_df$Start,
query_df$End)))
names(query_df)[4:6] <- c("D1", "D2", "D3")
query_df
Person Start End D1 D2 D3
1 A Tue Thu 0.01 6.81 8.67
2 C Mon Wed 7.41 2.63 4.32
3 D Thu Sat 9.22 2.19 5.47
4 C Thu Sat 6.57 0.2 1.69
5 B Wed Fri 4.28 5.57 0.16
The data.table solution below should be working also for varying numbers of days between Start and End days (not just 3 day periods) thanks to a non-equi join and melt() / dcast() for reshaping:
library(data.table)
# convert both inputs to data.table; base_df needs a Person column here
# (as in the fread() Data section), not row names
setDT(base_df)
setDT(query_df)
# reshape from wide to long: one row per Person/Day with its value
long <- melt(base_df, id.vars = "Person", variable.name = "Day")
# align factor levels so Start/End compare against Day in weekday order
cols <- c("Start", "End")
query_df[, (cols) := lapply(.SD, factor, levels = levels(long$Day)), .SDcols = cols][
# add row id because Person is not unique
, rn := .I]
# non-equi right join, i.e., take all rows of query_df and, for each, every
# long row of the same Person whose Day lies between Start and End
long[query_df, on = .(Person, Day >= Start, Day <= End),
.(rn, Person, Start = i.Start, End = i.End, value)][
# reshape from long to wide; rowid() numbers the days within each rn as D1, D2, ...
, dcast(.SD, rn + Person + ... ~ rowid(rn, prefix = "D"))]
rn Person Start End D1 D2 D3
1: 1 A Tue Thu 0.01 6.81 8.67
2: 2 C Mon Wed 7.41 2.63 4.32
3: 3 D Thu Sat 9.22 2.19 5.47
4: 4 C Thu Sat 6.57 0.20 1.69
5: 5 B Wed Fri 4.28 5.57 0.16
Note that Day is a factor with the names of weekdays as factor levels in order of appearance:
str(long)
Classes ‘data.table’ and 'data.frame': 30 obs. of 3 variables:
$ Person: chr "A" "B" "C" "D" ...
$ Day : Factor w/ 6 levels "Mon","Tue","Wed",..: 1 1 1 1 1 2 2 2 2 2 ...
$ value : num 5.23 6.26 7.41 6.17 1.23 0.01 2.19 2.63 1.5 9.01 ...
- attr(*, ".internal.selfref")=<externalptr>
Aligned factor levels are crucial for the non-equi join.
Data
library(data.table)
# One row per person, one column per day; Person is a regular column here
base_df <- fread(
"Person Mon Tue Wed Thu Fri Sat
A 5.23 0.01 6.81 8.67 0.10 6.21
B 6.26 2.19 4.28 5.57 0.16 2.81
C 7.41 2.63 4.32 6.57 0.20 1.69
D 6.17 1.50 5.30 9.22 2.19 5.47
E 1.23 9.01 8.09 1.29 7.65 4.57"
)
# One row per request: person plus first and last day of the wanted range
# (a person may appear more than once)
query_df <- fread(
"Person Start End
A Tue Thu
C Mon Wed
D Thu Sat
C Thu Sat
B Wed Fri"
)
A tidyverse answer
I reshape base_df, then join and slice the correct days, then reshape back.
library(tidyr)
library(dplyr)
# NOTE(review): gather()/spread() are superseded in tidyr by
# pivot_longer()/pivot_wider(); consider migrating.
# Move the person labels from row names into a Person column
base_df <- tibble::rownames_to_column(base_df, 'Person')
# Day names in column order (everything except Person)
days <- names(base_df)[-1]
base_df %>%
# long format: one row per Person/day
gather(day, value, -Person) %>%
# keep every query row; i identifies the query row because Person can repeat
right_join(mutate(query_df, i = row_number())) %>%
group_by(i) %>%
# keep the rows at the positions of Start through End within `days`
# (assumes each group's rows are in `days` order - TODO confirm)
slice(which(days == Start):which(days == End)) %>%
# label the kept days; hard-codes exactly three days per query
mutate(col = c('D1', 'D2', 'D3')) %>%
select(-day, -i) %>%
spread(col, value)
data.table solution:
Here I use get to extract columns (e.g. Mon) from a data.table object.
library(data.table)
# NOTE(review): `days` (the vector of day names) is not defined in this
# snippet - presumably it comes from one of the other answers; confirm.
# Prepare data
base_df$Person <- rownames(base_df)
d <- merge(query_df, base_df, "Person", sort = FALSE)
setDT(d)
# Extract mid day (day between start and end)
# grouping by 1:nrow(d) evaluates the expression once per row
d[, Mid := days[which(Start == days) + 1], 1:nrow(d)]
# Extract columns using get
# per row, get() looks up the column whose name is stored in Start/Mid/End;
# the grouping column (named nrow) is dropped afterwards
d[, .(Person, Start, End,
D1 = get(Start), D2 = get(Mid), D3 = get(End)), 1:nrow(d)][, nrow := NULL][]
Person Start End D1 D2 D3
1: A Tue Thu 0.01 6.81 8.67
2: C Mon Wed 7.41 2.63 4.32
3: D Thu Sat 9.22 2.19 5.47
4: C Thu Sat 6.57 0.20 1.69
5: B Wed Fri 4.28 5.57 0.16
Base R solution:
# Order of days
days <- names(base_df)
# Order of persons
subjects <- rownames(base_df)
res <- apply(query_df, 1, function(x) {
# Extract observation between start:end date
foo <- base_df[x[1] == subjects, which(x[2] == days):which(x[3] == days)]
colnames(foo) <- paste0("D", 1:3)
foo})
# Merge with original query_df
res <- cbind(query_df, do.call("rbind", res))
rownames(res) <- NULL
res
A base solution using indexing with a numeric matrix:
# Row of base_df for each query, and column of each query's Start day
ri <- match(query_df$Person, rownames(base_df))
ci <- match(query_df$Start, names(base_df))
# Build a two-column (row, column) index: all first days, then all second
# days, then all third days, so the extracted values fill an n x 3 matrix
# column by column.
cbind(
  query_df,
  matrix(
    base_df[cbind(
      rep(ri, times = 3),
      rep(ci, times = 3) + rep(0:2, each = nrow(query_df))
    )],
    nrow = nrow(query_df),
    ncol = 3
  )
)
# Person Start End 1 2 3
# 1 A Tue Thu 0.01 6.81 8.67
# 2 C Mon Wed 7.41 2.63 4.32
# 3 D Thu Sat 9.22 2.19 5.47
# 4 C Thu Sat 6.57 0.20 1.69
# 5 B Wed Fri 4.28 5.57 0.16

Incrementally add seconds of a timestamp column grouped by ID in R

I have a dataframe that is essentially a time series data.
# Ten sample readings: five for ID "A" followed by five for ID "B"
Timestamp <- c(
  "1/27/2015 18:28:16", "1/27/2015 18:28:17", "1/27/2015 18:28:19",
  "1/27/2015 18:28:20", "1/27/2015 18:28:23",
  "1/28/2015 22:43:08", "1/28/2015 22:43:09", "1/28/2015 22:43:13",
  "1/28/2015 22:43:15", "1/28/2015 22:43:16"
)
ID <- rep(c("A", "B"), each = 5)
v1 <- c(1.70, 1.71, 1.77, 1.79, 1.63, 7.20, 7.26, 7.16, 7.18, 7.18)
df <- data.frame(Timestamp, ID, v1)
Timestamp ID v1
1/27/2015 18:28:16 A 1.70
1/27/2015 18:28:17 A 1.71
1/27/2015 18:28:19 A 1.77
1/27/2015 18:28:20 A 1.79
1/27/2015 18:28:23 A 1.63
1/28/2015 22:43:08 B 7.20
1/28/2015 22:43:09 B 7.26
1/28/2015 22:43:13 B 7.16
1/28/2015 22:43:15 B 7.18
1/28/2015 22:43:16 B 7.18
Since I don't really care about the timestamp, I was thinking of creating a column called interval to plot this data in one plot.
I am wrongly creating the interval column by doing this
# Asker's attempt (described above as wrong): bin Timestamp with cut() using "sec" breaks
df$interval <- cut(df$Timestamp, breaks="sec")
I want to incrementally add the "secs" of the timestamp and put it in the interval column, and this should be grouped by ID. By this I mean: every time there is a new ID, the interval column resets to 1 and then incrementally adds the timestamp (secs).
My desired output
Timestamp ID v1 Interval
1/27/2015 18:28:16 A 1.70 1
1/27/2015 18:28:17 A 1.71 2
1/27/2015 18:28:19 A 1.77 4
1/27/2015 18:28:20 A 1.79 5
1/27/2015 18:28:23 A 1.63 8
1/28/2015 22:43:08 B 7.20 1
1/28/2015 22:43:09 B 7.26 2
1/28/2015 22:43:13 B 7.16 6
1/28/2015 22:43:15 B 7.18 8
1/28/2015 22:43:16 B 7.18 9
I also would like to plot this using ggplot with interval vs v1 by ID and so we get 2 time series in the same plot. I will then extract features from it.
Please help me how to work around this problem so that I can apply it to a larger dataset.
One solution with data.table:
For the data:
library(data.table)
df <- as.data.table(df)
# Parse the month/day/year 24-hour strings; := replaces the column by
# reference instead of going through `$<-` on a data.table
df[, Timestamp := as.POSIXct(Timestamp, format = "%m/%d/%Y %H:%M:%S")]
# Seconds elapsed since each ID's first row, plus 1 so every ID starts at 1.
# Timestamp[1] is the first value within the group and avoids building .SD
# for every group. Assumes rows are already in time order within each ID.
df[, Interval := as.numeric(difftime(Timestamp, Timestamp[1], units = "secs")) + 1, by = ID]
which outputs:
> df
Timestamp ID v1 Interval
1: 2015-01-27 18:28:16 A 1.70 1
2: 2015-01-27 18:28:17 A 1.71 2
3: 2015-01-27 18:28:19 A 1.77 4
4: 2015-01-27 18:28:20 A 1.79 5
5: 2015-01-27 18:28:23 A 1.63 8
6: 2015-01-28 22:43:08 B 7.20 1
7: 2015-01-28 22:43:09 B 7.26 2
8: 2015-01-28 22:43:13 B 7.16 6
9: 2015-01-28 22:43:15 B 7.18 8
10: 2015-01-28 22:43:16 B 7.18 9
Then for ggplot:
library(ggplot2)
# Interval on x, v1 on y, one coloured line per ID
ggplot(df, aes(x=Interval, y=v1, color=ID)) + geom_line()
and the graph:

Why is the time series data being plotted backwards in R?

I am stuck on why this is happening and have tried searching everywhere for the answer. When I try to plot a timeseries object in R the resulting plot comes out in reverse.
I have the following code:
library(sqldf)
stock_prices <- read.csv('~/stockPrediction/input/REN.csv')
# normalise the header names to lower case
colnames(stock_prices) <- tolower(colnames(stock_prices))
# rename the 7th column (presumably the adjusted close - confirm against the CSV)
colnames(stock_prices)[7] <- 'adjusted_close'
# keep only the date and adjusted close columns
stock_prices <- sqldf('SELECT date, adjusted_close FROM stock_prices')
head(stock_prices)
date adjusted_close
1 2014-10-20 3.65
2 2014-10-17 3.75
3 2014-10-16 4.38
4 2014-10-15 3.86
5 2014-10-14 3.73
6 2014-10-13 4.09
tail(stock_prices)
date adjusted_close
1767 2007-10-15 8.99
1768 2007-10-12 9.01
1769 2007-10-11 9.02
1770 2007-10-10 9.06
1771 2007-10-09 9.06
1772 2007-10-08 9.08
But when I try the following code:
# NOTE(review): per the head()/tail() output the rows run newest-first, and
# ts() assigns times in row order, so the earliest time label is attached to
# the most recent price - this looks like the cause of the "reversed" plot.
# The 1772 daily rows are also declared as monthly (frequency=12) between
# 2007 and Oct 2014, so presumably only part of the data is used - confirm.
stock_prices_ts <- ts(stock_prices$adjusted_close, start=c(2007, 1), end=c(2014, 10), frequency=12)
plot(stock_prices_ts, col='blue', lwd=2, type='l')
However, the image that results is:
And even if I reverse the time series object with this code:
# rev() hands back a plain vector (see the answer below), so the x axis shows
# positions rather than dates
plot(rev(stock_prices_ts), col='blue', lwd=2, type='l')
I get this
which has arbitrary numbers.
Any idea why this is happening? Any help is much appreciated.
This happens because your object loses its time series structure once you apply the rev function.
For example :
set.seed(1)
# 100 monthly observations beginning July 1954: the running total of
# (1 + standard-normal draws rounded to two decimals)
gnp <- ts(
  data = cumsum(1 + round(rnorm(n = 100), digits = 2)),
  frequency = 12,
  start = c(1954, 7)
)
gnp ## a genuine ts object, so it prints as a year-by-month table
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
1954 0.37 1.55 1.71 4.31 5.64 5.82
1955 7.31 9.05 10.63 11.32 13.83 15.22 15.60 14.39 16.51 17.47 18.45 20.39
1956 22.21 23.80 25.72 27.50 28.57 27.58 29.20 30.14 30.98 30.51 31.03 32.45
1957
rev(gnp) ## the reversal is just a vector
[1] 110.91 110.38 110.60 110.17 110.45 108.89 106.30 104.60 102.44 ....
In general it is a little bit painful to manipulate the class ts. One idea is to use an xts object, which "generally" conserves its structure once you apply common operations on it.
Even though in this case the generic method rev is not implemented for an xts object, it is easy to coerce the resulting zoo time series to an xts one using as.xts.
# 2 x 2 grid of panels: ts, reversed ts, xts, reversed xts
par(mfrow = c(2, 2))
plot(gnp, main = "gnp", col = "red")
plot(rev(gnp), main = "rev(gnp)", col = "red", type = "l")
library(xts)
xts_gnp <- as.xts(gnp)
plot(xts_gnp)
## as.xts() is applied once more after rev(); without it the reversed
## object is no longer an xts
rev_xts_gnp <- as.xts(rev(as.xts(gnp)))
plot(rev_xts_gnp)

Change 15 minute data to daily mean in R

I have 15 minute data that I want to change into daily mean. I just listed the Columbia data below, but there are other sites (CR1 and CR2) where I didn't list that data. I put my code at the bottom. I get an error at
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M", tz = "EST"))
Error in as.POSIXct.default(d[, 1], format = "%Y-%m-%d %H:%M", tz = "EST") :
do not know how to convert 'd[, 1]' to class “POSIXct”"
I'm pretty new to R so I'm sorry if the answer is something incredibly simple and I should have caught it.
datetime Discharge Columbia
2014-01-19 22:00 6030 4.3
2014-01-19 22:15 5970 4.28
2014-01-19 22:30 5880 4.25
2014-01-19 22:45 5830 4.23
2014-01-19 23:00 5710 4.19
2014-01-19 23:15 5620 4.16
2014-01-19 23:30 5510 4.12
2014-01-19 23:45 5400 4.08
2014-01-20 00:00 5340 4.06
2014-01-20 00:15 5290 4.04
2014-01-20 00:30 5260 4.03
2014-01-20 00:45 5210 4.01
2014-01-20 01:00 5180 4
2014-01-20 01:15 4990 3.93
2014-01-20 01:30 4830 3.87
2014-01-20 01:45 4810 3.86
2014-01-20 02:00 4780 3.85
2014-01-20 02:15 4780 3.85
2014-01-20 02:30 4760 3.84
2014-01-20 02:45 4760 3.84
2014-01-20 03:00 4760 3.84
2014-01-20 03:15 4760 3.84
USGS_Columbia_Data <- read.csv("~/Desktop/R/USGS_Columbia_Data.csv",header=TRUE)
## daily averages of the data
library(xts)
# NOTE(review): single-bracket indexing (USGS_Columbia_Data[1]) yields a
# one-column data frame rather than a vector, so each element of this list is
# itself a data frame and d[,1] is not a plain vector when it reaches
# as.POSIXct() below (the reported error). row.names also declares only 3 rows.
d <- structure(list(datetime = (USGS_Columbia_Data[1]),
Columbia = (USGS_Columbia_Data[3]),
CR1 = (USGS_Columbia_Data[5]),
CR2 = (USGS_Columbia_Data[7])),
.Names = c("datetime", "Columbia", "CR1", "CR2"),
row.names = c(NA, -3L), class = "data.frame")
# index the value columns by the parsed timestamps, then average per day
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M", tz = "EST"))
apply.daily(x, colMeans)
The other answer works, apparently, but you can (and probably should) use xts for something like this. The problem is with your use of structure(...) to create the data frame. USGS_Columbia_Data is already a data frame. If you want to extract columns 1,3,5, and 7, do this:
# Keep columns 1, 3, 5 and 7 of the existing data frame and give them short names
d <- USGS_Columbia_Data[, c(1, 3, 5, 7)]
colnames(d) <- c("datetime", "Columbia", "CR1", "CR2")
You may not need the second line if USGS_Columbia_Data already has those column names. Having done that, you can create a date-indexed xts object as follows:
# Index the value columns by calendar date only
# NOTE(review): presumably the trailing HH:MM of datetime is ignored by this
# format - confirm
x <- xts(d[,-1], as.Date(d[,1], format="%Y-%m-%d"))
Then either of the following will work: (note I'm using the d from your example here).
# daily mean via xts's apply.daily(); expected result shown below
apply.daily(x,mean)
# Discharge Columbia
# 2014-01-19 5743.75 4.201250
# 2014-01-20 4965.00 3.918571
# alternative: group the index by as.Date and average within each group
aggregate(x,as.Date,mean)
# Discharge Columbia
# 2014-01-19 5743.75 4.201250
# 2014-01-20 4965.00 3.918571
will work.
If you want to leave the index as POSIXct, use this:
# Keep full date-times as the index; no tz is given, so the strings are parsed
# in the session's time zone
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M"))
apply.daily(x,mean)
# Discharge Columbia
# 2014-01-19 23:45:00 5743.75 4.201250
# 2014-01-20 03:15:00 4965.00 3.918571
But note the index is the last time on each date, not the date itself.
You could use cut and aggregate
# make certain datetime is class POSIXct
d$datetime <- as.POSIXct(d$datetime, tz='EST')
aggregate(list(Discharge = d$Discharge, Columbia = d$Columbia), list(time = cut(d$datetime, "1 day")), mean)
> aggregate(list(Discharge = d$Discharge, Columbia = d$Columbia), list(datetime = cut(t$datetime, "1 day")), mean)
time Discharge Columbia
1 2014-01-19 5743.75 4.201250
2 2014-01-20 4965.00 3.918571

Resources