I have a large data set, a sample is given below:
df <- data.frame(stringsAsFactors=FALSE,
Date = c("2015-10-26", "2015-10-26", "2015-10-26", "2015-10-26",
"2015-10-27", "2015-10-27", "2015-10-27"),
Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
Volume = c(6350170L, 2251288L, 3804239L, 5597684L, 5925519L, 2424679L,
5448863L)
)
The problem I am trying to solve is the missing data for NAB on 2015-10-27.
I want the last observed values to repeat themselves for the missing dates:
Date Ticker Open High Low Close Volume
2 2015-10-27 NAB 32.69 32.76 32.42 32.42 3804239
Any ideas on how to do this?
I have already tried gather + spread without success.
What if you tried something like this?
library(zoo)

# Build every Date/Ticker combination, left-join the observed rows onto
# it, then carry the last observation forward down the Ticker-ordered frame
res <- expand.grid(Date = unique(df$Date), Ticker = unique(df$Ticker))
res2 <- merge(res, df, all.x = TRUE)
res2 <- res2[order(res2$Ticker, res2$Date), ]
res3 <- na.locf(res2)
res3[order(res3$Date, res3$Ticker), ]
# Date Ticker Open High Low Close Volume
#1 2015-10-26 ANZ 29.11 29.17 28.89 28.90 6350170
#3 2015-10-26 CBA 77.89 77.93 77.37 77.50 2251288
#5 2015-10-26 NAB 32.69 32.76 32.42 32.42 3804239
#6 2015-10-26 WBC 31.87 31.92 31.71 31.84 5597684
#2 2015-10-27 ANZ 29.05 29.08 28.90 28.94 5925519
#4 2015-10-27 CBA 77.61 78.10 77.54 77.74 2424679
#8 2015-10-27 NAB 32.69 32.76 32.42 32.42 3804239
#7 2015-10-27 WBC 31.84 31.95 31.65 31.77 5448863
I'm assuming that if a Ticker/Date combination does not exist, you want to create it and carry the last observation forward (LOCF); that is what the expand.grid step does.
tidyr::complete and tidyr::fill are built just for this situation:
library(tidyverse)
df %>%
complete(Date,Ticker) %>%
arrange(Ticker) %>%
fill(names(.)) %>%
arrange(Date)
#
# # A tibble: 8 x 7
# Date Ticker Open High Low Close Volume
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
# 1 2015-10-26 ANZ 29.11 29.17 28.89 28.90 6350170
# 2 2015-10-26 CBA 77.89 77.93 77.37 77.50 2251288
# 3 2015-10-26 NAB 32.69 32.76 32.42 32.42 3804239
# 4 2015-10-26 WBC 31.87 31.92 31.71 31.84 5597684
# 5 2015-10-27 ANZ 29.05 29.08 28.90 28.94 5925519
# 6 2015-10-27 CBA 77.61 78.10 77.54 77.74 2424679
# 7 2015-10-27 NAB 32.69 32.76 32.42 32.42 3804239
# 8 2015-10-27 WBC 31.84 31.95 31.65 31.77 5448863
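One caveat: fill() simply walks down the rows, so with only arrange(Ticker) a ticker whose first date is missing would inherit the previous ticker's values. A grouped sketch that avoids this:

library(tidyverse)

df %>%
  complete(Date, Ticker) %>%
  group_by(Ticker) %>%
  arrange(Date, .by_group = TRUE) %>%
  fill(Open, High, Low, Close, Volume, .direction = "down") %>%
  ungroup() %>%
  arrange(Date)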
Another potential solution (Note: I had to convert your date vector to Date format, but this could be reversed in the final output):
library(tidyr)
library(dplyr)
df <- data.frame(stringsAsFactors=FALSE,
Date = as.Date(c("2015-10-26", "2015-10-26", "2015-10-26", "2015-10-26",
"2015-10-27", "2015-10-27", "2015-10-27")),
Ticker = c("ANZ", "CBA", "NAB", "WBC", "ANZ", "CBA", "WBC"),
Open = c(29.11, 77.89, 32.69, 31.87, 29.05, 77.61, 31.84),
High = c(29.17, 77.93, 32.76, 31.92, 29.08, 78.1, 31.95),
Low = c(28.89, 77.37, 32.42, 31.71, 28.9, 77.54, 31.65),
Close = c(28.9, 77.5, 32.42, 31.84, 28.94, 77.74, 31.77),
Volume = c(6350170L, 2251288L, 3804239L, 5597684L, 5925519L, 2424679L,
5448863L))
tickers <- unique(df$Ticker)
dates <- as.Date(df$Date)

# All Date/Ticker combinations, then the combinations absent from df
possibilities <- unique(expand.grid(dates, tickers))
colnames(possibilities) <- c('Date', 'Ticker')
missing <- anti_join(possibilities, df[, c('Date', 'Ticker')])

# Fill each missing combination with the previous day's row for that
# ticker (this assumes the previous day is present in df)
if (nrow(missing) > 0) {
  replacement <- cbind(missing,
                       filter(df, Date == missing$Date - 1, Ticker == missing$Ticker)[, 3:7])
}
final <- arrange(rbind(df, replacement), Date)
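As a quick check (a sketch), the filled row should match the expected output from the question:

final[final$Date == as.Date("2015-10-27") & final$Ticker == "NAB", ]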
I have a tibble with more than 30,000 rows containing ID columns, dates and acts. Sometimes two acts are registered for an ID on the very same day. In that case I would like to aggregate those acts.
Please find below a minimal working example (MWE):
id <- c("N02", "N02", "N02", "N02", "N03", "N03", "N05", "N05", "N05", "N05")
date_1 <- c("2008-03-15", "2008-03-15", "2008-04-15", "2008-04-15", "2008-06-15", "2008-07-15", "2014-03-06", "2014-03-07", "2014-03-07", "2014-03-13")
act <- c("YYYY050", "ZZNL040", "YYYY050", "ZZNL040", "ZZNL040", "ZZNL040", "ZZNL065", "ZZNL065", "ZZNL040", "ZZNL065")
date_2 <- c("2008-03-15", "2008-03-15", "2008-04-15", "2008-04-15", "2008-06-15", "2008-07-15", "2014-03-06", "2014-03-07", "2014-03-07", "2014-03-13")
df1 <- data.frame(id, date_1, act, date_2)
df1
id date_1 act date_2
1 N02 2008-03-15 YYYY050 2008-03-15
2 N02 2008-03-15 ZZNL040 2008-03-15
3 N02 2008-04-15 YYYY050 2008-04-15
4 N02 2008-04-15 ZZNL040 2008-04-15
5 N03 2008-06-15 ZZNL040 2008-06-15
6 N03 2008-07-15 ZZNL040 2008-07-15
7 N05 2014-03-06 ZZNL065 2014-03-06
8 N05 2014-03-07 ZZNL065 2014-03-07
9 N05 2014-03-07 ZZNL040 2014-03-07
10 N05 2014-03-13 ZZNL065 2014-03-13
In the present MWE, ID N02 has acts YYYY050 and ZZNL040 on the same day, 2008-03-15.
In such cases of multiple acts on the very same day, I would like to aggregate the act values. I need to preserve the tibble structure, i.e. date_1 and date_2, which are sometimes different and needed for further calculations; that is why I use dplyr.
My desired output would be:
id <- c("N02", "N02", "N03", "N03", "N05", "N05", "N05")
date_1 <- c("2008-03-15", "2008-04-15", "2008-06-15", "2008-07-15", "2014-03-06", "2014-03-07", "2014-03-13")
act <- c("YYYY050/ZZNL040", "YYYY050/ZZNL040", "ZZNL040", "ZZNL040", "ZZNL065", "ZZNL065/ZZNL040", "ZZNL065")
date_2 <- c("2008-03-15", "2008-04-15", "2008-06-15", "2008-07-15", "2014-03-06", "2014-03-07", "2014-03-13")
df2 <- data.frame(id, date_1, act, date_2)
df2
id date_1 act date_2
1 N02 2008-03-15 YYYY050/ZZNL040 2008-03-15
2 N02 2008-04-15 YYYY050/ZZNL040 2008-04-15
3 N03 2008-06-15 ZZNL040 2008-06-15
4 N03 2008-07-15 ZZNL040 2008-07-15
5 N05 2014-03-06 ZZNL065 2014-03-06
6 N05 2014-03-07 ZZNL065/ZZNL040 2014-03-07
7 N05 2014-03-13 ZZNL065 2014-03-13
Any idea?
Thank you in advance for your help.
Charles.
You could do:
library(dplyr)
df1 %>%
  group_by(id, date_1, date_2) %>%
  summarise(act = paste(act, collapse = '/')) %>%
  select(1, 2, 4, 3)  # restore the original column order
#> # A tibble: 7 x 4
#> # Groups: id, date_1 [7]
#> id date_1 act date_2
#> <chr> <chr> <chr> <chr>
#> 1 N02 2008-03-15 YYYY050/ZZNL040 2008-03-15
#> 2 N02 2008-04-15 YYYY050/ZZNL040 2008-04-15
#> 3 N03 2008-06-15 ZZNL040 2008-06-15
#> 4 N03 2008-07-15 ZZNL040 2008-07-15
#> 5 N05 2014-03-06 ZZNL065 2014-03-06
#> 6 N05 2014-03-07 ZZNL065/ZZNL040 2014-03-07
#> 7 N05 2014-03-13 ZZNL065 2014-03-13
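If the positional select(1, 2, 4, 3) feels fragile, a sketch of the same result selected by name (this assumes dplyr >= 1.0 for relocate() and the .groups argument):

library(dplyr)

df1 %>%
  group_by(id, date_1, date_2) %>%
  summarise(act = paste(act, collapse = '/'), .groups = 'drop') %>%
  relocate(act, .before = date_2)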
I have a dataset from a source that uses a special compression algorithm. Simply put, new measurements are recorded only when the change in slope (rate of change) is greater than a certain percentage (say 5%).
However, for the analysis I'm currently carrying out, I need values at regular intervals. I am able to carry out piecewise interpolation using approx, approxfun or spline for individual variables vs. time (tme in the data below), but I'd like to do it for all variables (columns of the data.table) in a single shot.
library(data.table)
q = setDT(
structure(list(tme = structure(c(1463172120, 1463173320, 1463175720,
1463180520, 1463182920, 1463187720, 1463188920, 1463190120, 1463191320,
1463192520, 1463202180, 1463203380, 1463204580, 1463205780, 1463206980,
1463208180, 1463218980, 1463233440, 1463244240, 1463245440, 1463246640,
1463247840, 1463249040, 1463250240, 1463251440, 1463252640, 1463253840,
1463255040, 1463256240, 1463316360, 1463317560, 1463318760, 1463319960,
1463321160, 1463322360, 1463323560, 1463324760, 1463325960, 1463327160,
1463328360, 1463329560, 1463330760, 1463331960), class = c("POSIXct",
"POSIXt"), tzone = "America/Montreal"), rh = c(50.36, 47.31,
46.39, 46.99, 47.89, 50.37, 51.29, 51.92, 54.97, 67.64, 69.38,
68.96, 69.89, 56.66, 51.23, 55.38, 64.36, 50.72, 31.33, 31.38,
32.65, 33.15, 33.05, 31.87, 32.58, 32.65, 31.06, 29.82, 28.72,
67.95, 66.68, 64.66, 62.12, 59.86, 58.11, 57.41, 56.5, 56.16,
55.69, 54.57, 53.89, 53.81, 52.01), degc = c(30.0055555555556,
30.3611111111111, 30.6611111111111, 30.5833333333333, 30.2666666666667,
28.6888888888889, 28.2555555555556, 28.0722222222222, 27.4944444444444,
25.0722222222222, 24.8111111111111, 24.7166666666667, 24.1666666666667,
25.4111111111111, 25.5222222222222, 24.3555555555556, 22.7722222222222,
25.5222222222222, 27.8111111111111, 27.9888888888889, 28.0277777777778,
28.1333333333333, 28.5333333333333, 28.7, 28.85, 29.1555555555556,
28.8388888888889, 29.5111111111111, 29.6722222222222, 22.3888888888889,
22.5722222222222, 22.9444444444444, 23.3722222222222, 23.6777777777778,
23.8777777777778, 24.2055555555556, 24.6888888888889, 24.9777777777778,
25.3888888888889, 25.8, 26.1, 26.1555555555556, 26.7388888888889
)), .Names = c("tme", "rh", "degc"), row.names = c(NA, -43L), class = c("data.table",
"data.frame")))
q is my queried dataset. Here's what works for individual variables (degc in this example):
interpolate_degc <- approxfun(x = q$tme, y = q$degc, method = "linear")
# To get the uniform samples:
width <- "10 mins"
new_times <- seq.POSIXt(from = q$tme[1], to = q$tme[nrow(q)], by = width)
new_degc <- interpolate_degc(new_times)
I'd like to do this for all variables in a single shot, preferably using data.table.
This seems to work:
cols = c("rh", "degc")

# Join q onto a regular 10-minute grid (unmatched times get NA), then
# overwrite each value column with its linear interpolant at those times
DT = q[.(seq(min(tme), max(tme), by="10 mins")), on=.(tme)]
DT[, (cols) := lapply(cols, function(z) with(q,
  approxfun(x = tme, y = get(z), method = "linear")
)(tme))]
tme rh degc
1: 2016-05-13 16:42:00 50.360 30.00556
2: 2016-05-13 16:52:00 48.835 30.18333
3: 2016-05-13 17:02:00 47.310 30.36111
4: 2016-05-13 17:12:00 47.080 30.43611
5: 2016-05-13 17:22:00 46.850 30.51111
---
263: 2016-05-15 12:22:00 54.026 26.04000
264: 2016-05-15 12:32:00 53.866 26.11667
265: 2016-05-15 12:42:00 53.826 26.14444
266: 2016-05-15 12:52:00 53.270 26.33056
267: 2016-05-15 13:02:00 52.370 26.62222
Generally when you want to iterate over columns, lapply or Map will work.
How it works: Inside the with(q, ...), tme and get(z) refer to columns of q, but outside of it, we're looking at columns of DT (in this case just tme).
Another way of doing the same thing:
q[, {
tt = seq(min(tme), max(tme), by="10 mins")
c(
.(tme = tt),
lapply(.SD, function(z) approxfun(x = tme, y = z, method="linear")(tt))
)
}, .SDcols=cols]
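As a quick sanity check, DT should reproduce the single-variable result from the question (a sketch reusing interpolate_degc from there):

all.equal(DT$degc, interpolate_degc(DT$tme))  # should be TRUE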
For time series I like to use specialized packages like xts and zoo:
library(xts)
# q is a data.table, so pass the index as a plain vector via q$tme
# (q[, 1] would return a one-column data.table); new_times is from the question
ts <- merge(xts(x = q[, -1], order.by = q$tme), new_times)
head(ts)
#> rh degc
#> 2016-05-13 16:42:00 50.36 30.00556
#> 2016-05-13 16:52:00 NA NA
#> 2016-05-13 17:02:00 47.31 30.36111
#> 2016-05-13 17:12:00 NA NA
#> 2016-05-13 17:22:00 NA NA
#> 2016-05-13 17:32:00 NA NA
head(na.approx(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.360 30.00556
#> 2016-05-13 16:52:00 48.835 30.18333
#> 2016-05-13 17:02:00 47.310 30.36111
#> 2016-05-13 17:12:00 47.080 30.43611
#> 2016-05-13 17:22:00 46.850 30.51111
#> 2016-05-13 17:32:00 46.620 30.58611
head(na.spline(ts))
#> rh degc
#> 2016-05-13 16:42:00 50.36000 30.00556
#> 2016-05-13 16:52:00 48.52407 30.20524
#> 2016-05-13 17:02:00 47.31000 30.36111
#> 2016-05-13 17:12:00 46.62601 30.47791
#> 2016-05-13 17:22:00 46.33972 30.56219
#> 2016-05-13 17:32:00 46.30857 30.62093
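If you need a plain data.table back at the end, one possible sketch:

library(data.table)

ts_filled <- na.approx(ts)
out <- data.table(tme = index(ts_filled), coredata(ts_filled))
head(out)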
I would like to match the values of two tables based on the following spatial and time conditions:
Two-hour time interval
Minimum distance (min(dist)), with a maximum distance of 20 km
I can solve the problem using for loops, but since the tables' dimensions are 3,000,000 x 10 and 100,000 x 13, it takes too long to complete.
Do you have any suggestions? I post a practical example and the desired output below. Thank you.
Example
DT1 <- data.table(
Date = as.POSIXct(c("2005-01-05 10:40:00", "2005-01-06 10:40:00", "2005-01-07 10:40:00", "2005-01-08 10:40:00", "2005-01-09 10:40:00", "2005-01-10 10:40:00"), format = "%Y-%m-%d %T", tz = "GMT"),
Lat = c(rep(50, 3), 35.44, 25.44, 15.44),
Lon = c(rep(-50, 3), -10.44, -20.44, -30.44),
Other.col = sample(LETTERS, 6))
DT2 <- data.table(
Date = as.POSIXct(c("2011-01-01 10:40:00", "2005-01-05 11:40:00", "2005-01-09 08:59:00", "2005-01-09 09:18:00", "2005-01-10 08:59:00"), format = "%Y-%m-%d %T", tz = "GMT"),
Lat = c(35.44, 1, 25.54, 25.43, 15.46),
Lon = c(-10.44, 1, -20.66, -20.42, -30.13),
Quality = c("h", "f", "n", "z", "l"))
DT1
Date Lat Lon Other.col
1: 2005-01-05 10:40:00 50.00 -50.00 E
2: 2005-01-06 10:40:00 50.00 -50.00 C
3: 2005-01-07 10:40:00 50.00 -50.00 O
4: 2005-01-08 10:40:00 35.44 -10.44 Z
5: 2005-01-09 10:40:00 25.44 -20.44 T
6: 2005-01-10 10:40:00 15.44 -30.44 S
DT2
Date Lat Lon Quality
1: 2011-01-01 10:40:00 35.44 -10.44 h
2: 2005-01-05 11:40:00 1.00 1.00 f
3: 2005-01-09 08:59:00 25.54 -20.66 n
4: 2005-01-09 09:18:00 25.43 -20.42 z
5: 2005-01-10 08:59:00 15.46 -30.13 l
Output
Date Lat Lon Other.col V2
1: 2005-01-05 10:40:00 50.00 -50.00 E NA
2: 2005-01-06 10:40:00 50.00 -50.00 C NA
3: 2005-01-07 10:40:00 50.00 -50.00 O NA
4: 2005-01-08 10:40:00 35.44 -10.44 Z NA
5: 2005-01-09 10:40:00 25.44 -20.44 T z
6: 2005-01-10 10:40:00 15.44 -30.44 S l
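A brute-force sketch of the stated rules (candidates within two hours, nearest one kept if within 20 km); it assumes the geosphere package for great-circle distances, and while it is fine for the toy example it is not tuned for millions of rows:

library(geosphere)  # assumed dependency, for distHaversine()

match_one <- function(d, lat, lon) {
  # candidates in DT2 within +/- 2 hours of the DT1 timestamp
  cand <- DT2[abs(difftime(Date, d, units = "hours")) <= 2]
  if (nrow(cand) == 0L) return(NA_character_)
  dists <- distHaversine(c(lon, lat), cbind(cand$Lon, cand$Lat))  # metres
  if (min(dists) > 20000) return(NA_character_)
  cand$Quality[which.min(dists)]
}

DT1[, V2 := mapply(match_one, Date, Lat, Lon)]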
Data:
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", "NA", "2013-06-04", "2014-01-03", "NA", "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
Expected Outcome:
DB1 <- data.frame(orderItemID = 1:10,
orderDate= c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", "2013-04-14", "2013-06-04", "2014-01-03", "2014-02-21", "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
My question is similar to another one I posted, so don't be confused.
As you can see above, I have some missing values in the delivery dates, and I want to replace them with another date: the order date of the specific item plus the average delivery time in (full) days (2 days).
The average delivery time is computed from all samples that do not contain missing values: (2 + 1 + 3 + 2 + 1 + 2 + 1 + 2) / 8 = 1.75 days, which rounds to 2 days.
So I want to replace each NA in deliveryDate with the order date + 2 days. When there is no NA, the date should stay the same.
I tried this already (with lubridate), but it's not working :(
DB1$deliveryDate[is.na(DB1$deliveryDate)] <- DB1$orderDate + days(2)
Can someone please help me?
First, convert the columns to Date objects (this assumes the NAs are real NA values rather than the string "NA"; see the last answer below):
DB1[,2:3]<-lapply(DB1[,2:3],as.Date)
Then, replace the NA elements:
DB1$deliveryDate[is.na(DB1$deliveryDate)] <-
DB1$orderDate[is.na(DB1$deliveryDate)] +
mean(difftime(DB1$orderDate,DB1$deliveryDate,units="days"),na.rm=TRUE)
# orderItemID orderDate deliveryDate
#1 1 2013-01-21 2013-01-23
#2 2 2013-03-31 2013-03-01
#3 3 2013-04-12 2013-04-14
#4 4 2013-06-01 2013-06-04
#5 5 2014-01-01 2014-01-03
#6 6 2014-02-19 2014-02-21
#7 7 2014-02-27 2014-02-28
#8 8 2014-10-02 2014-10-04
#9 9 2014-10-31 2014-11-01
#10 10 2014-11-21 2014-11-23
You can do:
DB1 = cbind(DB1$orderItemID, as.data.frame(lapply(DB1[-1], as.Date)))  # convert the date columns to Date class
days = round(mean(DB1$deliveryDate - DB1$orderDate, na.rm = TRUE))     # average delivery time, rounded to full days
mask = is.na(DB1$deliveryDate)
DB1$deliveryDate[mask] = DB1$orderDate[mask] + days
# DB1$orderItemID orderDate deliveryDate
#1 1 2013-01-21 2013-01-23
#2 2 2013-03-31 2013-04-01
#3 3 2013-04-12 2013-04-14
#4 4 2013-06-01 2013-06-04
#5 5 2014-01-01 2014-01-03
#6 6 2014-02-19 2014-02-21
#7 7 2014-02-27 2014-02-28
#8 8 2014-10-02 2014-10-04
#9 9 2014-10-31 2014-11-01
#10 10 2014-11-21 2014-11-23
I rearranged your data since it was not clean:
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-04-01", NA, "2013-06-04", "2014-01-03", NA, "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"))
Assuming that you have entered your data like this (note that NAs are not enclosed in quotes so they are read as NAs and not "NA")...
DB1 <- data.frame(orderItemID = 1:10,
orderDate = c("2013-01-21","2013-03-31","2013-04-12","2013-06-01","2014-01-01", "2014-02-19","2014-02-27","2014-10-02","2014-10-31","2014-11-21"),
deliveryDate = c("2013-01-23", "2013-03-01", NA, "2013-06-04", "2014-01-03", NA, "2014-02-28", "2014-10-04", "2014-11-01", "2014-11-23"),
stringsAsFactors = FALSE)
...and, per Nicola's answer, done this to get the formatting right...
DB1[,2:3]<-lapply(DB1[,2:3],as.Date)
...this also works:
library(lubridate)
DB1$deliveryDate <- with(DB1, as.Date(
  ifelse(is.na(deliveryDate), orderDate + days(2), deliveryDate),
  origin = "1970-01-01"))
Or you could use dplyr and pipe it:
library(lubridate)
library(dplyr)
DB2 <- DB1 %>%
mutate(deliveryDate = ifelse(is.na(deliveryDate), orderDate + days(2), deliveryDate)) %>%
mutate(deliveryDate = as.Date(.[,"deliveryDate"], origin = "1970-01-01"))
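On newer dplyr versions you could also avoid the class-dropping of base ifelse altogether with the type-stable if_else() (a sketch, assuming the columns were already converted to Date as above):

library(dplyr)
library(lubridate)

DB2 <- DB1 %>%
  mutate(deliveryDate = if_else(is.na(deliveryDate),
                                orderDate + days(2),
                                deliveryDate))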
I have electricity sensor readings at a 15-minute interval, but the start time is not fixed. For example, on one day it starts at minute 13, while another day starts at a different minute:
dateTime KW
1/1/2013 1:13 34.70
1/1/2013 1:28 43.50
1/1/2013 1:43 50.50
1/1/2013 1:58 57.50
...
(here the readings start from minute 02)
1/30/2013 0:02 131736.30
1/30/2013 0:17 131744.30
1/30/2013 0:32 131751.10
1/30/2013 0:47 131759.00
I have data for one year, and I need a regular 30-minute interval starting from midnight (00:00).
I am new to R. Can anyone help me?
Maybe you can try:
# Parse the timestamps, then cut into 30-minute bins. Padding the vector
# with midnight of the first day and midnight after the last day anchors
# the breaks at 00:00.
dT <- as.POSIXct(strptime(df$dateTime, '%m/%d/%Y %H:%M'))
grp <- as.POSIXct(cut(c(as.POSIXct(gsub(' +.*', '', min(dT))), dT,
                        as.POSIXct(gsub(' +.*', '', max(dT) + 24*3600))), breaks = '30 min'))
df$grp <- grp[-c(1, length(grp))]  # drop the two padding entries
df
# dateTime KW grp
#1 1/1/2013 1:13 34.7 2013-01-01 01:00:00
#2 1/1/2013 1:28 43.5 2013-01-01 01:00:00
#3 1/1/2013 1:43 50.5 2013-01-01 01:30:00
#4 1/1/2013 1:58 57.5 2013-01-01 01:30:00
#5 1/30/2013 0:02 131736.3 2013-01-30 00:00:00
#6 1/30/2013 0:17 131744.3 2013-01-30 00:00:00
#7 1/30/2013 0:32 131751.1 2013-01-30 00:30:00
#8 1/30/2013 0:47 131759.0 2013-01-30 00:30:00
Data:
df <- structure(list(dateTime = c("1/1/2013 1:13", "1/1/2013 1:28",
"1/1/2013 1:43", "1/1/2013 1:58", "1/30/2013 0:02", "1/30/2013 0:17",
"1/30/2013 0:32", "1/30/2013 0:47"), KW = c(34.7, 43.5, 50.5,
57.5, 131736.3, 131744.3, 131751.1, 131759)), .Names = c("dateTime",
"KW"), class = "data.frame", row.names = c(NA, -8L))