data<-c(10.0,11.1,12.3,13.2,14.8,15.6,16.7,17.5,18.9,19.7,20.7,21.1,22.6,23.5,24.9,25.1,26.3,27.8,28.8,29.6,30.2,31.6,32.1,33.7)
startDate <- '2013-01-01'
endDate <- '2013-01-01'
df <- ts(cbind(data, startDate, endDate))
df
################
# Fraction of the observations intended for the training split.
smp_size <- 0.80
# NOTE(review): length(df) counts every cell of the 3-column series (72),
# not its 24 rows, so train_ind becomes 57.6; nrow(df) gives the row count.
train_ind <- length(df) * smp_size
train_split <- seq(from = 1, to = train_ind)
test_split <- seq(from = train_ind +1, to = length(df))
# `data` has only 24 elements, so positions 25..57 of train_split yield NA.
train <- data[train_split]
# NOTE(review): a negative index removes the listed positions instead of
# selecting them, so `test` ends up holding all 24 original values.
test <- data[-test_split]
(c(train, test))
I have the above data and I am trying to split it into time series splits, i.e. the first 80% as training and the remaining 20% as testing.
I keep getting weird results:
(c(train, test))
[1] 10.0 11.1 12.3 13.2 14.8 15.6 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2
[22] 31.6 32.1 33.7 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
[43] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 10.0 11.1 12.3 13.2 14.8 15.6
[64] 16.7 17.5 18.9 19.7 20.7 21.1 22.6 23.5 24.9 25.1 26.3 27.8 28.8 29.6 30.2 31.6 32.1 33.7
Why are there NA values in the middle of the data?
You should use nrow(df), not length(df) for time-series objects.
data <- c(10.0, 11.1, 12.3, 13.2, 14.8, 15.6, 16.7, 17.5, 18.9,
19.7, 20.7, 21.1, 22.6, 23.5, 24.9, 25.1, 26.3, 27.8,
28.8, 29.6, 30.2, 31.6, 32.1, 33.7)
startDate <- '2013-01-01'
endDate <- '2013-01-01'
df <- ts(cbind(data, startDate, endDate))
# Split on rows (nrow), not on cells (length). The cut point is rounded down
# explicitly instead of relying on `:` truncating a fractional bound (19.2).
n_train <- floor(nrow(df) * .8)
# A logical mask keeps both halves consistent and stays correct even when
# n_train is 0 (where 1:0 would count backwards).
is_train <- seq_len(nrow(df)) <= n_train
train <- df[is_train, ]
test <- df[!is_train, ]
> all.equal(df, ts(rbind(train, test)))
[1] TRUE
> length(df)
[1] 72
> nrow(df)
[1] 24
Calculate the number of rows to include in the training set and use the window function to subset the time series:
# Number of leading observations that form the training set (80%, rounded up).
train_size <- ceiling(nrow(df) * 0.8)
# window() cuts on the series' time axis, not on row numbers, so translate the
# row positions into time stamps first. For a series starting at 1 with
# frequency 1 both coincide; for any other start/frequency this keeps the
# split at the intended rows.
obs_times <- time(df)
train_set <- window(df, end = obs_times[train_size])
test_set <- window(df, start = obs_times[train_size + 1])
train_set
#Time Series:
#Start = 1
#End = 20
#Frequency = 1
# data startDate endDate
# 1 10 2013-01-01 2013-01-01
# 2 11.1 2013-01-01 2013-01-01
# 3 12.3 2013-01-01 2013-01-01
# 4 13.2 2013-01-01 2013-01-01
# 5 14.8 2013-01-01 2013-01-01
# 6 15.6 2013-01-01 2013-01-01
# 7 16.7 2013-01-01 2013-01-01
# 8 17.5 2013-01-01 2013-01-01
# 9 18.9 2013-01-01 2013-01-01
#10 19.7 2013-01-01 2013-01-01
#11 20.7 2013-01-01 2013-01-01
#12 21.1 2013-01-01 2013-01-01
#13 22.6 2013-01-01 2013-01-01
#14 23.5 2013-01-01 2013-01-01
#15 24.9 2013-01-01 2013-01-01
#16 25.1 2013-01-01 2013-01-01
#17 26.3 2013-01-01 2013-01-01
#18 27.8 2013-01-01 2013-01-01
#19 28.8 2013-01-01 2013-01-01
#20 29.6 2013-01-01 2013-01-01
test_set
#Time Series:
#Start = 21
#End = 24
#Frequency = 1
# data startDate endDate
#21 30.2 2013-01-01 2013-01-01
#22 31.6 2013-01-01 2013-01-01
#23 32.1 2013-01-01 2013-01-01
#24 33.7 2013-01-01 2013-01-01
Related
This question already has an answer here:
Using Reshape from wide to long in R [closed]
(1 answer)
Closed 2 years ago.
Basically, TTR computes technical indicators for a ticker, and it expects the data in a vertical (long) layout like this:
Date Open High Low Close
2014-05-16 16.83 16.84 16.63 16.71
2014-05-19 16.73 16.93 16.66 16.80
2014-05-20 16.80 16.81 16.58 16.70
but my data frame is like:
Sdate Edate Tickers Open_1 Open_2 Open_3 High_1 High_2 High_3 Low_1 Low_2 Low_3 Close_1 Close_2 Close_3
2014-05-16 2014-07-21 TK 31.6 31.8 32.2 32.4 32.4 33.0 31.1 31.5 32.1 32.1 32.1 32.7
2014-05-17 2014-07-22 TGP 25.1 24.8 25.0 25.1 25.3 25.8 24.1 24.4 24.9 24.8 25.0 25.6
2014-05-18 2014-07-23 DNR 3.4 3.5 3.8 3.6 3.8 4.1 3.3 3.5 3.8 3.5 3.7 3.9
As you can see, I have multiple tickers and time ranges. I went through the TTR package documentation, and it does not explain how to compute technical indicators from data that is laid out horizontally and contains multiple tickers. My original data covers 50 days and thousands of tickers. I understand that I need to build a list with one element per ticker, but I'm not sure how to do this. How can I achieve this?
You can get data in vertical shape by using pivot_longer :
# Stack the Open_1 ... Close_3 columns into one row per ticker and day number.
# ".value" keeps the part before "_" (Open, High, Low, Close) as separate value
# columns; the part after "_" is stored in the new column `num`.
out <- tidyr::pivot_longer(
  df,
  cols = !c(Sdate, Edate, Tickers),
  names_sep = "_",
  names_to = c(".value", "num")
)
out
# A tibble: 9 x 8
# Sdate Edate Tickers num Open High Low Close
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#1 2014-05-16 2014-07-21 TK 1 31.6 32.4 31.1 32.1
#2 2014-05-16 2014-07-21 TK 2 31.8 32.4 31.5 32.1
#3 2014-05-16 2014-07-21 TK 3 32.2 33 32.1 32.7
#4 2014-05-17 2014-07-22 TGP 1 25.1 25.1 24.1 24.8
#5 2014-05-17 2014-07-22 TGP 2 24.8 25.3 24.4 25
#6 2014-05-17 2014-07-22 TGP 3 25 25.8 24.9 25.6
#7 2014-05-18 2014-07-23 DNR 1 3.4 3.6 3.3 3.5
#8 2014-05-18 2014-07-23 DNR 2 3.5 3.8 3.5 3.7
#9 2014-05-18 2014-07-23 DNR 3 3.8 4.1 3.8 3.9
If you want to split the above data into list of dataframes based on Ticker you can use split.
split(out, out$Tickers)
data
df <- structure(list(Sdate = c("2014-05-16", "2014-05-17", "2014-05-18"
), Edate = c("2014-07-21", "2014-07-22", "2014-07-23"), Tickers = c("TK",
"TGP", "DNR"), Open_1 = c(31.6, 25.1, 3.4), Open_2 = c(31.8,
24.8, 3.5), Open_3 = c(32.2, 25, 3.8), High_1 = c(32.4, 25.1,
3.6), High_2 = c(32.4, 25.3, 3.8), High_3 = c(33, 25.8, 4.1),
Low_1 = c(31.1, 24.1, 3.3), Low_2 = c(31.5, 24.4, 3.5), Low_3 = c(32.1,
24.9, 3.8), Close_1 = c(32.1, 24.8, 3.5), Close_2 = c(32.1,
25, 3.7), Close_3 = c(32.7, 25.6, 3.9)),
class = "data.frame", row.names = c(NA, -3L))
I was trying to forecast a time series problem using lm() and my data looks like below
Customer_key date sales
A35 2018-05-13 31
A35 2018-05-20 20
A35 2018-05-27 43
A35 2018-06-03 31
BH22 2018-05-13 60
BH22 2018-05-20 67
BH22 2018-05-27 78
BH22 2018-06-03 55
Converted my df to a list format by
df <- dcast(df, date ~ customer_key,value.var = c("sales"))
df <- subset(df, select = -c(dt))
demandWithKey <- as.list(df)
Trying to write a function such that applying this function across all customers
# Intended per-customer worker: fit a linear model, forecast 20 steps from its
# fitted values, and return fitted values followed by the forecast means as a
# one-column data frame.
# NOTE(review): the argument `x` is never used in the body; the model is
# fitted on the global `df` for every element passed in.
my_fun <- function(x) {
fit <- lm(ds_load ~ date, data=df) ## After changing to list ds_load and date column names
## are no longer available for formula
fit_b <- forecast(fit$fitted.values, h=20) ## forecast using lm()
return(data.frame(c(fit$fitted.values, fit_b[["mean"]])))
}
fcast <- lapply(df, my_fun)
I know the above function doesn't work, but basically I'm looking for getting both the fitted values and forecasted values for a grouped data.
I've also tried other approaches, such as tslm() (after converting to time series data), but with no luck; I can get lm() to work on just one customer, though. Also, many questions/posts only cover fitting the model, but I would like to forecast at the same time.
lm() fits a regression model,
but here you have a time series, so to forecast it you should use one of the time series models (ARMA, ARCH, GARCH, ...).
For example, you can use the auto.arima() function from the "forecast" package in R.
I don't know what you're up to exactly, but you could make this less complicated.
Using by avoids the need to reshape your data, it splits your data e.g. by customer ID as in your case and applies a function on the subsets (i.e. it's a combination of split and lapply; see ?by).
Since you want to compare fitted and forecasted values somehow in your result, you probably need predict rather than $fitted.values, otherwise the values won't be of same length. Because your independent variable is a date in weekly intervals, you may use seq.Date and take the first date as a starting value; the sequence has length actual values (nrow each customer) plus h= argument of the forecast.
For demonstration purposes I add the fitted values as first column in the following.
# For every customer: fit a linear trend of sales on date, then extend it
# H weeks ahead with both predict() and forecast(). Each element of `res` is a
# matrix with columns fit.na (fitted values padded with NA), pred and fcst.
res <- by(dat, dat$cus_key, function(x) {
  H <- 20  # forecast horizon in weeks; also passed as `h` to forecast()
  # The fitted values are treated as a series below, so the rows must be in
  # chronological order; do not rely on the input already being sorted.
  x <- x[order(x$date), ]
  fit <- lm(sales ~ date, data = x)
  fitted <- fit$fitted.values
  # Weekly dates covering the observed period plus H future weeks.
  new_dates <- seq(min(x$date), length.out = nrow(x) + H, by = "week")
  pred <- predict(fit, newdata = data.frame(date = new_dates))
  fcst <- c(fitted, forecast(fitted, h = H)$mean)
  # Pad the fitted values with NA up to the prediction length (for demonstration).
  fit.na <- `length<-`(unname(fitted), length(pred))
  cbind(fit.na, pred, fcst)
})
Result
res
# dat$cus_key: A28
# fit.na pred fcst
# 1 41.4 41.4 41.4
# 2 47.4 47.4 47.4
# 3 53.4 53.4 53.4
# 4 59.4 59.4 59.4
# 5 65.4 65.4 65.4
# 6 NA 71.4 71.4
# 7 NA 77.4 77.4
# 8 NA 83.4 83.4
# 9 NA 89.4 89.4
# 10 NA 95.4 95.4
# 11 NA 101.4 101.4
# 12 NA 107.4 107.4
# 13 NA 113.4 113.4
# 14 NA 119.4 119.4
# 15 NA 125.4 125.4
# 16 NA 131.4 131.4
# 17 NA 137.4 137.4
# 18 NA 143.4 143.4
# 19 NA 149.4 149.4
# 20 NA 155.4 155.4
# 21 NA 161.4 161.4
# 22 NA 167.4 167.4
# 23 NA 173.4 173.4
# 24 NA 179.4 179.4
# 25 NA 185.4 185.4
# ----------------------------------------------------------------
# dat$cus_key: B16
# fit.na pred fcst
# 1 49.0 49.0 49.0
# 2 47.7 47.7 47.7
# 3 46.4 46.4 46.4
# 4 45.1 45.1 45.1
# 5 43.8 43.8 43.8
# 6 NA 42.5 42.5
# 7 NA 41.2 41.2
# 8 NA 39.9 39.9
# 9 NA 38.6 38.6
# 10 NA 37.3 37.3
# 11 NA 36.0 36.0
# 12 NA 34.7 34.7
# 13 NA 33.4 33.4
# 14 NA 32.1 32.1
# 15 NA 30.8 30.8
# 16 NA 29.5 29.5
# 17 NA 28.2 28.2
# 18 NA 26.9 26.9
# 19 NA 25.6 25.6
# 20 NA 24.3 24.3
# 21 NA 23.0 23.0
# 22 NA 21.7 21.7
# 23 NA 20.4 20.4
# 24 NA 19.1 19.1
# 25 NA 17.8 17.8
# ----------------------------------------------------------------
# dat$cus_key: C12
# fit.na pred fcst
# 1 56.4 56.4 56.4
# 2 53.2 53.2 53.2
# 3 50.0 50.0 50.0
# 4 46.8 46.8 46.8
# 5 43.6 43.6 43.6
# 6 NA 40.4 40.4
# 7 NA 37.2 37.2
# 8 NA 34.0 34.0
# 9 NA 30.8 30.8
# 10 NA 27.6 27.6
# 11 NA 24.4 24.4
# 12 NA 21.2 21.2
# 13 NA 18.0 18.0
# 14 NA 14.8 14.8
# 15 NA 11.6 11.6
# 16 NA 8.4 8.4
# 17 NA 5.2 5.2
# 18 NA 2.0 2.0
# 19 NA -1.2 -1.2
# 20 NA -4.4 -4.4
# 21 NA -7.6 -7.6
# 22 NA -10.8 -10.8
# 23 NA -14.0 -14.0
# 24 NA -17.2 -17.2
# 25 NA -20.4 -20.4
As you can see, prediction and forecast yield the same values, since both methods are based on the same single explanatory variable date in this case.
Toy data:
set.seed(42)
dat <- transform(expand.grid(cus_key=paste0(LETTERS[1:3], sample(12:43, 3)),
date=seq.Date(as.Date("2018-05-13"), length.out=5, by="week")),
sales=sample(20:80, 15, replace=TRUE))
I'm working with a dataset of weather variables (temperature, precipitation, etc.) that has a few missing values. Because of my specific approach (summing these variables across several days), I need to address NA values in the dataset.
When there is a missing daily value, I'd like to fill that day with a mean value of the previous and following day. The assumption here is that weather values are similar from one day to the next. And yes, I realize this is a big assumption.
I've developed the following:
maxTemp <- c(13.2, 10.7, NA, 17.9, 6.6, 10, 13, NA, NA, 8.8, 9.9, 14.9, 16.3, NA, 18, 9.9, 11.5, 15.3, 21.7, 23.9, 26.6, 27, 22.3, NA, 17.9)
weather <- as.data.frame(maxTemp)
weather %>%
mutate(maxTempNA = if_else(is.na(maxTemp),
(lag(maxTemp) + lead(maxTemp))/2,
maxTemp))
However, in a few cases, I have two NA values on consecutive days, so this doesn't work. Any thoughts on approaches to code this so that when there are two (or more) NA's in a row, the average uses the 'bookending' values to fill the NAs?
The final result would look like this:
maxTemp <- c(13.2, 10.7, 14.3, 17.9, 6.6, 10, 13, 10.9, 10.9, 8.8, 9.9, 14.9, 16.3, 17.15, 18, 9.9, 11.5, 15.3, 21.7, 23.9, 26.6, 27, 22.3, 20.1, 17.9)
How about using approx to replace NAs with interpolated values; by default, approx uses linear interpolation, so this should match your manual replace-by-mean results.
weather %>%
mutate(maxTemp_interp = approx(1:n(), maxTemp, 1:n())$y)
# maxTemp maxTemp_interp
# 1 13.2 13.20
# 2 10.7 10.70
# 3 NA 14.30
# 4 17.9 17.90
# 5 6.6 6.60
# 6 10.0 10.00
# 7 13.0 13.00
# 8 NA 11.60
# 9 NA 10.20
# 10 8.8 8.80
# 11 9.9 9.90
# 12 14.9 14.90
# 13 16.3 16.30
# 14 NA 17.15
# 15 18.0 18.00
# 16 9.9 9.90
# 17 11.5 11.50
# 18 15.3 15.30
# 19 21.7 21.70
# 20 23.9 23.90
# 21 26.6 26.60
# 22 27.0 27.00
# 23 22.3 22.30
# 24 NA 20.10
# 25 17.9 17.90
I've created a new column here to make it easier to compare with the original data.
Update
Markus pointed out in the comments (thanks @markus) that to reproduce your expected output, you'd actually need method = "constant" with f = 0.5:
weather %>%
mutate(maxTemp_interp = approx(1:n(), maxTemp, 1:n(), method = "constant", f = 0.5)$y)
# maxTemp maxTemp_interp
# 1 13.2 13.20
# 2 10.7 10.70
# 3 NA 14.30
# 4 17.9 17.90
# 5 6.6 6.60
# 6 10.0 10.00
# 7 13.0 13.00
# 8 NA 10.90
# 9 NA 10.90
# 10 8.8 8.80
# 11 9.9 9.90
# 12 14.9 14.90
# 13 16.3 16.30
# 14 NA 17.15
# 15 18.0 18.00
# 16 9.9 9.90
# 17 11.5 11.50
# 18 15.3 15.30
# 19 21.7 21.70
# 20 23.9 23.90
# 21 26.6 26.60
# 22 27.0 27.00
# 23 22.3 22.30
# 24 NA 20.10
# 25 17.9 17.90
If you want to use the mean of the most recent non-NA value going backwards and forwards, you can use something like data.table::nafill() to fill values both down and up, and then take the mean:
# Carry the last known value forward and the next known value backward, then
# replace every missing temperature with the mean of those two neighbours.
weather[["prevTemp"]] <- data.table::nafill(weather[["maxTemp"]], type = "locf")
weather[["nextTemp"]] <- data.table::nafill(weather[["maxTemp"]], type = "nocb")
weather[["maxTemp"]] <- ifelse(
  is.na(weather[["maxTemp"]]),
  (weather[["prevTemp"]] + weather[["nextTemp"]]) / 2,
  weather[["maxTemp"]]
)
I have the following sample dataframe with prices of toys in different shops:
dfData <- data.frame(article = c("Fix", "Foxi", "Stan", "Olli", "Barbie", "Ken", "Hulk"),
priceToys1 = c(10, NA, 10.5, NA, 10.7, 11.2, 12.0),
priceAllToys = c(NA, 11.4, NA, 11.9, 11.7, 11.1, NA),
price123Toys = c(12, 12.4, 12.7, NA, NA, 11.0, 12.1))
Additionally I generate a min price column by adding:
# Row-wise minimum over all shop price columns, ignoring missing prices.
# pmin() works column by column, so the data frame is not coerced to a matrix
# as with apply(); a row without any price gives NA (not Inf plus a warning),
# and drop = FALSE keeps this working when only one price column matches.
dfData$MinPrice <- do.call(
  pmin,
  c(
    unname(as.list(dfData[, grep("price", colnames(dfData)), drop = FALSE])),
    list(na.rm = TRUE)
  )
)
So I have this dataframe now:
# article priceToys1 priceAllToys price123Toys MinPrice
#1 Fix 10.0 NA 12.0 10.0
#2 Foxi NA 11.4 12.4 11.4
#3 Stan 10.5 NA 12.7 10.5
#4 Olli NA 11.9 NA 11.9
#5 Barbie 10.7 11.7 NA 10.7
#6 Ken 11.2 11.1 11.0 11.0
#7 Hulk 12.0 NA 12.1 12.0
How do I get additional columns into the dataframe that tell me the factor of all prices relatively to the minimum price in percentage? The new column names should also include the shop name.
The result should look like this:
# article priceToys1 PercToys1 priceAllToys PercAllToys price123Toys Perc123Toys MinPrice
#1 Fix 10.0 100.0 NA NA 12.0 120.0 10.0
#2 Foxi NA NA 11.4 100.0 12.4 108.8 11.4
#3 Stan 10.5 100.0 NA NA 12.7 121.0 10.5
#4 Olli NA NA 11.9 100.0 NA NA 11.9
#5 Barbie 10.7 100.0 11.7 109.4 NA NA 10.7
#6 Ken 11.2 101.8 11.1 100.9 11.0 100.0 11.0
#7 Hulk 12.0 100.0 NA NA 12.1 100.8 12.0
Two possible solutions:
1) With the data.table-package:
# load the 'data.table'-package
library(data.table)
# get the columnnames on which to operate
cols <- names(dfData)[2:4] # or: grep("price", names(dfData), value = TRUE)
# convert dfData to a 'data.table'
setDT(dfData)
# compute the 'fraction'-columns
dfData[, paste0('Perc', gsub('price','',cols)) := lapply(.SD, function(x) round(100 * x / MinPrice, 1))
, .SDcols = cols][]
which gives:
article priceToys1 priceAllToys price123Toys MinPrice PercToys1 PercAllToys Perc123Toys
1: Fix 10.0 NA 12.0 10.0 100.0 NA 120.0
2: Foxi NA 11.4 12.4 11.4 NA 100.0 108.8
3: Stan 10.5 NA 12.7 10.5 100.0 NA 121.0
4: Olli NA 11.9 NA 11.9 NA 100.0 NA
5: Barbie 10.7 11.7 NA 10.7 100.0 109.3 NA
6: Ken 11.2 11.1 11.0 11.0 101.8 100.9 100.0
7: Hulk 12.0 NA 12.1 12.0 100.0 NA 100.8
2) With base R:
cols <- names(dfData)[2:4] # or: grep("price", names(dfData), value = TRUE)
dfData[, paste0('Perc', gsub('price','',cols))] <- round(100 * dfData[, cols] / dfData$MinPrice, 1)
which will get you the same result.
We can use mutate_at from dplyr
library(dplyr)
library(magrittr)
# Add one Perc<Shop> column per price<Shop> column: the price as a percentage
# of the row's minimum price, rounded to one decimal. across() replaces the
# superseded mutate_at()/deprecated funs(), and .names builds the requested
# column names (e.g. priceToys1 -> PercToys1).
dfData <- dfData %>%
  mutate(across(
    matches("^price"),
    ~ round(100 * .x / MinPrice, 1),
    .names = "Perc{sub('^price', '', .col)}"
  ))
dfData
Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 6 years ago.
Improve this question
I have got monthly data in this format
PrecipMM Date
122.7 2004-01-01
54.2 2005-01-01
31.9 2006-01-01
100.5 2007-01-01
144.9 2008-01-01
96.4 2009-01-01
75.3 2010-01-01
94.8 2011-01-01
67.6 2012-01-01
93.0 2013-01-01
184.6 2014-01-01
101.0 2015-01-01
149.3 2016-01-01
50.2 2004-02-01
46.2 2005-02-01
57.7 2006-02-01
I want to calculate all of the difference of precipMM in same month of different years.
My dream output is like this:
PrecipMM Date PrecipMM_diff
122.7 2004-01-01 NA
54.2 2005-01-01 -68.5
31.9 2006-01-01 -22.3
100.5 2007-01-01 68.6
144.9 2008-01-01 44.4
96.4 2009-01-01 -48.5
75.3 2010-01-01 -21.2
94.8 2011-01-01 19.5
67.6 2012-01-01 -27.2
93.0 2013-01-01 25.4
184.6 2014-01-01 91.6
101.0 2015-01-01 -83.6
149.3 2016-01-01 48.3
50.2 2004-02-01 NA
46.2 2005-02-01 -4.0
57.7 2006-02-01 11.5
I think diff() can do this but I have no idea how.
I think you can do this with lag combined with group_by from dplyr. Here's how:
library(dplyr)
library(lubridate) # makes dealing with dates easier
# Load your example data
df <- structure(list(PrecipMM = c(4.4, 66.7, 48.2, 60.9, 108.1, 109.2,
101.7, 38.1, 53.8, 71.9, 75.4, 67.1, 92.7, 115.3, 68.9, 38.9),
Date = structure(5:20, .Label = c("101.7", "108.1", "109.2",
"115.3", "1766-01-01", "1766-02-01", "1766-03-01", "1766-04-01",
"1766-05-01", "1766-06-01", "1766-07-01", "1766-08-01", "1766-09-01",
"1766-10-01", "1766-11-01", "1766-12-01", "1767-01-01", "1767-02-01",
"1767-03-01", "1767-04-01", "38.1", "38.9", "4.4", "48.2",
"53.8", "60.9", "66.7", "67.1", "68.9", "71.9", "75.4", "92.7"
), class = "factor")), class = "data.frame", row.names = c(NA,
-16L), .Names = c("PrecipMM", "Date"))
# Year-over-year change in precipitation for each calendar month:
# within every month, order the rows by year and subtract the previous
# year's value (the first year of each month has no predecessor -> NA).
results <- df %>%
  mutate(years = year(Date), months = month(Date)) %>%
  group_by(months) %>%
  # arrange() ignores grouping unless asked otherwise; .by_group = TRUE sorts
  # by month first and then by year, which is the ordering shown below.
  arrange(years, .by_group = TRUE) %>%
  mutate(lagged.rain = lag(PrecipMM), rain.diff = PrecipMM - lagged.rain)
results
# Source: local data frame [16 x 6]
# Groups: months [12]
#
# PrecipMM Date years months lagged.rain rain.diff
# (dbl) (fctr) (dbl) (dbl) (dbl) (dbl)
# 1 4.4 1766-01-01 1766 1 NA NA
# 2 92.7 1767-01-01 1767 1 4.4 88.3
# 3 66.7 1766-02-01 1766 2 NA NA
# 4 115.3 1767-02-01 1767 2 66.7 48.6
# 5 48.2 1766-03-01 1766 3 NA NA
# 6 68.9 1767-03-01 1767 3 48.2 20.7
# 7 60.9 1766-04-01 1766 4 NA NA
# 8 38.9 1767-04-01 1767 4 60.9 -22.0
# 9 108.1 1766-05-01 1766 5 NA NA
# 10 109.2 1766-06-01 1766 6 NA NA
# 11 101.7 1766-07-01 1766 7 NA NA
# 12 38.1 1766-08-01 1766 8 NA NA
# 13 53.8 1766-09-01 1766 9 NA NA
# 14 71.9 1766-10-01 1766 10 NA NA
# 15 75.4 1766-11-01 1766 11 NA NA
# 16 67.1 1766-12-01 1766 12 NA NA