Related
I have a large dataframe (t2m.all) with longitude (lon), latitude (lat) and hourly temperatures since 1958 (X1958.01.01.00.00.00). A subset of the data is given in the code below:
dput(t2m.all[1:300,1:3])
structure(list(Lon = c(-102, -101.9, -101.8, -101.7, -101.6,
-101.5, -101.4, -101.3, -101.2, -101.1, -101, -100.9, -100.8,
-100.7, -100.6, -100.5, -100.4, -100.3, -100.2, -100.1, -100,
-99.9, -99.8, -99.7, -99.6, -99.5, -99.4, -99.3, -99.2, -99.1,
-99, -98.9, -98.8, -98.7, -98.6, -98.5, -98.4, -98.3, -98.2,
-98.1, -98, -97.9, -97.8, -97.7, -97.6, -97.5, -97.4, -97.3,
-97.2, -97.1, -97, -96.9, -96.8, -96.7, -96.6, -96.5, -96.4,
-96.3, -96.2, -96.1, -96, -95.9, -95.8, -95.7, -95.6, -95.5,
-95.4, -95.3, -95.2, -95.1, -95, -102, -101.9, -101.8, -101.7,
-101.6, -101.5, -101.4, -101.3, -101.2, -101.1, -101, -100.9,
-100.8, -100.7, -100.6, -100.5, -100.4, -100.3, -100.2, -100.1,
-100, -99.9, -99.8, -99.7, -99.6, -99.5, -99.4, -99.3, -99.2,
-99.1, -99, -98.9, -98.8, -98.7, -98.6, -98.5, -98.4, -98.3,
-98.2, -98.1, -98, -97.9, -97.8, -97.7, -97.6, -97.5, -97.4,
-97.3, -97.2, -97.1, -97, -96.9, -96.8, -96.7, -96.6, -96.5,
-96.4, -96.3, -96.2, -96.1, -96, -95.9, -95.8, -95.7, -95.6,
-95.5, -95.4, -95.3, -95.2, -95.1, -95, -102, -101.9, -101.8,
-101.7, -101.6, -101.5, -101.4, -101.3, -101.2, -101.1, -101,
-100.9, -100.8, -100.7, -100.6, -100.5, -100.4, -100.3, -100.2,
-100.1, -100, -99.9, -99.8, -99.7, -99.6, -99.5, -99.4, -99.3,
-99.2, -99.1, -99, -98.9, -98.8, -98.7, -98.6, -98.5, -98.4,
-98.3, -98.2, -98.1, -98, -97.9, -97.8, -97.7, -97.6, -97.5,
-97.4, -97.3, -97.2, -97.1, -97, -96.9, -96.8, -96.7, -96.6,
-96.5, -96.4, -96.3, -96.2, -96.1, -96, -95.9, -95.8, -95.7,
-95.6, -95.5, -95.4, -95.3, -95.2, -95.1, -95, -94.9, -102, -101.9,
-101.8, -101.7, -101.6, -101.5, -101.4, -101.3, -101.2, -101.1,
-101, -100.9, -100.8, -100.7, -100.6, -100.5, -100.4, -100.3,
-100.2, -100.1, -100, -99.9, -99.8, -99.7, -99.6, -99.5, -99.4,
-99.3, -99.2, -99.1, -99, -98.9, -98.8, -98.7, -98.6, -98.5,
-98.4, -98.3, -98.2, -98.1, -98, -97.9, -97.8, -97.7, -97.6,
-97.5, -97.4, -97.3, -97.2, -97.1, -97, -96.9, -96.8, -96.7,
-96.6, -96.5, -96.4, -96.3, -96.2, -96.1, -96, -95.9, -95.8,
-95.7, -95.6, -95.5, -95.4, -95.3, -95.2, -95.1, -95, -94.9,
-102, -101.9, -101.8, -101.7, -101.6, -101.5, -101.4, -101.3,
-101.2, -101.1, -101, -100.9, -100.8, -100.7), Lat = c(60, 60,
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60, 60,
60, 60, 60, 60, 60, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9,
59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.9, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8, 59.8,
59.8, 59.8, 59.8, 59.8, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7,
59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.7, 59.6,
59.6, 59.6, 59.6, 59.6, 59.6, 59.6, 59.6, 59.6, 59.6, 59.6, 59.6,
59.6, 59.6), X1958.01.01.00.00.00 = c(-37.2, -37.2, -36.6, -35.9,
-36, -36.1, -35.8, -35.5, -35.1, -34.7, -34.2, -33.6, -33.6,
-34, -33.8, -33.2, -32.8, -32.6, -32.5, -32.5, -32.4, -32.3,
-32.3, -32.7, -33.1, -33.2, -33.3, -33.3, -33.3, -33.1, -32.9,
-32.6, -32.4, -32.1, -31.8, -31.5, -31.2, -30.9, -30.6, -30.3,
-30, -29.7, -29.4, -29.1, -28.8, -28.5, -28.2, -27.9, -27.7,
-27.4, -27.1, -26.8, -26.6, -26.3, -26.1, -25.9, -25.6, -25.4,
-25.2, -24.9, -24.7, -24.4, -24.1, -23.8, -23.6, -23.3, -23,
-22.7, -22.4, -22.1, -21.9, -36.6, -37.1, -36.6, -36.1, -36,
-36.1, -35.7, -35, -34.6, -34.8, -34.7, -34, -33.8, -33.7, -33.6,
-33.2, -32.4, -32.3, -32.3, -32.3, -32.3, -32.8, -32.9, -32.7,
-33, -33.1, -33.1, -33.1, -33.1, -33, -32.8, -32.5, -32.3, -32.1,
-31.8, -31.5, -31.2, -30.9, -30.6, -30.3, -30.1, -29.8, -29.4,
-29.1, -28.8, -28.5, -28.3, -28, -27.7, -27.4, -27.1, -26.9,
-26.6, -26.4, -26.1, -25.9, -25.6, -25.4, -25.2, -24.9, -24.7,
-24.4, -24.1, -23.9, -23.6, -23.3, -23, -22.8, -22.5, -22.2,
-21.9, -36.7, -36.8, -36.8, -36.4, -36.2, -36, -35.9, -35.2,
-34.6, -34.6, -34.4, -34.2, -34.2, -33.8, -33.3, -32.7, -32.3,
-32.2, -32.3, -32.6, -33.3, -33.5, -33.2, -33.1, -32.5, -32.2,
-32.7, -33, -32.9, -32.8, -32.6, -32.5, -32.2, -32, -31.7, -31.5,
-31.2, -30.9, -30.6, -30.4, -30.1, -29.8, -29.5, -29.2, -28.9,
-28.6, -28.3, -28, -27.7, -27.4, -27.2, -26.9, -26.7, -26.4,
-26.2, -25.9, -25.7, -25.4, -25.2, -24.9, -24.7, -24.4, -24.2,
-23.9, -23.6, -23.4, -23.1, -22.8, -22.5, -22.3, -22, -21.9,
-36.6, -36.6, -36.5, -36.3, -36.2, -35.7, -35.1, -34.5, -34.1,
-34.5, -34.7, -34, -33.3, -33, -32.6, -32.6, -32.8, -33, -33.3,
-33.2, -32.9, -32.9, -33, -32.9, -33, -33.1, -33, -33, -32.9,
-32.8, -32.6, -32.4, -32.2, -32, -31.7, -31.4, -31.2, -31, -30.7,
-30.4, -30.1, -29.8, -29.4, -29.1, -28.8, -28.5, -28.3, -28,
-27.8, -27.5, -27.2, -26.9, -26.7, -26.4, -26.1, -25.9, -25.6,
-25.4, -25.2, -24.9, -24.7, -24.4, -24.2, -23.9, -23.6, -23.4,
-23.1, -22.9, -22.6, -22.3, -22, -21.9, -36.4, -36.3, -36, -35.7,
-35.3, -34.5, -34.3, -34.5, -34.5, -34.3, -33.9, -33.5, -33.2,
-32.9)), row.names = c(NA, 300L), class = "data.frame")
I wish to subset t2m.all to isolate a single row returning the temperature value for a specified latitude and longitude pairing. This works for some coordinates but strangely not for others. For example, the below code works for these coordinates:
# Exact equality test on the double-valued Lon and Lat columns.
res = subset(t2m.all, Lon == -100.7 & Lat == 59.8)
but does not work for these coordinates:
# Same exact-equality filter with a different latitude; reported not to work.
res = subset(t2m.all, Lon == -100.7 & Lat == 59.6)
Both sets of coordinates are clearly in the t2m.all dataframe, so why can R subset for some sets of coordinates but not for others? The logic seems sound to me, so I'm not sure what can be wrong. I have spent a long time going through all possibilities and troubleshooting on Stack Overflow, but I have yet to find a solution. Any suggestions? Many thanks in advance.
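The issue results from round-off errors: most decimal fractions (such as 59.6) cannot be represented exactly in binary floating point, so a stored value can differ from the literal you type by a tiny amount, and == then returns FALSE even though both print identically. You could set a tolerance value when comparing Lon & Lat with a certain value. In base R you could use abs(x - y) < 1e-5 to achieve it: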
# Keep the rows whose coordinates lie within 1e-5 of the requested pair,
# rather than requiring exact equality.
subset(
  t2m.all,
  abs(Lon - (-100.7)) < 1e-5 & abs(Lat - 59.6) < 1e-5
)
# Lon Lat X1958.01.01.00.00.00
# 1 -100.7 59.6 -32.9
The dplyr equivalent is near():
library(dplyr)
# near() compares doubles with a tolerance; conditions passed to filter() as
# separate arguments are combined with AND.
filter(t2m.all, near(Lon, -100.7), near(Lat, 59.6))
# Lon Lat X1958.01.01.00.00.00
# 1 -100.7 59.6 -32.9
I have been trying to solve this for days, so any help would be appreciated!
I am trying to make an interaction plot for an OLS Regression.
This is the code I am using:
# The formula spells every term as an ele$... vector even though a data
# argument is also supplied.
interact <- lm(ele$vt_c ~ ele$Immigrants:ele$X.qual, data = as.data.frame(ele))
# Plot call reported to raise the error and warning quoted below.
interact_plot(model = interact, pred=Immigrants, modx =X.qual, modx.values = NULL, data = ele)
This is the error that is coming up
Error in ecdf(d[[modx]]) : 'x' must have 1 or more non-missing values
In addition: Warning message:
immigrants and X.qual are not included in an interaction with one another in the model.
Reproducible data
# Install 'interactions' only when it is missing, then attach it.
# requireNamespace() checks for this one package directly instead of scanning
# every cell of the matrix returned by installed.packages().
if (!requireNamespace("interactions", quietly = TRUE)) {
  install.packages("interactions")
}
library(interactions)
ele = structure(list(vt_c = c(68.37056, 67.55938, 69.25354, 67.54727,
67.39343, 67.81161, 65.81312, 64.68675, 70.8572, 72.1439, 67.39006,
64.89897, 62.81833, 63.82975, 58.99062, 67.69617, 68.17096, 65.24267,
67.08106, 66.47592, 68.40781, 70.40636, 69.50657, 72.37613, 70.24236,
67.50159, 71.77177, 67.09047, 74.58491, 70.64892, 65.20199, 70.03566,
70.23142, 71.62487, 66.87982, 70.72528, 66.97507, 69.38713, 67.20061,
68.79907, 67.05735, 67.38101, 66.10595, 60.97635, 61.9047, 61.28828,
72.11577, 63.04311, 71.04747, 77.16823, 63.77144, 72.5249, 69.10145,
74.61647, 55.0847, 70.97664, 73.40273, 72.02715, 69.28485, 68.66256,
77.92079, 69.78192, 71.32363, 79.13777, 76.21347, 72.96919, 71.95923,
70.94545, 64.8141, 55.98621, 74.19439, 72.70276, 68.77999, 63.09397,
61.72898), Immigrants = c(57.3, 55.1, 50.6, 45.7, 42.8, 51.7,
51.2, 50.9, 44.9, 44.5, 44.3, 42.7, 50.5, 50.5, 39.2, 50.6, 39.7,
38.9, 39.2, 41.8, 42.5, 43.1, 39.5, 41.1, 44.2, 38.6, 41.8, 40.1,
43.8, 41.9, 38.2, 38.9, 37.5, 40.8, 33.2, 41.6, 38.1, 30, 38.8,
34.4, 36.5, 32.1, 41.3, 30.6, 32.9, 27.8, 35.4, 28.7, 37.1, 33.3,
29.8, 29.8, 33.8, 32.8, 28.8, 32.6, 31.6, 30.7, 28.6, 30.9, 34.7,
24.6, 24.7, 28.4, 26, 26.2, 27.4, 26.1, 22.6, 24.7, 32.4, 22.9,
26.4, 22.2, 22.1), X.qual = c(32.9, 29.8, 30.8, 32.5, 18.3, 47.3,
30.5, 29.8, 32.7, 38.5, 42.5, 25.8, 54.5, 52.2, 24.9, 29.3, 30.5,
23, 37.6, 22.3, 35.2, 54, 39.6, 42.8, 30.4, 41.5, 47.5, 44.5,
48.4, 31.3, 25.9, 28.2, 41.6, 46.5, 24.8, 36.3, 45.2, 27, 48.7,
40, 42.1, 19.7, 53.7, 26, 21.8, 12.1, 51.6, 19.2, 46.6, 54.4,
24.9, 30.1, 47.4, 51.4, 29.7, 57.4, 48.8, 47.6, 34.3, 22.8, 52,
21.8, 29.6, 55.2, 38.6, 37.4, 39.3, 25.9, 15.7, 19.8, 38.2, 39.3,
37.7, 18.3, 32.6)), class = "data.frame", row.names = c(NA, -75L
))
# Fit vt_c on the Immigrants-by-X.qual interaction term, with columns taken
# from ele.
interact <- lm(vt_c ~ Immigrants:X.qual,
data = ele)
# NOTE(review): pred is spelled `immigrants` here, while the column in ele and
# the model term are `Immigrants` -- presumably what triggers the quoted
# warning; confirm.
interact_plot(model = interact, pred=immigrants,
modx =X.qual, data = ele)
Thank you!
Welcome to SO, Lucia Thomas!
I read this message and it sounded so much more thorough than what I usually write about reproducible questions:
Please make this question reproducible. This includes sample code you've attempted (including listing non-base R packages, and any errors/warnings received), sample unambiguous data (e.g., data.frame(x=...,y=...) or the output from dput(head(x))), and intended output given that input. Refs: stackoverflow.com/q/5963269, minimal reproducible example, and stackoverflow.com/tags/r/info.
That being said, I think I can help. Right now you have called each variable as a vector and called a data frame in your call to lm(). This has led to an incompatibility issue between these two functions.
ele = structure(list(vt_c = c(68.37056, 67.55938, 69.25354, 67.54727,
67.39343, 67.81161, 65.81312, 64.68675, 70.8572, 72.1439, 67.39006,
64.89897, 62.81833, 63.82975, 58.99062, 67.69617, 68.17096, 65.24267,
67.08106, 66.47592, 68.40781, 70.40636, 69.50657, 72.37613, 70.24236,
67.50159, 71.77177, 67.09047, 74.58491, 70.64892, 65.20199, 70.03566,
70.23142, 71.62487, 66.87982, 70.72528, 66.97507, 69.38713, 67.20061,
68.79907, 67.05735, 67.38101, 66.10595, 60.97635, 61.9047, 61.28828,
72.11577, 63.04311, 71.04747, 77.16823, 63.77144, 72.5249, 69.10145,
74.61647, 55.0847, 70.97664, 73.40273, 72.02715, 69.28485, 68.66256,
77.92079, 69.78192, 71.32363, 79.13777, 76.21347, 72.96919, 71.95923,
70.94545, 64.8141, 55.98621, 74.19439, 72.70276, 68.77999, 63.09397,
61.72898), immigrants = c(57.3, 55.1, 50.6, 45.7, 42.8, 51.7,
51.2, 50.9, 44.9, 44.5, 44.3, 42.7, 50.5, 50.5, 39.2, 50.6, 39.7,
38.9, 39.2, 41.8, 42.5, 43.1, 39.5, 41.1, 44.2, 38.6, 41.8, 40.1,
43.8, 41.9, 38.2, 38.9, 37.5, 40.8, 33.2, 41.6, 38.1, 30, 38.8,
34.4, 36.5, 32.1, 41.3, 30.6, 32.9, 27.8, 35.4, 28.7, 37.1, 33.3,
29.8, 29.8, 33.8, 32.8, 28.8, 32.6, 31.6, 30.7, 28.6, 30.9, 34.7,
24.6, 24.7, 28.4, 26, 26.2, 27.4, 26.1, 22.6, 24.7, 32.4, 22.9,
26.4, 22.2, 22.1), X.qual = c(32.9, 29.8, 30.8, 32.5, 18.3, 47.3,
30.5, 29.8, 32.7, 38.5, 42.5, 25.8, 54.5, 52.2, 24.9, 29.3, 30.5,
23, 37.6, 22.3, 35.2, 54, 39.6, 42.8, 30.4, 41.5, 47.5, 44.5,
48.4, 31.3, 25.9, 28.2, 41.6, 46.5, 24.8, 36.3, 45.2, 27, 48.7,
40, 42.1, 19.7, 53.7, 26, 21.8, 12.1, 51.6, 19.2, 46.6, 54.4,
24.9, 30.1, 47.4, 51.4, 29.7, 57.4, 48.8, 47.6, 34.3, 22.8, 52,
21.8, 29.6, 55.2, 38.6, 37.4, 39.3, 25.9, 15.7, 19.8, 38.2, 39.3,
37.7, 18.3, 32.6)), class = "data.frame", row.names = c(NA, -75L
))
Since you called the data frame, call the names of the columns, without the data frame appended:
# Refer to the columns by bare name and let `data = ele` supply them.
interact <- lm(formula = vt_c ~ immigrants:X.qual, data = ele)

# Plot the fitted interaction using the same bare column names.
interact_plot(
  model = interact,
  pred = immigrants,
  modx = X.qual,
  data = ele
)
I have a data frame with 4000 columns and daily observations sorted by time. I want to create new columns that lag all existing columns 50 times in the past. So for a column Y create 50 additional columns that are Y-1day,Y-2days,Y-3days...Y-50days.
So far I've written the following loop, which does what I need.
The issue is that it's not very fast. Is there a more efficient way I can test?
# For every column after the first, append 50 shifted copies (slideBy = -1
# through -50). The column range is fixed before the loop starts, so the
# columns added along the way are not themselves shifted.
for (col_idx in 2:ncol(Data)) {
  col_name <- names(Data)[col_idx]
  for (n_back in seq_len(50)) {
    Data <- slide(Data, Var = col_name, slideBy = -n_back)
  }
}
I'm attaching a snapshot of my data frame for reproducible example:
structure(list(time = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
93, 94, 95, 96, 97, 98, 99, 100), A = c(17.081545, 16.630901,
16.623749, 16.258942, 16.244635, 16.165951, 15.886981, 15.865522,
15.529327, 15.772532, 16.04435, 15.779685, 15.915594, 15.593705,
15.336195, 15.593705, 15.736767, 15.736767, 15.457797, 15.815451,
16.108727, 16.237482, 15.808297, 16.058655, 16.53791, 16.988556,
16.516453, 16.480686, 16.967096, 17.181688, 17.446352, 17.11731,
16.952789, 16.8598, 16.795422, 16.437769, 16.587982, 16.845493,
17.167381, 17.510729, 17.410587, 17.474964, 17.246065, 17.703863,
17.424892, 17.174536, 17.103004, 16.695278, 16.93133, 16.638054,
16.115879, 16.20887, 15.987124, 16.151646, 16.151646, 16.115879,
16.173105, 16.101574, 16.080114, 15.9299, 15.879828, 15.786839,
15.314735, 15.27897, 15.493563, 15.436337, 15.286123, 15.121602,
15.27897, 14.88555, 14.785408, 14.592275, 14.785408, 14.856938,
14.670959, 15.243204, 15.09299, 15.250358, 15.264664, 15.18598,
14.771102, 14.842632, 15, 15.150214, 15.200286, 15.078684, 15.379113,
15.658083, 15.636623, 15.879828, 15.715307, 15.729613, 15.422031,
16.080114, 16.39485, 16.502146, 16.74535, 16.902718, 17.088697,
16.831188), AAP = c(29.033333, 28.84, 28.893333, 28.866667, 28.700001,
28.799999, 28.973333, 28.866667, 28.806667, 28.973333, 29.713333,
29.033333, 28.626667, 28.546667, 28.173334, 28.166666, 28.24,
28.553333, 28.366667, 28.733334, 28.833334, 28.9, 29.166666,
29.846666, 30.08, 30.093334, 29.673334, 29.860001, 30.053333,
30.186666, 29.833334, 29.673334, 34.533333, 33.82, 33.373333,
33.633335, 33.593334, 33.833332, 33.586666, 33.946667, 34.66,
34.599998, 34.84, 34.779999, 34.093334, 33.713333, 33.560001,
33.933334, 33.086666, 33.139999, 33.279999, 33.200001, 33.259998,
32.466667, 32.713333, 32.686668, 33.053333, 33.806667, 33.333332,
33.613335, 33.633335, 33.799999, 34.206665, 34.5, 34.166668,
34.206665, 33.933334, 34, 34.373333, 33.700001, 33.173332, 32.633335,
32.639999, 34.013332, 33.566666, 34.053333, 34.053333, 34.826668,
35.106667, 35.68, 35.653332, 35.566666, 35.380001, 35.419998,
35.966667, 36.573334, 36.673332, 36.486668, 36.286667, 36.099998,
35.433334, 35.419998, 35.84, 36.533333, 36.779999, 38.98, 39.633335,
39.646667, 39.486668, 39.433334), AAPL = c(4.520714, 4.567143,
4.607143, 4.610714, 4.946429, 4.925714, 4.611429, 4.675714, 4.985714,
5.014286, 5.046429, 4.991428, 5.032857, 5.035, 5.054286, 5.146429,
5.160714, 5.188571, 5.284286, 5.492857, 5.537857, 5.687857, 5.557857,
5.631429, 5.638571, 5.778572, 5.624286, 5.597143, 5.800714, 6.045,
6.315, 6.437857, 6.272143, 6.200714, 6.092143, 6.302143, 6.352143,
6.356429, 6.408571, 6.357143, 6.302857, 5.97, 6.115714, 6.107143,
5.79, 5.621428, 5.69, 5.752857, 5.76, 5.851429, 5.882857, 6.035714,
6.137143, 6.242857, 6.118571, 6.078571, 6.071429, 6.075714, 5.964286,
6.114286, 5.952857, 5.841429, 5.87, 5.984286, 6.047143, 6.222857,
6.248571, 5.988572, 6.094285, 5.862857, 5.322857, 5.05, 5.088572,
5.298572, 5.072857, 5.311429, 5.071429, 5.282857, 5.17, 5.135714,
5.077143, 5.151429, 5.204286, 5.172857, 5.307143, 5.24, 5.32,
5.281428, 5.202857, 5.087143, 4.875714, 4.967143, 5.078571, 5.051429,
5.12, 5.364286, 5.364286, 5.68, 5.671429, 5.682857), ABC = c(14.5375,
14.4225, 14.395, 14.5175, 14.475, 14.475, 14.51, 14.515, 14.275,
14.3175, 14.4875, 14.375, 14.5025, 14.2525, 14.3925, 14.13, 14.47,
14.365, 14.5925, 14.57, 14.74, 14.71, 14.995, 14.9, 14.8625,
15.0325, 14.78, 14.875, 15.085, 15.0525, 15.4275, 15.3075, 14.9225,
15, 14.7025, 14.7975, 15, 15, 14.975, 15.3775, 15.435, 15.5325,
15.6625, 15.6575, 15.695, 15.1275, 15.1025, 15.0775, 15.265,
15.0325, 14.905, 15.1975, 15.215, 15.2025, 15.1025, 15.3775,
15.2775, 13.5075, 13.5275, 13.95, 14.3225, 14.09, 14.4275, 14.735,
14.6475, 14.8, 14.4575, 14.62, 14.7525, 14.7, 14.9, 15.125, 14.83,
14.9525, 14.825, 14.9625, 15, 14.975, 14.9675, 15.0975, 15.0875,
15.32, 15.5125, 15.38, 15.51, 15.575, 15.7475, 15.9975, 15.9175,
15.895, 15.955, 15.98, 16.209999, 16.459999, 16.5725, 16.514999,
16.4925, 16.5, 16.495001, 16.4825), ABMD = c(15.01, 14.98, 14.69,
14.52, 14.29, 14.42, 14.31, 14.17, 12.45, 12.05, 11.87, 11.97,
11.41, 11.16, 11.06, 11.2, 11.1, 11.57, 11.43, 11.88, 11.58,
11.12, 11.16, 11.32, 10.97, 10.88, 10.72, 10.3, 10.75, 10.25,
10.29, 10.41, 10.02, 10.05, 10.08, 10, 10.24, 10.89, 10.7, 10.8,
10.66, 10.71, 11.12, 11.18, 11.2, 10.95, 11.07, 11.12, 11.3,
11.19, 10.83, 10.56, 10.37, 10.47, 10.33, 10.17, 10.51, 10.4,
10.56, 10.74, 10.58, 10.6, 10.57, 10.71, 11.23, 11.28, 11.51,
11.15, 10.98, 10.98, 11.05, 10.76, 10.96, 11.1, 10.62, 11.1,
10.53, 10.69, 10.65, 10.73, 10.15, 10.15, 9.52, 9.6, 9.6, 9.52,
9.47, 9.44, 9.35, 9.27, 9.13, 8.92, 9.26, 9.45, 9.97, 10.25,
10.28, 9.99, 10.16, 10.17), ABT = c(22.392265, 22.166759, 21.912466,
22.40666, 22.790501, 23.011208, 22.588984, 22.517014, 22.085194,
22.19075, 22.089993, 22.09479, 21.95085, 22.061205, 22.037214,
22.027618, 22.018023, 21.811708, 21.720547, 21.600595, 21.854891,
21.898071, 21.907667, 21.840496, 21.874083, 21.725344, 21.667768,
21.581404, 22.166759, 22.305902, 22.488226, 22.469034, 22.339487,
22.26272, 21.802113, 21.946053, 22.243528, 22.200346, 22.066002,
22.051607, 22.099588, 22.075598, 22.267517, 22.382669, 22.310699,
22.02282, 22.209942, 22.070801, 22.128376, 21.907667, 21.792517,
21.365494, 21.336706, 21.048826, 20.996048, 21.39908, 21.562212,
21.677364, 21.95085, 22.430651, 22.368277, 22.161963, 22.157164,
22.646561, 22.843279, 23.19833, 22.963228, 22.91045, 22.98242,
23.049591, 23.169542, 23.927626, 23.500605, 23.111965, 22.69454,
23.078381, 22.824085, 22.920046, 23.001612, 23.255905, 23.073582,
23.586967, 23.692524, 23.634949, 23.850859, 23.601362, 23.519796,
23.543785, 23.438231, 23.634949, 23.567776, 23.395048, 23.735706,
23.706919, 23.678129, 23.529392, 23.452623, 23.366261, 23.351866,
23.145552), ACN = c(26.370001, 25.75, 25.65, 25.42, 26.610001,
26.959999, 26.5, 26.389999, 26.18, 26.290001, 26.1, 26, 25.67,
25.16, 24.9, 25.200001, 25.4, 25.68, 25.6, 26.049999, 25.99,
25.83, 25.48, 25.73, 25.77, 25.85, 25.51, 25.42, 25.200001, 24.639999,
24.9, 25.049999, 24.51, 24.9, 24.799999, 24.709999, 24.48, 25.15,
25.549999, 25.59, 25.42, 25.110001, 25.370001, 25.49, 25.32,
25.17, 24.950001, 24.459999, 24.48, 23.98, 24.030001, 23.950001,
23.66, 24.01, 24.280001, 24.299999, 24.4, 24.57, 24.16, 24.559999,
24.15, 24.440001, 24.35, 24.860001, 24.969999, 24.889999, 23.700001,
23.34, 23.440001, 23.120001, 22.860001, 22.5, 22.57, 22.440001,
21.9, 21.959999, 21.75, 21.85, 21.549999, 21.469999, 21.620001,
21.700001, 21.969999, 22.1, 22.1, 21.82, 22, 22.08, 21.860001,
21.92, 21.99, 22.049999, 22.01, 22.049999, 22.5, 22.790001, 22.719999,
22.76, 22.67, 22.34), ADBE = c(30.844999, 30.030001, 29.865,
29.370001, 29.389999, 29.41, 29.059999, 29.49, 29.110001, 29.115,
29.190001, 28.940001, 29.035, 28.535, 27.695, 27.790001, 28.004999,
28.084999, 27.74, 28.450001, 28.950001, 31.145, 31.709999, 31.995001,
31.76, 31.85, 31.295, 31.34, 31.85, 31.735001, 32.455002, 32.299999,
31.535, 31.415001, 30.754999, 30.875, 30.695, 30.715, 30.875,
31.17, 31.174999, 31.174999, 31.885, 32.535, 32.474998, 32.255001,
32.654999, 32.209999, 32.669998, 32.27, 31.594999, 31.945, 33.904999,
33.349998, 33.18, 33.134998, 33.27, 33.555, 33.110001, 33.865002,
33.584999, 33.380001, 33.290001, 33.424999, 34.049999, 34.195,
33.630001, 33.400002, 33.450001, 32.535, 31.74, 30.33, 27.385,
29.049999, 28.625, 29.77, 30.145, 30.02, 29.559999, 29.225, 29.235001,
29.735001, 28.575001, 28.645, 28.775, 28.459999, 28.85, 29.334999,
28.76, 28.965, 28.889999, 29.049999, 29.955, 29.889999, 30.549999,
31.059999, 31.115, 31.360001, 32.419998, 32.759998), ADI = c(36.389999,
35.400002, 35.560001, 35.5, 35.549999, 35.41, 35.080002, 35.560001,
35.099998, 35.639999, 36.07, 35.139999, 34.650002, 34.470001,
34.049999, 34.299999, 34.880001, 34.830002, 34.740002, 35.889999,
35.990002, 36.009998, 35.240002, 37.52, 37.52, 38.02, 37.18,
36.830002, 38.049999, 37.599998, 37.32, 37.130001, 36.700001,
36.299999, 36.5, 36.59, 37.32, 37.5, 36.720001, 38, 37.709999,
36.93, 37.119999, 37.049999, 36.950001, 36.919998, 37.849998,
37.130001, 37.209999, 36.57, 35.919998, 36.02, 35.830002, 35.709999,
35.830002, 36.23, 35.799999, 35.66, 35.119999, 36.330002, 36.139999,
35.709999, 35.599998, 35.310001, 35.41, 36.09, 35.669998, 35.34,
34.93, 34.099998, 33.650002, 32.84, 33.360001, 33.849998, 33.419998,
34.349998, 33.799999, 33.700001, 33.52, 33.360001, 33.52, 34.110001,
33.849998, 33.669998, 34.560001, 34.619999, 34.619999, 34.549999,
34.130001, 34.060001, 34.310001, 35.490002, 36.419998, 36.700001,
36.860001, 36.889999, 37.080002, 36.529999, 36.849998, 36.290001
)), row.names = c(NA, 100L), class = "data.frame")
We can use shift from data.table which can take a vector of values for n
library(data.table)

setDT(Data)

lag_days <- 1:50
value_cols <- names(Data)[-1]

# shift() accepts a vector of lags, so a single call builds every shifted
# copy of every column except the first.
out <- Data[, shift(.SD, n = lag_days), .SDcols = value_cols]

# Name each result <column>_<k>days, in the same column-major order.
names(out) <- paste0(
  rep(value_cols, each = length(lag_days)), "_", lag_days, "days"
)

# Add the new columns to Data and print the result.
Data[, (names(out)) := out][]
I am trying to produce a 6-month forecast of production data for three power plants. I built my data as an hts object that has 3 levels. However, when I run the forecast function and then try to assess the accuracy using test data, I get the following error: "Error in x - fcasts: non-conformable arrays"
Furthermore, when I try to apply the "arima" as a forecasting method on the hts object I get the following (the warning message is repeated 9 times, as I have 9 time series in the hts object):
# Forecast 6 periods ahead with method "bu" and ARIMA as the base method.
forecasts <- forecast(data, h = 6, method = "bu", fmethod = "arima")
I used the following instructions to get the hts object:
and the data has the following structure:
I am not sure where I am going wrong. Can anyone help with some thoughts?
Thank you!
The data:
structure(list(LarGroup1 = c(188.3, 187.2, 94.7, 109.2, 202.7,
146.6, 121.9, 151.3, 111.1, 103.4, 188.1, 168.1, 233.9, 230.7,
187.1, 0, 98.9, 173.5, 149.4, 168.6, 4.7, 14.8, 91.8, 166.5,
170.5, 123.6, 85.2, 64.4), LarGroup2 = c(159.1, 127.7, 210.3,
199.8, 113, 143.4, 144.5, 83.8, 41.6, 35.1, 95.2, 178.2, 241.1,
236.4, 181.9, 194.3, 196.1, 92.4, 154.6, 78.9, 35.7, 0, 74.5,
75.1, 140, 142.5, 3.8, 17.5), RibGroup1 = c(49.4, 102.4, 50.8,
118.8, 108.4, 139.5, 121.7, 69.6, 53.4, 28, 113.3, 96.3, 70.8,
124.4, 54.4, 128.7, 63.3, 2.1, 41.3, 0.4, 0.6, 0, 5.4, 57.9,
9.9, 30, 221, 167.2), RibGroup2 = c(32.7, 32, 98.1, 6.3, 85.5,
96.6, 41.1, 44.9, 50.4, 27.3, 0, 45.4, 199.1, 179.2, 86.1, 0,
58.4, 43.3, 41.8, 42.1, 22.1, 11.8, 71.8, 112, 204.1, 40.9, 24.5,
210.9), RibGroup3 = c(90.8, 15.4, 10.5, 124.4, 33.9, 8.4, 38.3,
56.9, 13.5, 0, 32.6, 132.8, 160.7, 168.7, 60.7, 131.9, 110.8,
29.2, 131.3, 62.1, 6.1, 0, 0, 3.4, 23.9, 192.7, 165.5, 0), SinGroup1 = c(235.2,
225.4, 226.1, 234.4, 222.1, 232.3, 233.4, 201.9, 195.3, 209.4,
233.6, 223.6, 222.2, 232, 224, 149.8, 201.6, 220.2, 203.1, 212.1,
71.9, 82.3, 183.2, 210.6, 198.6, 230.8, 218, 163.2), SinGroup2 = c(233.4,
225.6, 227, 51.6, 76, 230.7, 233.1, 202.7, 200.2, 207.2, 228.4,
226.2, 183.9, 230.4, 222.3, 227.7, 177.9, 152, 218.6, 210.6,
80.9, 63.2, 188.1, 209.5, 233.2, 210.1, 226.5, 200.5), SinGroup3 = c(233.2,
188.5, 226.9, 234.7, 222.8, 234.6, 220.6, 156.4, 209.2, 218.7,
232.9, 226.1, 215.4, 231, 222.7, 222.7, 183.7, 203.8, 216.8,
112, 0, 39.6, 180.8, 203.6, 221.1, 228.9, 202.8, 186.7), SinGroup4 = c(218,
215.5, 226.8, 235.6, 223.6, 234.8, 234.9, 69.3, 192, 207.8, 235.2,
217.2, 235.1, 231.8, 223.5, 230.5, 225.6, 220.1, 220, 211.9,
114.8, 44.5, 158.5, 206.3, 231.8, 179, 225.3, 198.6)), class = "data.frame", row.names = c(NA,
-28L))
In the accuracy function, you need to include test data, not training data. You ask for 6 steps ahead, but your test data only consists of 4 time periods.
The seasonal differencing error suggests you are using an old version of the forecast package. Please update your packages.
The following code works using the current CRAN versions of the forecast (v8.4) and hts packages:
library(hts)
Production_data <- data.frame(
LarGroup1 = c(
188.3, 187.2, 94.7, 109.2, 202.7,
146.6, 121.9, 151.3, 111.1, 103.4, 188.1, 168.1, 233.9, 230.7,
187.1, 0, 98.9, 173.5, 149.4, 168.6, 4.7, 14.8, 91.8, 166.5,
170.5, 123.6, 85.2, 64.4
), LarGroup2 = c(
159.1, 127.7, 210.3,
199.8, 113, 143.4, 144.5, 83.8, 41.6, 35.1, 95.2, 178.2, 241.1,
236.4, 181.9, 194.3, 196.1, 92.4, 154.6, 78.9, 35.7, 0, 74.5,
75.1, 140, 142.5, 3.8, 17.5
), RibGroup1 = c(
49.4, 102.4, 50.8,
118.8, 108.4, 139.5, 121.7, 69.6, 53.4, 28, 113.3, 96.3, 70.8,
124.4, 54.4, 128.7, 63.3, 2.1, 41.3, 0.4, 0.6, 0, 5.4, 57.9,
9.9, 30, 221, 167.2
), RibGroup2 = c(
32.7, 32, 98.1, 6.3, 85.5,
96.6, 41.1, 44.9, 50.4, 27.3, 0, 45.4, 199.1, 179.2, 86.1, 0,
58.4, 43.3, 41.8, 42.1, 22.1, 11.8, 71.8, 112, 204.1, 40.9, 24.5,
210.9
), RibGroup3 = c(
90.8, 15.4, 10.5, 124.4, 33.9, 8.4, 38.3,
56.9, 13.5, 0, 32.6, 132.8, 160.7, 168.7, 60.7, 131.9, 110.8,
29.2, 131.3, 62.1, 6.1, 0, 0, 3.4, 23.9, 192.7, 165.5, 0
), SinGroup1 = c(
235.2,
225.4, 226.1, 234.4, 222.1, 232.3, 233.4, 201.9, 195.3, 209.4,
233.6, 223.6, 222.2, 232, 224, 149.8, 201.6, 220.2, 203.1, 212.1,
71.9, 82.3, 183.2, 210.6, 198.6, 230.8, 218, 163.2
), SinGroup2 = c(
233.4,
225.6, 227, 51.6, 76, 230.7, 233.1, 202.7, 200.2, 207.2, 228.4,
226.2, 183.9, 230.4, 222.3, 227.7, 177.9, 152, 218.6, 210.6,
80.9, 63.2, 188.1, 209.5, 233.2, 210.1, 226.5, 200.5
), SinGroup3 = c(
233.2,
188.5, 226.9, 234.7, 222.8, 234.6, 220.6, 156.4, 209.2, 218.7,
232.9, 226.1, 215.4, 231, 222.7, 222.7, 183.7, 203.8, 216.8,
112, 0, 39.6, 180.8, 203.6, 221.1, 228.9, 202.8, 186.7
), SinGroup4 = c(
218,
215.5, 226.8, 235.6, 223.6, 234.8, 234.9, 69.3, 192, 207.8, 235.2,
217.2, 235.1, 231.8, 223.5, 230.5, 225.6, 220.1, 220, 211.9,
114.8, 44.5, 158.5, 206.3, 231.8, 179, 225.3, 198.6
)
)
# Series with 12 observations per year, beginning in period 7 of 2016.
Production_data_ts <- ts(Production_data, start = c(2016, 7), frequency = 12)

# Split each column name into a 3-character and a 6-character part to define
# the levels of the hierarchy.
Production_data_hts <- hts(Production_data_ts, characters = c(3, 6))

# Training window: 2016-07 to 2018-06. Test window: 2018-07 to 2018-10.
data <- window(Production_data_hts, start = c(2016, 7), end = c(2018, 6))
test <- window(Production_data_hts, start = c(2018, 7), end = c(2018, 10))

# Forecast as many steps as the test window holds (4), then score the
# forecasts against the held-out test data.
forecasts <- forecast(data, h = 4, method = "bu")
accuracy(forecasts, test)
I have a large data set that consists of thousands of measurements of length and weight. I have provided a subset of 500 observations here:
df <- structure(list(length_cm = c(24.7, 23.8, 21.9, 23.2, 23.5, 22.2,
20.5, 22.6, 24, 21.6, 22.4, 21.2, 20.6, 23.1, 21.4, 23.1, 23.5,
23, 21.8, 22.4, 23, 23.8, 24, 21, 23.4, 23.2, 21.6, 25.9, 22.1,
30.6, 22.1, 21.7, 23.2, 21.1, 23.8, 23.2, 27.2, 23.8, 21.6, 21.1,
21.7, 22.9, 23.3, 24.1, 22.7, 20.4, 22.5, 21.7, 23.2, 22.7, 20.6,
23.7, 24.6, 23.5, 26.3, 23.6, 22.2, 23.6, 21.4, 23.3, 24.7, 24.4,
21.8, 24.9, 22.2, 23.1, 25, 23.5, 22.5, 20.4, 23.9, 23.7, 24,
24.2, 22.9, 36.4, 30, 26, 28.5, 27, 35.7, 24.3, 28.6, 29.8, 18.7,
25.7, 34.7, 31.4, 23.4, 37.7, 26.7, 28.3, 30.8, 29.2, 27.2, 25.6,
39, 35.1, 41.2, 35.7, 29.9, 25.7, 24.6, 24, 24.9, 31, 29.9, 29.4,
25.4, 20.2, 27.8, 32.7, 23.4, 29.1, 26.3, 25.7, 26, 24.9, 26.3,
31.5, 30.1, 25.9, 28.8, 37.9, 38.4, 21.5, 20.5, 21.3, 21.3, 20.9,
20.8, 22.5, 22.4, 21.4, 16.8, 17.3, 22.7, 19.7, 21.2, 18.1, 23.5,
18.1, 22, 18.5, 18.4, 19.2, 19.4, 19.9, 20.5, 18.6, 22.6, 20.9,
20.7, 20.6, 20.6, 21.6, 23.7, 22.8, 22.9, 20.8, 21.3, 23.5, 21.1,
21.6, 24, 21, 23.3, 20.3, 22.4, 23.7, 24.6, 20.7, 23.1, 22.6,
22.7, 19.5, 23, 19.8, 21, 19.8, 19.8, 17.2, 21.8, 25.3, 21.3,
19.2, 22.1, 24.5, 23.2, 22.6, 19, 22, 17.5, 19.9, 24.4, 23.7,
19.9, 23, 20.5, 18.3, 23.2, 21.1, 20.4, 22.2, 19.7, 19.2, 24,
23.3, 23.3, 19, 21.5, 22, 19.1, 23.7, 19.9, 21.2, 23, 27.3, 20.7,
22, 19.3, 24.9, 18.2, 20, 19.3, 25, 18, 21.8, 23.4, 23.9, 25.2,
18.5, 22.2, 24.6, 22, 20.4, 20.7, 21.7, 19.1, 23.1, 21.5, 21.2,
20.6, 22.3, 22.8, 21.3, 21.6, 22, 23, 24.2, 21.3, 19.7, 18.8,
20.9, 20.3, 22.3, 18.9, 19.9, 20.2, 23.9, 19.7, 19.5, 17.6, 23.1,
20.4, 20, 19.7, 20.3, 21.2, 23.9, 24, 25.6, 23.9, 23.5, 20.5,
30.8, 32.8, 28.4, 28.7, 28, 28.9, 29.8, 31, 31.7, 28.6, 28.7,
28.7, 26.7, 24.6, 30, 36.5, 26.5, 32, 29.6, 30.7, 27.7, 24.1,
29.8, 28.8, 26, 22.4, 24, 24.8, 22.7, 22.7, 23.8, 25.3, 32.3,
26.8, 22.1, 24.2, 23.8, 25.3, 24.1, 22.6, 22.9, 24.4, 26.7, 24.4,
24.7, 25, 23.7, 24.3, 22.3, 22.7, 20, 22.5, 24.5, 25.1, 24, 22,
20, 21.9, 18.3, 19.9, 19.4, 23.5, 20.2, 20, 17.8, 20.5, 23.2,
18.5, 21.2, 18.2, 19.1, 22.1, 18.3, 21.6, 19.5, 22.7, 23.6, 24.6,
23.2, 24.4, 19.1, 22.8, 23, 18.8, 22.6, 19, 21.7, 20.8, 23.7,
20.8, 20, 23.2, 22, 21.4, 20.6, 22.6, 23.8, 21, 26.4, 24.5, 32.6,
36.1, 36, 31, 33.1, 31.3, 34.2, 41.9, 35.4, 33.9, 31.9, 29.3,
34.2, 29.9, 36.4, 38.5, 30.7, 40.2, 34.1, 29.7, 37.8, 37.8, 35.3,
39, 39.5, 34.1, 30.5, 33.3, 33.2, 36, 31.6, 35, 34.2, 33.1, 31.5,
33.5, 33.7, 39, 33.2, 35, 34.1, 32.6, 36.2, 34.4, 31.7, 32, 37.5,
31.5, 32.7, 31.7, 35.7, 32.4, 28.5, 33.7, 33.9, 33.6, 34, 32,
29.8, 35, 36, 31.7, 32.5, 32, 31, 29.5, 33.4, 32.5, 26.5, 28,
35.3, 26, 26.5, 38.9, 32.7, 36.4, 35.7, 27.7, 25.8, 25.3, 30.1,
36, 33.4, 37, 33.6, 31.7, 29.7, 35.9, 28.5, 33.1, 33.9, 29, 36.5,
35.5, 29.2, 37.3, 40.3, 35.7, 32.6, 38.8, 40, 38.9, 39, 33.3,
33.5, 34.3, 38.8, 34.4, 36, 35.9, 35.1, 30.7, 38.1, 31.3, 35,
36.3, 32.4, 32.3, 35.5, 36.4, 36, 40.8, 34.2, 30.1, 35.6), wt_kg = c(0.165,
0.1412, 0.1043, 0.1225, 0.1247, 0.1099, 0.087, 0.1176, 0.1431,
0.1041, 0.1213, 0.0937, 0.0856, 0.1255, 0.1099, 0.124, 0.1361,
0.1384, 0.1021, 0.1113, 0.12, 0.1513, 0.1448, 0.0978, 0.138,
0.1232, 0.0942, 0.1881, 0.1038, 0.3498, 0.1122, 0.094, 0.1268,
0.1009, 0.1358, 0.12, 0.2388, 0.1456, 0.0982, 0.0903, 0.1005,
0.1252, 0.1138, 0.1476, 0.1326, 0.0849, 0.108, 0.0996, 0.1229,
0.1279, 0.0874, 0.1492, 0.1416, 0.1187, 0.193, 0.1383, 0.1125,
0.1449, 0.0941, 0.1265, 0.1823, 0.1455, 0.0948, 0.1603, 0.1119,
0.1124, 0.1641, 0.1259, 0.116, 0.086, 0.1361, 0.1284, 0.1403,
0.1461, 0.1195, 0.5985, 0.3099, 0.1829, 0.2688, 0.2244, 0.6214,
0.1554, 0.2475, 0.2976, 0.0683, 0.1731, 0.4751, 0.356, 0.1388,
0.5939, 0.2122, 0.2784, 0.3689, 0.3127, 0.2284, 0.1775, 0.6697,
0.5998, 0.8374, 0.5647, 0.3187, 0.1704, 0.1619, 0.1413, 0.1621,
0.3577, 0.319, 0.2846, 0.1815, 0.0776, 0.2567, 0.4483, 0.1337,
0.2798, 0.202, 0.1847, 0.1758, 0.1659, 0.1828, 0.3669, 0.3211,
0.1863, 0.2559, 0.6901, 0.6483, 0.0922, 0.088, 0.099, 0.0836,
0.094, 0.099, 0.1157, 0.1138, 0.1046, 0.0495, 0.0513, 0.119,
0.0761, 0.0936, 0.0564, 0.1438, 0.0636, 0.1134, 0.0641, 0.0594,
0.0713, 0.0733, 0.0804, 0.0853, 0.0689, 0.118, 0.0892, 0.0875,
0.0837, 0.0807, 0.1065, 0.1385, 0.1163, 0.1305, 0.0923, 0.0974,
0.1176, 0.0848, 0.1059, 0.157, 0.0932, 0.1127, 0.0779, 0.1048,
0.1327, 0.1688, 0.1096, 0.1304, 0.1173, 0.115, 0.0742, 0.129,
0.0629, 0.0992, 0.0758, 0.0722, 0.0535, 0.0958, 0.1721, 0.1017,
0.0766, 0.1099, 0.152, 0.128, 0.1185, 0.065, 0.1176, 0.0565,
0.0866, 0.163, 0.12, 0.0825, 0.1149, 0.0839, 0.0587, 0.1335,
0.0968, 0.0901, 0.1073, 0.0802, 0.0744, 0.1493, 0.1384, 0.1128,
0.0738, 0.1146, 0.1108, 0.08, 0.1285, 0.0829, 0.1116, 0.1368,
0.2348, 0.0995, 0.0989, 0.0748, 0.1484, 0.0629, 0.0823, 0.075,
0.1768, 0.0607, 0.1142, 0.1289, 0.1506, 0.1742, 0.0626, 0.1187,
0.1509, 0.1144, 0.0928, 0.0946, 0.099, 0.0717, 0.1318, 0.1025,
0.093, 0.0972, 0.1325, 0.1209, 0.0943, 0.1006, 0.1073, 0.1336,
0.1439, 0.1066, 0.0765, 0.0673, 0.1082, 0.0923, 0.1139, 0.068,
0.0758, 0.0868, 0.1499, 0.0779, 0.0794, 0.0575, 0.1392, 0.0915,
0.0845, 0.086, 0.084, 0.1049, 0.1486, 0.1573, 0.177, 0.1319,
0.13, 0.0872, 0.388, 0.4751, 0.2898, 0.2931, 0.2663, 0.2838,
0.3494, 0.3675, 0.4342, 0.2907, 0.3072, 0.2815, 0.2761, 0.1945,
0.3512, 0.615, 0.2195, 0.4818, 0.3684, 0.4056, 0.2841, 0.1617,
0.3425, 0.288, 0.1962, 0.1285, 0.1553, 0.1708, 0.1332, 0.1167,
0.1491, 0.2028, 0.1267, 0.2406, 0.1257, 0.1499, 0.1559, 0.1895,
0.1508, 0.1111, 0.1274, 0.1675, 0.2324, 0.1732, 0.1491, 0.1568,
0.1465, 0.1548, 0.1245, 0.1399, 0.0855, 0.1151, 0.1612, 0.1693,
0.1493, 0.1208, 0.088, 0.1106, 0.0654, 0.0827, 0.0794, 0.1331,
0.0834, 0.0837, 0.0619, 0.092, 0.1397, 0.071, 0.1035, 0.0676,
0.0729, 0.0906, 0.064, 0.0985, 0.0823, 0.1206, 0.155, 0.1438,
0.1357, 0.1695, 0.0834, 0.1359, 0.1289, 0.0764, 0.1249, 0.0775,
0.1139, 0.104, 0.1566, 0.1069, 0.0869, 0.1376, 0.1223, 0.105,
0.0996, 0.1356, 0.1335, 0.0951, 0.2162, 0.1744, 0.4547, 0.5789,
0.5555, 0.3899, 0.5037, 0.4281, 0.486, 1.0209, 0.5855, 0.5312,
0.488, 0.3133, 0.5054, 0.3724, 0.59, 0.8119, 0.3811, 0.797, 0.5139,
0.348, 0.7722, 0.743, 0.548, 0.8791, 0.9054, 0.5392, 0.4333,
0.5314, 0.4976, 0.5953, 0.4288, 0.5179, 0.5634, 0.5331, 0.4371,
0.5709, 0.5065, 0.8047, 0.5368, 0.5657, 0.5816, 0.4763, 0.5907,
0.533, 0.4384, 0.4949, 0.7277, 0.4445, 0.4894, 0.4655, 0.5384,
0.5106, 0.3343, 0.5186, 0.5262, 0.5311, 0.495, 0.4691, 0.3465,
0.5558, 0.5975, 0.4768, 0.4802, 0.4573, 0.4037, 0.3316, 0.5152,
0.4673, 0.2356, 0.2905, 0.5672, 0.2097, 0.2216, 0.7384, 0.4089,
0.6159, 0.5219, 0.2866, 0.2443, 0.2071, 0.3658, 0.5861, 0.5021,
0.6953, 0.5053, 0.3978, 0.3853, 0.6207, 0.2944, 0.507, 0.4412,
0.3424, 0.6597, 0.5892, 0.3295, 0.6505, 0.9334, 0.6674, 0.4919,
0.8392, 0.9123, 0.813, 0.8223, 0.5801, 0.5745, 0.5148, 0.8514,
0.5563, 0.6417, 0.6445, 0.5701, 0.4186, 0.8303, 0.46, 0.6041,
0.6537, 0.5221, 0.4782, 0.5657, 0.6499, 0.6667, 0.9074, 0.555,
0.6696, 0.6083)), .Names = c("length_cm", "wt_kg"), row.names = c(NA,
500L), class = "data.frame")
The relationship between length and weight is not linear. Unfortunately, I could not include the whole data set here, but when the whole data set is used, a GAM provides the best fit, unlike in this subset, where loess is suggested.
I would like to focus on gam since an answer that works for the whole data set is what I am after.
It is obvious, even in the subset provided, that my data has some outliers; in the example data set (df) there are at least two obvious ones.
library(ggplot2)
ggplot(df, aes(x=wt_kg, y=length_cm))+
geom_point()+
stat_smooth(method = "gam", formula = y ~ s(x), size = 1)
Moving forward with a GAM approach, I would like to generate the prediction interval so that I can identify which points fall inside and outside of, say, the 95% prediction interval.
This is extremely simple to do with a linear regression using predict:
l_model <- lm(wt_kg ~ length_cm, data=df)
df <- cbind(df, predict(l_model, interval = "prediction"))
Then I simply plot the upper and lower bounds of the interval:
ggplot(df, aes(y=wt_kg, x=length_cm)) +
geom_ribbon(aes(ymin = lwr, ymax = upr),
fill = "blue", alpha = 0.2) +
geom_point()
But I can't seem to find a similar approach that works when using gam instead of lm. I have tried predict.gam from the mgcv package with no success.
library(mgcv)
df_model <- gam(wt_kg ~ length_cm, data=df)
gam_pred <- cbind(df, mgcv::predict.gam(df_model))
I don't get any errors when running this; however, what I get back is a single column of data, which I am unsure how to interpret. Any help would be much appreciated.
I think that part of your code is:
require(broom)
require(gam)
mod <- gam(wt_kg ~ length_cm, data=df)
pred <- augment(mod)
But I don't understand the second ggplot2 call. `pred` has the fitted values and other features of your regression, mainly .resid.