Replacing values in df using index - why not working? - r

I am using the function provided in here: Replacing values in df using index and here: How to repeat the Grubbs test and flag the outliers
# Detect outliers in a numeric vector by repeatedly applying the Grubbs test
# (package `outliers`) and return the vector with every flagged value set to NA.
# Stops with an error when fewer than 3 values are supplied (Grubbs requirement).
grubbs.flag <- function(vector) {
  # throw an error if there are too few values for the Grubbs test
  # (checked BEFORE the first grubbs.test call, which would fail anyway)
  if (length(vector) < 3) stop("Grubb's test requires > 2 input values")
  outliers <- NULL
  test <- vector
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  while (pv < 0.05) {
    # grubbs.test only reports the outlier inside the `alternative` string,
    # e.g. "highest value 0.077018 is an outlier"; parse the value out
    flagged <- as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3])
    outliers <- c(outliers, flagged)
    remaining <- vector[!vector %in% outliers]
    # Guard against an endless loop: if the parsed value does not exactly
    # match any element (grubbs.test can round the value it prints), nothing
    # is removed, the p-value never changes, and the loop would never end.
    if (length(remaining) == length(test)) {
      warning("Flagged value ", flagged,
              " not found in data (formatting/rounding in grubbs.test output); stopping")
      break
    }
    test <- remaining
    # stop if all but two values are flagged as outliers
    if (length(test) < 3) {
      warning("All but two values flagged as outliers")
      break
    }
    grubbs.result <- grubbs.test(test)
    pv <- grubbs.result$p.value
  }
  # Build the NA-masked result once at the end so that outliers found right
  # before a `break` are included too (the original updated na.vect only
  # after re-running the test, losing the last flagged value on early exit).
  # With no outliers, which() is integer(0) and the vector returns unchanged.
  replace(vector, which(vector %in% outliers), NA)
}
It works perfectly on the example data provided there. But when I try to run it on my data frame, it seems that the loop does not end. Does anyone know why that is?
My data:
test <- structure(list(Abs_18 = c(0.04359, 0.05682, 0.05002, 0.04997,
0.03433, 0.060055, 0.0447, 0.0499, 0.04509, 0.04875, 0.04052,
0.062785, 0.07602, 0.05072, 0.04253, 0.05595, 0.02888, 0.077018,
0.05416, 0.04966, 0.0476, 0.04252, 0.03891, 0.065207, 0.02675,
0.05892, 0.03523, 0.04546, 0.02696, 0.024995, 0.02469, 0.0442,
0.04504, 0.04421, 0.04683, 0.08017, -0.065334, 0.04914, 0.04086,
0.05341, 0.02706, 0.065362, 0.01571, 0.01021, 0.04802, 0.04807,
0.02735, 0.062755), FL_18 = c(3618, 3526, 3543, 5323, 5050, 767,
3641, 3418, 3353, 4179, 4864, 760, 3693, 3408, 3309, 5057, 4686,
748, 3693, 3349, 3240, 3934, 4876, 741, 2394, 3477, 3417, 4254,
4899, 755, 2375, 3486, 3370, 4516, 4838, 772, 817, 3449, 3361,
3945, 4856, 802, 2293, 2529, 3410, 4460, 5175, 813), Abs_25 = c(0.04261,
0.05332, 0.04966, 0.0482, 0.03355, 0.059344, 0.04572, 0.04967,
0.04275, 0.04989, 0.02745, 0.059196, 0.04649, 0.05517, 0.04181,
0.06214, 0.02749, 0.074719, 0.05264, 0.044, 0.04486, 0.03999,
0.0331, 0.058829, 0.03119, 0.05943, 0.03781, 0.04003, 0.02383,
0.069582, 0.02868, 0.04943, 0.04566, 0.0422, 0.03265, 0.067265,
-0.067674, 0.05038, 0.03828, 0.03854, 0.02671, 0.071176, 0.01602,
0.01055, 0.03961, 0.04729, 0.03009, 0.06377), FL_25 = c(2714,
2656, 2625, 3856, 3642, 606, 2759, 2580, 2498, 3276, 3495, 596,
2808, 2590, 2482, 3759, 3365, 586, 2838, 2548, 2433, 2864, 3557,
591, 1878, 2664, 2588, 3081, 3603, 602, 1820, 2672, 2576, 3154,
3589, 617, 572, 2661, 2575, 2918, 3601, 635, 1739, 1924, 2650,
3260, 3866, 655)), .Names = c("Abs_18", "FL_18", "Abs_25", "FL_25"
), row.names = c(NA, -48L), class = "data.frame")
I am using:
apply(test,2,grubbs.flag)

Related

Using gratia::data_slice() for a GAM with an offset

I am trying to get my fitted values from a gam with a few of the features on the GitHub version of gratia and am having trouble using the data_slice() function with a model that has an offset. I am not sure what I need to do to get the data I need for the fitted_values() function when my model has an offset.
I have a data set that looks like this:
df <- data.frame(
Landings = c(6918, 4899, 43, 0, 1712, 34427, 1080, 2826, 30521, 53302, 19467,
98013, 399, 13915, 568, 1399, 5345, 219271, 79400, 8195, 4956,
634, 12963, 2430, 32003, 2598, 3772, 5759, 11695, 35459, 0, 8959,
66760, 30628, 52, 859, 14417, 1688, 3287, 0, 22661, 31184, 4169,
78, 4647, 49, 1241, 5684, 11788, 43606, 2662, 9887, 17844, 64693,
28943, 55279, 5321, 18504, 197130, 298, 77454, 22884, 5359, 117578,
14252, 16361, 3775, 4375, 4402, 39140, 13047, 2892, 21982, 458,
68849, 4966, 72827, 29788, 319, 20712, 10327, 11536, 21070, 2545,
51660, 12031, 16566, 3263, 64641, 82188, 101116, 3510, 694, 5999,
70, 21446, 1908, 66598, 52434, 39591),
per.change = c(NA, NA, NA, -0.0170667179320969, -0.0310932741933139, -0.00876883009292576,
-0.0337896071523766, NA, -0.0383733367412093, 0.0313493941187739,
NA, 0.00309694472510056, 0.00440176543788437, 0.0162746873496682,
0.0633103545787895, -0.0304660643253923, -0.0477002654740667,
0.0465198702443307, 0.0848090520628948, NA, 0.00136588406239499,
0.02768520985601, -0.0439813588624489, 0.150138388427594, 0,
0.0393789469840049, -0.0339055127886893, -0.0253342452162168,
0.06662884765506, 0.0489644208362528, NA, NA, 0.118029428502036,
0.00145541011350925, NA, NA, 0.129622031181149, NA, -0.0308115567673083,
NA, -0.0425221990122855, 0.0294010131077341, -0.0227758522729325,
NA, 0.0664264452451825, -0.0542076704097344, -0.0630814426046203,
-0.0202991435976089, -0.0228409071757005, -0.0481243087379853,
-0.0222487699626362, 0.00268148684149014, 0.032275119594268,
0.0592311473502147, NA, -0.0402932362775077, 0.0225902785267178,
-0.0245393760611263, -0.0910224764171599, 0.0248344347525319,
-0.0132098512036838, -0.00480626865142122, -0.0207390648567119,
-0.0210938178547339, 0.0653168963830966, -0.0258970505397598,
-0.0266521730813433, NA, 0.00766938294024467, 0.0361020509477387,
-0.00356050066471782, -0.0303002256316303, -0.0493053708804782,
NA, 0.095008528099584, 0.00673520533179099, -0.0145235131366679,
-0.0915368065151916, NA, -0.0633484162895928, -0.0364410398900781,
0.0297277420555085, -0.0259575275766121, 0.000333945513698282,
-0.040540978212464, -0.105289527646177, 0.00931710632328382,
-0.00481869261842514, 0.00990973004165871, -0.0207742441119774,
-0.0725670373822072, -0.00197394223951389, NA, NA, 0.069556715359811,
NA, -0.00233517445977803, -0.118707735630186, -0.0283717012552832,
0.0224756418583045),
lmb_eff = c(9348, 5383, 86.5, 1160, 1520.7, 37832, 1800.6, 9421, 24693,
80761, 20754, 297008.8, 3067, 10871, 1798, 3515, 19089, 261037,
107881, 4737, 10114, 396, 29462, 14639, 16328, 6186.5, 12572,
9930, 15188, 48112, 967, 16967.9, 71785, 69608.5, 742, 1492,
8099.5, 5723, 5218, 88, 35519, 31853, 9063, 654, 13276, 2439,
14262.5, 10526, 15113, 118817.9, 3646, 2808.7, 137263.9, 143763,
15816, 50026, 17221.6, 21516, 148777, 966, 140824.7, 35259.8,
6615.5, 113492, 39590, 68170, 16415, 3580, 8151, 32918, 20386,
3825.5, 19453, 208, 51380, 6208, 137409, 17409.8, 1028, 12007,
15413.6, 16622.5, 32974, 3397.5, 53812.6, 15057, 31256.6, 7124.8,
72063, 70913, 65447, 4228.8, 1020, 14887, 212, 14966, 10721.5,
58063, 100834, 43175),
stringsAsFactors = FALSE)
I am running a gam that looks like this:
gam<-gam(Landings~s(per.change)+offset(log(lmb_eff)),data=df)
I try to use the data_slice function and I keep getting an error.
ds <- data_slice(gam, per.change = evenly(per.change, n = 100), Season = evenly(Season), lmb_eff=1)
Error in eval(predvars, data, env) : object 'lmb_eff' not found
I see in the help file that there is an offset = that I can add to the code but I have tried a bunch of iterations and not had any luck figuring out what I need to specify so that I can use data_slice() with a model that has an offset.
It's a bug in an internal helper function https://github.com/gavinsimpson/gratia/issues/189
In the meantime, until I fix this:
# Manual workaround until the data_slice() offset bug is fixed:
# build the prediction grid by hand with an evenly spaced per.change
new_df <- with(df,
  tidyr::expand_grid(per.change = evenly(per.change, n = 100)))
# Setting the offset variable to 1 means log(lmb_eff) = 0, i.e. no offset
ds <- dplyr::bind_cols(new_df, lmb_eff = rep(1, nrow(new_df)))
# The fitted model above was assigned to `gam`, not `m`
fitted_values(gam, data = ds)

Is there an explanation for this R function merge() error?

I am trying to use the R merge function to combine two data.frames, but keep getting the following error:
Error in fix.by(by.y, y) : 'by' must specify a uniquely valid column
I am not sure what this error means or how to resolve it.
My code thus far is the following:
movies <- read_csv("movies.csv")

# Vectorized extraction of the first word of each director name
# (replaces the slow row-by-row for loop over 1:nrow(movies))
movies$firsts <- vapply(
  str_split(movies$director, " ", n = 2),
  function(parts) parts[1],
  character(1)
)

# Drop problem rows. Note: 4573 appears twice in this vector; duplicated
# negative indices are harmless in R — the row is removed once.
movies <- movies[-c(137, 147, 211, 312, 428, 439, 481, 555, 602, 830, 850, 1045, 1080, 1082, 1085, 1096, 1255, 1258, 1286, 1293, 1318, 1382, 1441, 1456, 1494, 1509, 1703, 1719, 1735, 1944, 1968, 1974, 1977, 2098, 2197, 2409, 2516, 2546, 2722, 2751, 2988, 3191,
3227, 3270, 3283, 3285, 3286, 3292, 3413, 3423, 3470, 3480, 3511, 3676, 3698, 3826, 3915, 3923, 3954, 4165, 4381, 4385, 4390, 4397, 4573, 4711, 4729, 4774, 4813, 4967, 4974, 5018, 5056, 5258, 5331, 5405, 5450, 5469, 5481, 4573, 5708, 5715, 5786, 5886, 5888, 5933, 5934, 6052, 6091, 6201, 6234, 6236, 6511, 6544, 6551, 6562, 6803, 4052, 4121, 4326), ]
movies <- movies[-c(4521, 5846), ]

g <- gender_df(movies, name_col = "firsts", year_col = "year", method = c("ssa"))

# merge(by = c("firsts", "name")) fails with "'by' must specify a uniquely
# valid column" because no frame has both columns; match movies$firsts
# against g$name explicitly instead.
merge(x = movies, y = g, by.x = "firsts", by.y = "name", all = FALSE)
I think you are trying to give the by argument an invalid value. Indeed, the documentation says:
By default the data frames are merged on the columns with names they
both have, but separate specifications of the columns can be given by
by.x and by.y. The rows in the two data frames that match on the
specified columns are extracted, and joined together. If there is more
than one match, all possible matches contribute one row each. For the
precise meaning of ‘match’, see match.
In your case, you shall try the following:
merge(x = movies,y = g, by.x = "firsts", by.y = "name", all = FALSE)

Spreading a data frame using an "external" df?

I have two data frames data1 and data2. I am trying to spread my data or create dummy variables on one column x2 in data1. I can do the following:
library(dummies)
# Create one indicator (dummy) column per distinct value of data1$x2
x2dummy <- dummy(data1$x2)
# Bind the dummy columns to the original data.
# (The object created above is x2dummy — the original `x1dummy` was a typo
# referencing an undefined object.)
final_out <- cbind(data1, x2dummy)
Which will give me a large data frame of 190 columns and 500 observations, however the universe of x2 items is larger than that in the current data frame data1. I have a sort of dictionary or a different data frame consisting of all the unique items which can be chosen data2. How can I spread my data data1 by data2 so that I will have 441 dummy variable columns (the length of data2) and populate it with the items in data1?
EDIT: Adding new smaller sample of data:
Data 1:
data1 <- structure(list(y = c(440000, 550000, 990, 135000, 267000, 135000,
239000, 170000, 855000, 158000, 1200, 256000, 86000, 98700, 450000,
130000, 465000, 308000, 680000, 305000), x1 = c(240, 156, 52,
74, 85, 70, 160, 176, 386, 65, 52, 90, 87, 193, 110, 105, 126,
76, 153, 133), x2 = c(8338, 8860, 8003, 8207, 8901, 8224, 8811,
8508, 8840, 8940, 8012, 8223, 8206, 8490, 8023, 8490, 8870, 8024,
8011, 8394)), .Names = c("y", "x1", "x2"), row.names = c(NA,
20L), class = "data.frame")
Data2:
data2 <- c(4375, 8001, 8002, 8003, 8004, 8005, 8006, 8007, 8008, 8009,
8010, 8011, 8012, 8013, 8014)
EDIT:
Thanks for the edits from the community; however, data2 now does not contain the full universe of information. For example, in data1, x2 = 8206, yet this value does not appear in data2 above — and data2 is what I am trying to spread the data by.
I want to spread the columns of a new data frame by all unique values in data2 and then populate these columns with the values in data1 column x2.
Based on the small data in data1 I will have a very sparse matrix.
Data2
data2 <- structure(list(x2_dictionary = c(4375, 8001, 8002, 8003, 8004,
8005, 8006, 8007, 8008, 8009, 8010, 8011, 8012, 8013, 8014, 8015,
8016, 8017, 8018, 8019, 8020, 8021, 8022, 8023, 8024, 8025, 8026,
8026, 8027, 8028, 8029, 8030, 8031, 8032, 8033, 8034, 8035, 8036,
8037, 8038, 8039, 8040, 8041, 8042, 8100, 8104, 8105, 8106, 8107,
8110, 8120, 8130, 8140, 8146, 8148, 8148, 8150, 8160, 8161, 8170,
8172, 8173, 8174, 8175, 8178, 8180, 8181, 8182, 8183, 8183, 8183,
8184, 8184, 8185, 8186, 8187, 8188, 8189, 8190, 8191, 8192, 8193,
8194, 8195, 8196, 8197, 8198, 8201, 8202, 8203, 8204, 8205, 8206,
8207, 8208, 8210, 8211, 8212, 8213, 8214, 8220, 8221, 8222, 8223,
8224, 8225, 8226, 8227, 8228, 8230, 8231, 8232, 8233, 8240, 8241,
8242, 8243, 8250, 8251, 8251, 8253, 8254, 8254, 8255, 8256, 8256,
8259, 8260, 8261, 8262, 8263, 8269, 8269, 8270, 8270, 8271, 8272,
8273, 8274, 8275, 8275, 8278, 8278, 8279, 8280, 8281, 8281, 8281,
8281, 8282, 8282, 8289, 8289, 8290, 8291, 8292, 8293, 8294, 8295,
8296, 8297, 8298, 8299, 8301, 8302, 8303, 8304, 8310, 8317, 8318,
8319, 8320, 8328, 8329, 8330, 8338, 8339, 8340, 8348, 8349, 8350,
8350, 8358, 8359, 8360, 8370, 8380, 8384, 8389, 8390, 8391, 8392,
8393, 8394, 8395, 8396, 8397, 8398, 8401, 8401, 8402, 8403, 8410,
8415, 8416, 8420, 8430, 8440, 8440, 8445, 8450, 8455, 8458, 8458,
8459, 8459, 8460, 8460, 8460, 8461, 8469, 8469, 8470, 8470, 8471,
8472, 8474, 8476, 8479, 8480, 8490, 8495, 8500, 8503, 8503, 8504,
8504, 8505, 8506, 8507, 8508, 8508, 8509, 8510, 8510, 8511, 8511,
8512, 8513, 8514, 8515, 8516, 8518, 8519, 8519, 8519, 8519, 8520,
8521, 8529, 8530, 8530, 8540, 8550, 8551, 8552, 8553, 8554, 8559,
8560, 8569, 8569, 8570, 8571, 8572, 8573, 8580, 8585, 8587, 8588,
8589, 8589, 8589, 8590, 8591, 8591, 8592, 8593, 8600, 8607, 8610,
8611, 8612, 8613, 8619, 8619, 8619, 8620, 8629, 8630, 8635, 8640,
8650, 8660, 8670, 8672, 8672, 8680, 8690, 8691, 8692, 8693, 8693,
8694, 8694, 8695, 8695, 8696, 8696, 8697, 8698, 8699, 8699, 8699,
8699, 8700, 8710, 8711, 8712, 8717, 8717, 8718, 8719, 8719, 8719,
8719, 8720, 8729, 8730, 8731, 8731, 8732, 8732, 8733, 8734, 8734,
8735, 8736, 8737, 8738, 8739, 8739, 8740, 8750, 8753, 8754, 8755,
8756, 8757, 8758, 8759, 8760, 8769, 8770, 8770, 8773, 8775, 8776,
8777, 8779, 8780, 8781, 8782, 8783, 8784, 8785, 8786, 8787, 8787,
8787, 8787, 8788, 8789, 8790, 8791, 8792, 8792, 8793, 8794, 8795,
8796, 8797, 8798, 8798, 8799, 8800, 8801, 8810, 8811, 8812, 8818,
8820, 8830, 8840, 8840, 8849, 8850, 8859, 8860, 8870, 8871, 8880,
8901, 8902, 8903, 8904, 8905, 8906, 8907, 8908, 8911, 8912, 8913,
8914, 8915, 8916, 8917, 8918, 8921, 8922, 8923, 8924, 8930, 8940,
8950, 8960, 8970, 8980, 17532, 43421, 80338)), class = "data.frame", row.names = c(NA,
-441L), .Names = "x2_dictionary")

R - How to perform cross-year date operations?

I am working with daily measurements of temperature. In total I have about 40 years of observations. How can I perform date operations covering a time interval that crosses years?
For example, I want to sum the values from every october-to-february period. However, the sum should be taken only on the contiguous period of oct-nov-dec-jan-feb.
"Isolated" months should not be taken into account — for example, jan and feb of the first year, and oct-nov-dec of the last year. The sum has to run over the contiguous period only (oct-nov-dec-jan-feb).
For example, this is what I am looking for:
1st year 2nd year 3rd year
J-F-M-A-M-J-J-A-S-**O-N-D J-F**-M-A-M-J-J-A-S-**O-N-D J-F**-M-A-M-J-J-A-S-O-N-D
But this is not OK:
1st year 2nd year 3rd year
**J-F**-M-A-M-J-J-A-S-**O-N-D J-F**-M-A-M-J-J-A-S-**O-N-D J-F**-M-A-M-J-J-A-S-**O-N-D**
This is a sample data frame to work on:
df <- structure(list(date = structure(c(-3653, -3622, -3593, -3562,
-3532, -3501, -3471, -3440, -3409, -3379, -3348, -3318, -3287,
-3256, -3228, -3197, -3167, -3136, -3106, -3075, -3044, -3014,
-2983, -2953, -2922, -2891, -2863, -2832, -2802, -2771, -2741,
-2710, -2679, -2649, -2618, -2588, -2557, -2526, -2498, -2467,
-2437, -2406, -2376, -2345, -2314, -2284, -2253, -2223, -2192,
-2161, -2132, -2101, -2071, -2040, -2010, -1979, -1948, -1918,
-1887, -1857, -1826, -1795, -1767, -1736, -1706, -1675, -1645,
-1614, -1583, -1553, -1522, -1492, -1461, -1430, -1402, -1371,
-1341, -1310, -1280, -1249, -1218, -1188, -1157, -1127, -1096,
-1065, -1037, -1006, -976, -945, -915, -884, -853, -823, -792,
-762, -731, -700, -671, -640, -610, -579, -549, -518, -487, -457,
-426, -396, -365, -334, -306, -275, -245, -214, -184, -153, -122,
-92, -61, -31, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304,
334, 365, 396, 424, 455, 485, 516, 546, 577, 608, 638, 669, 699,
730, 761, 790, 821, 851, 882, 912, 943, 974, 1004, 1035, 1065,
1096, 1127, 1155, 1186, 1216, 1247, 1277, 1308, 1339, 1369, 1400,
1430, 1461, 1492, 1520, 1551, 1581, 1612, 1642, 1673, 1704, 1734,
1765, 1795, 1826, 1857, 1885, 1916, 1946, 1977, 2007, 2038, 2069,
2099, 2130, 2160, 2191, 2222, 2251, 2282, 2312, 2343, 2373, 2404,
2435, 2465, 2496, 2526, 2557, 2588, 2616, 2647, 2677, 2708, 2738,
2769, 2800, 2830, 2861, 2891, 2922, 2953, 2981, 3012, 3042, 3073,
3103, 3134, 3165, 3195, 3226, 3256, 3287, 3318, 3346, 3377, 3407,
3438, 3468, 3499, 3530, 3560, 3591, 3621, 3652, 3683, 3712, 3743,
3773, 3804, 3834, 3865, 3896, 3926, 3957, 3987, 4018, 4049, 4077,
4108, 4138, 4169, 4199, 4230, 4261, 4291, 4322, 4352, 4383, 4414,
4442, 4473, 4503, 4534, 4564, 4595, 4626, 4656, 4687, 4717, 4748,
4779, 4807, 4838, 4868, 4899, 4929, 4960, 4991, 5021, 5052, 5082,
5113, 5144, 5173, 5204, 5234, 5265, 5295, 5326, 5357, 5387, 5418,
5448, 5479, 5510, 5538, 5569, 5599, 5630, 5660, 5691, 5722, 5752,
5783, 5813, 5844, 5875, 5903, 5934, 5964, 5995, 6025, 6056, 6087,
6117, 6148, 6178, 6209, 6240, 6268, 6299, 6329, 6360, 6390, 6421,
6452, 6482, 6513, 6543, 6574, 6605, 6634, 6665, 6695, 6726, 6756,
6787, 6818, 6848, 6879, 6909, 6940, 6971, 6999, 7030, 7060, 7091,
7121, 7152, 7183, 7213, 7244, 7274), class = "Date"), temp = c(22.9223529411765,
23.0705882352941, 23.1094117647059, 20.7835294117647, 17.4517647058824,
17.3176470588235, 18.0494117647059, 19.6188235294118, 21.3023529411765,
23.1105882352941, 22.2364705882353, 22.7482352941176, 23.5870588235294,
24.0023529411765, 23.0094117647059, 22.0176470588235, 19.4917647058824,
18.1011764705882, 18.3164705882353, 20.0623529411765, 22.8717647058824,
23.2576470588235, 23.68, 22.3694117647059, 22.9517647058824,
23.6976470588235, 23.3294117647059, 20.8564705882353, 18.16,
15.8988235294118, 15.7988235294118, 18.4176470588235, 20.8423529411765,
20.3247058823529, 22.3070588235294, 22.2035294117647, 24.2235294117647,
23.6976470588235, 24.4082352941176, 21.1752941176471, 18.1023529411765,
16.1211764705882, 18.3164705882353, 19.7635294117647, 23.1294117647059,
22.9964705882353, 23.6552941176471, 22.6964705882353, 23.6011764705882,
23.6517647058824, 23.7035294117647, 22.4352941176471, 18.5835294117647,
16.5976470588235, 15.7741176470588, 19.2541176470588, 20.8776470588235,
20.5729411764706, 21.1729411764706, 21.5870588235294, 22.4576470588235,
23.6058823529412, 21.84, 21.6694117647059, 19.2458823529412,
18.7517647058824, 17.7811764705882, 19.4764705882353, 21.9270588235294,
21.5470588235294, 22.88, 23.2458823529412, 24.2776470588235,
25.2470588235294, 23.4694117647059, 21.4435294117647, 19.3941176470588,
18.5447058823529, 17.6, 18.3764705882353, 19.8529411764706, 22.0823529411765,
22.7294117647059, 23.4011764705882, 23.3611764705882, 24.2505882352941,
23.2870588235294, 21.9482352941176, 20.5552941176471, 18.0788235294118,
18.5929411764706, 20.8752941176471, 21.9023529411765, 23.6105882352941,
22.4070588235294, 21.5635294117647, 23.3129411764706, 22.9741176470588,
23.3670588235294, 19.6105882352941, 16.9941176470588, 17.7670588235294,
17.4858823529412, 17.8517647058824, 20.26, 22.1576470588235,
23.8364705882353, 23.4447058823529, 24.8129411764706, 25.1764705882353,
24.2694117647059, 21.5035294117647, 20.0458823529412, 18.4694117647059,
18.4541176470588, 19.5388235294118, 22.02, 20.5364705882353,
22.9858823529412, 21.9752941176471, 23.7729411764706, 24.0576470588235,
24.0941176470588, 22.1552941176471, 21.2329411764706, 19.5611764705882,
17.8788235294118, 18.6823529411765, 20.1541176470588, 21.6258823529412,
21.5211764705882, 23.9811764705882, 24.8352941176471, 24.5882352941176,
24.1729411764706, 21.1035294117647, 19.0435294117647, 17.08,
17.4529411764706, 19.1458823529412, 20.4447058823529, 20.7129411764706,
21.5047058823529, 22.6952941176471, 23.4364705882353, 23.1, 24.1847058823529,
19.8105882352941, 19.9847058823529, 20.5188235294118, 17.7658823529412,
19.4435294117647, 20.7588235294118, 21.7835294117647, 22.7788235294118,
23.2388235294118, 24.9129411764706, 25.6, 23.5647058823529, 24.0058823529412,
19.7823529411765, 19.3152941176471, 18.7741176470588, 19.0305882352941,
20.5576470588235, 21.3611764705882, 21.4247058823529, 23.4811764705882,
23.6505882352941, 25.1870588235294, 23.3541176470588, 21.4823529411765,
18.7364705882353, 17.7235294117647, 18.3976470588235, 19.7235294117647,
21.0741176470588, 21.6094117647059, 22.9635294117647, 22.4011764705882,
23.4152941176471, 24.7741176470588, 24.3270588235294, 20.7976470588235,
18.8764705882353, 17.7788235294118, 16.4129411764706, 21.4117647058824,
22.3317647058824, 21.66, 22.3694117647059, 23.0917647058824,
24.4541176470588, 23.2847058823529, 23.3164705882353, 21.2529411764706,
19.1258823529412, 17.3882352941176, 17.3823529411765, 19.0529411764706,
19.6576470588235, 20.2976470588235, 21.9023529411765, 23.3094117647059,
24.0117647058824, 25.5611764705882, 24.9129411764706, 21.3964705882353,
19.9870588235294, 18.3929411764706, 20.9917647058824, 20.3058823529412,
21.4435294117647, 23.1941176470588, 22.8388235294118, 22.5176470588235,
24.6317647058824, 24.6541176470588, 24.2, 20.84, 18.4576470588235,
17.5011764705882, 19.16, 20.54, 20.1517647058824, 22.6776470588235,
22.7470588235294, 22.7882352941176, 22.0811764705882, 24.2152941176471,
22.9235294117647, 20.8411764705882, 19.6188235294118, 17.16,
16.0529411764706, 20.3223529411765, 19.9752941176471, 22.5152941176471,
22.2705882352941, 23.1541176470588, 23.1047058823529, 23.9517647058824,
24.8176470588235, 22.18, 20.5023529411765, 17.3505882352941,
19.1917647058824, 19.9894117647059, 19.0235294117647, 22.8235294117647,
22.7094117647059, 23.8741176470588, 24.0517647058824, 25.1764705882353,
23.9235294117647, 21.2929411764706, 20.6117647058824, 17.1305882352941,
16.3470588235294, 19.6470588235294, 21.3341176470588, 20.2176470588235,
23.7435294117647, 22.6741176470588, 22.9070588235294, 24.7152941176471,
23.2905882352941, 20.5776470588235, 18.9635294117647, 19.0658823529412,
18.8423529411765, 20.0729411764706, 21.3047058823529, 22.1588235294118,
24.0388235294118, 22.1917647058824, 24.0517647058824, 24.8729411764706,
23.0117647058824, 23, 21.3094117647059, 19.4105882352941, 20.3470588235294,
19.4482352941176, 20.0670588235294, 21.6364705882353, 23.4211764705882,
23.16, 25.4788235294118, 26.4741176470588, 24.0482352941176,
21.4176470588235, 21.7164705882353, 19.0905882352941, 19.6752941176471,
18.1611764705882, 20.0482352941176, 23.4917647058824, 23.4894117647059,
22.5482352941176, 23.1376470588235, 24.9811764705882, 24.1552941176471,
22.8423529411765, 19.7435294117647, 16.4, 17.3105882352941, 20.5235294117647,
21.0494117647059, 23.1352941176471, 23.9435294117647, 23.9058823529412,
24.9835294117647, 24.6952941176471, 24.0047058823529, 23.3164705882353,
21.5823529411765, 18.3447058823529, 18.1964705882353, 20.0035294117647,
20.7152941176471, 22.5705882352941, 24.6541176470588, 23.2329411764706,
25.0517647058824, 24.3329411764706, 23.5811764705882, 22.9988235294118,
19.4976470588235, 17.3188235294118, 19.5635294117647, 19.0211764705882,
19.7223529411765, 22.6858823529412, 23.9423529411765, 23.6905882352941,
25.7129411764706, 23.9505882352941, 24.4376470588235, 22.6070588235294,
19.8882352941176, 17.2058823529412, 16.4211764705882, 20.02,
21.9458823529412, 21.9341176470588, 22.74, 23.8, 23.9611764705882,
24.4564705882353, 24, 23.2129411764706, 19.4729411764706, 17.7105882352941,
16.9682352941176, 19.0341176470588, 20.2917647058824, 20.7776470588235,
22.9364705882353, 22.7894117647059)), .Names = c("date", "temp"
), row.names = c(NA, -360L), class = "data.frame")
Any input appreciated.
Hopefully this helps:
# Extract year and month directly from the Date column.
# (df$date is already a Date; converting it to POSIXct first can shift the
# date by a day depending on the local timezone, so the conversion is dropped.)
df$year <- as.numeric(format(df$date, format = "%Y"))
df$month <- as.numeric(format(df$date, format = "%m"))
years <- unique(df$year)

# One contiguous Oct-Feb window per consecutive pair of years.
# Starting at the FIRST year keeps the valid window formed by year 1's
# Oct-Dec plus year 2's Jan-Feb (the original loop started at 2 and
# silently skipped it); the last year's trailing Oct-Dec, which has no
# following Jan-Feb, is excluded by stopping at length(years) - 1.
rows <- lapply(seq_len(length(years) - 1), function(i) {
  # October-December of year i
  start <- df[df$year == years[i] & df$month %in% c(10, 11, 12), ]
  # January-February of the following year
  end <- df[df$year == years[i + 1] & df$month %in% c(1, 2), ]
  window <- rbind(start, end)
  # na.rm = TRUE guards against missing temperature values
  data.frame(Year = years[i], sum.data = sum(window$temp, na.rm = TRUE))
})

# Combine once at the end instead of growing newdf with rbind inside the
# loop (repeated rbind is quadratic in the number of windows)
newdf <- do.call(rbind, rows)
head(newdf)

Time Series based Forecasting for Daily Data but Seasonality is Quarterly - in R

I have demand for a product on daily bases for last 4 years. This demand has quarterly seasonal patterns, as shown in following image
I would like to do time series based forecasting on this data. Following is my code
# frequency = 90 tells ts() that one full seasonal cycle spans 90
# observations (~one quarter of daily data), so stl()/decompose() will
# estimate 90 within-cycle seasonal factors rather than 4 quarterly ones
myts = ts(forecastsku1$Value,frequency=90)
# s.window = "period" forces the seasonal component to be periodic
fit <- stl(myts, s.window="period")
plot(fit)
fit <- decompose(myts)
plot(fit)
Here, instead of 4 seasonal factors, ts is creating 90 seasonal factors, which is not what I want. I want to apply the same seasonality to each 3-month duration and then do the forecasting.
Data for reference
dput(head(forecastsku1,100))
structure(list(date = structure(c(14625, 14626, 14627, 14628, 14629, 14630, 14631, 14632, 14633, 14634, 14635, 14636, 14637,
14638, 14639, 14640, 14641, 14642, 14643, 14644, 14645, 14646, 14647, 14648, 14649, 14650, 14651, 14652, 14653, 14654, 14655,
14656, 14657, 14658, 14659, 14660, 14661, 14662, 14663, 14664, 14665, 14666, 14667, 14668, 14669, 14670, 14671, 14672, 14673,
14674, 14675, 14676, 14677, 14678, 14679, 14680, 14681, 14682, 14683, 14684, 14685, 14686, 14687, 14688, 14689, 14690, 14691,
14692, 14693, 14694, 14695, 14696, 14697, 14698, 14699, 14700, 14701, 14702, 14703, 14704, 14705, 14706, 14707, 14708, 14709,
14710, 14711, 14712, 14713, 14714, 14715, 14716, 14717, 14718, 14719, 14720, 14721, 14722, 14723, 14724), class = "Date"),
Value = c(1407, 1413, 1407, 1406, 1401, 1410, 1411, 1416, 1404, 1409, 1414, 1414, 1400, 1421, 1398, 1404, 1397, 1404, 1407, 1409, 1406, 1395, 1397,
1403, 1412, 1399, 1409, 1393, 1405, 1403, 1406, 1402, 1405, 1386, 1393, 1405, 1397, 1393, 1402, 1402, 1393, 1391, 1410, 1402, 1408,
1394, 1404, 1398, 1406, 1389, 1401, 1391, 1394, 1384, 1377, 1390, 1395, 1399, 1384, 1397, 1398, 1384, 1377, 1394, 1398, 1394, 1391,
1403, 1382, 1390, 1385, 1403, 1390, 1388, 1391, 1384, 1392, 1390, 1381, 1387, 1395, 1390, 1388, 1384, 1387, 1395, 1380, 1378, 1383,
1384, 1232, 1247, 1232, 1248, 1236, 1236, 1231, 1237, 1224, 1236)),
.Names = c("date", "Value"), row.names = 13150:13249, class = "data.frame")
Can anyone help me in this case? Please let me know if more data required.
# frequency = 4 yields one seasonal factor per position in a 4-observation
# cycle. NOTE(review): with daily observations this treats every 4
# consecutive days as one cycle — the series presumably should be
# aggregated to quarterly values first for this to mean "quarterly
# seasonality"; confirm against the data preparation step.
myts = ts(forecastsku1$Value,frequency=4)
fit <- decompose(myts)
plot(fit)
Result would be:
It is creating 90 seasonal factors because your frequency is 90 in the ts definition. What you need to do is specify a start and end in the ts and set the period to 4 so that the observations are segregated the way you want them to be. If you can successfully create 4 seasonal factors, you can obviously predict quarterly (4*3=12). So instead of these dates I think it is clearer to use something like start=c(2005,1). Hopefully this is useful.
this is an old question, but still, maybe my answer is of some value.
You can seasonally adjust daily data using the dsa package (disclaimer: I'm the author).
I tried to replicate your time series (or something similar) to give you an idea of how to seasonally adjust them (the setting of the seasonal adjustment try to help modelling the jumping behaviour of the time series appropriately):
# Load required packages
library(dsa)
library(xts)

# Recreate data resembling the question's series: a slow downward trend
# plus Gaussian noise, one observation per day over four years
set.seed(23)
n_obs <- 365.25 * 4
vals <- seq(1250, 1000, length.out = n_obs) + rnorm(n_obs, 0, 5)
dates <- seq(as.Date("2008-01-01"), by = "days", length.out = n_obs)
x <- xts(vals, dates)

# Month number of each observation; shift Jan-Mar and Jul-Sep up by 200
# to mimic the quarterly jumping pattern from the question
month_ind <- as.numeric(format(zoo::index(x), "%m"))
jump_mask <- month_ind %in% c(1, 2, 3, 7, 8, 9)
x[jump_mask] <- x[jump_mask] + 200

# Seasonally adjust the data (settings chosen to help the adjustment
# model the jumping behaviour of the series appropriately)
result <- dsa(x, fourier_number = 40, reiterate3 = 4, reg.create = NULL, cval = 30)
sa <- result$output[, 1]
xtsplot(result$output[, c(2, 1)], names = c("original", "seasonally adjusted"))
output(result)  # writes an HTML report to the working directory

Resources