Loess smooth extracted values by group errors - r

I'm trying to extract the values for a dataset smoothed by group (like how I would get them with geom_smooth(), but I need to be able to plot the actual fitted values). I tried all the solutions presented in this thread loess regression on each group with dplyr::group_by() but I keep getting the following errors for all of them
From the 'neat tidyverse way' answer:
library(dplyr)
library(tidyr)
library(purrr)
models <- test %>%
tidyr::nest(-id) %>%
dplyr::mutate(
# Perform loess calculation on each CpG group
m = purrr::map(data, loess,
formula = y ~ x, span = .5),
# Retrieve the fitted values from each model
fitted = purrr::map(m, `[[`, "fitted")
)
Warning message:
All elements of `...` must be named.
Did you want `data = c(x, y)`?
From the accepted answer:
test <- test %>%
group_by(id) %>%
arrange(id, x) %>%
mutate(Loess = predict(loess(y ~ x, span = .5, data=.),
data.frame(x = seq(min(x), max(x), 1))))
Error: Problem with `mutate()` input `Loess`.
x Input `Loess` can't be recycled to size 60.
ℹ Input `Loess` is `predict(...)`.
ℹ Input `Loess` must be size 60 or 1, not 3.
ℹ The error occurred in group 2: id = 2.
From the comments
test2 <- test %>% nest(-id) %>%
mutate(fit = map(data, ~ loess(y ~ x, .))) %>%
unnest(map2(fit, data, augment))
Error: object 'fit' not found
Run `rlang::last_error()` to see where the error occurred.
In addition: Warning message:
All elements of `...` must be named.
Did you want `data = c(x, y)`?
I'm not very experienced with the tidyverse, so I'm not completely sure what I'm doing differently compared to those examples. Any help is greatly appreciated.
#data
test<-structure(list(x = c(-8.09566199976713, -8.06147618447585, -8.00975809213924,
-7.96200959687978, -7.91437069721383, -7.89794310600054, -7.90270753398918,
-7.90758707202371, -7.91285423074951, -7.92478106752289, -7.93691825045689,
-8.01874278066444, -7.963527427585, -7.93407938329611, -7.94323616248026,
-7.94085332876942, -7.93203791319322, -7.94357702787401, -7.91377080575536,
-7.88252433889846, -7.85499829071686, -7.86566601358475, -7.91275434062848,
-7.96059415201768, -8.00264819208551, -8.02577207019623, -7.95146178490521,
-7.86605403953235, -7.78476412041177, -7.88927033537369, -7.8876796916652,
-7.95187190641719, -7.98401088593829, -8.00012692860024, -8.01433523159787,
-8.00807092407049, -8.00178416213873, -8.00011043534128, -8.02862971973644,
-8.14101616921556, -8.05197547777147, -8.06579012466988, -8.07977339594891,
-9.91048749031936, -9.66560592747977, -10.6156432290275, -10.8015737085674,
-11.1501986169795, -11.7159178445694, -11.852600007071, -11.9500158919174,
-11.8099796968354, -11.5041369336859, -11.2139686794437, -10.9382871968791,
-10.6564566029614, -9.79377558892992, -9.09630034541423, -10.9883787360288,
-11.2269861445731, -11.4824854018762, -11.6079686487163, -10.428108852748,
-9.6062104865303, -9.46949542587671, -9.85124539788883, -10.276205029234,
-10.4776426742835, -10.6400079178346, -10.4147379554317, -10.2038339860429,
-10.0934609736433, -10.0824486553681, -9.96530249613719, -9.94853909056431,
-9.93151185565445, -9.91417478382807, -9.90180880165919, -9.91333691008172,
-10.0373498803589, -10.1768025338422, -10.320572551138, -10.4688466820861,
-10.6218220581835, -10.6343234090347, -10.3264187862842, -9.98775786739598,
-9.58260323586509, -9.42043071710701, -9.43924036934577, -9.52799164804214,
-9.62866487252844, -9.72887391315455, -9.81243902432872, -9.70821477146116,
-10.349558979838, -10.1458405548694, -9.99415308822563, -9.84536020757233,
-9.79599547035806, -9.75295135996101, -9.79904377903862, -9.25340041876441,
-15.3471460066526, -15.8962098900031, -16.4807765780841, -16.3237393813877,
-17.3021769512891, -16.3800352448021, -15.8070127149332, -15.3891912546897,
-14.9758961247955, -14.3049193201036, -13.6951530131572, -13.6426403073008,
-13.7192755520964, -13.317953401665, -12.9213325990039, -12.4594681682978,
-12.0191583645286, -11.8053727831142, -11.6872352172752, -11.5737935404884,
-11.4107390421452, -11.1892161252634, -10.993043829208, -10.9330292789471,
-10.8749587447015, -10.839113401307, -10.812108561392, -10.9713463919414,
-11.1680590680373, -11.2539207698049, -11.3405390697112, -11.4280103071663,
-11.5164358909023, -11.6059223744961, -11.6965817015189, -11.6020622433579,
-11.5436710566386, -11.5081631613201, -11.6534368262729, -11.808341217397,
-11.9736843692008, -12.1503511448775), y = c(0.657500808303136,
0.738124328538192, 0.846237100479524, 0.905780104908876, 0.964065611141255,
0.97130448683885, 0.961952809454258, 0.952520035393847, 0.942792088995166,
0.927852423620437, 0.912757550459065, 0.759400682943845, 0.816109383509902,
0.87020033850233, 0.915815641213451, 0.967873386043728, 1.02174887297134,
1.05888614752619, 1.09417181629346, 1.12927656453054, 1.15845597740891,
1.14281223660865, 1.09951983390484, 1.05556252495082, 1.02150454110485,
1.00407662866727, 1.0383078668771, 1.07743614470557, 1.1142850320069,
0.849172703363773, 0.928570803393031, 1.03718890041412, 1.04033830574734,
1.03347118859473, 1.02808408212257, 1.03805023835948, 1.04800826979005,
1.05065203275594, 1.00517755278862, 0.816786352254452, 0.987099861643988,
0.986294681427012, 0.985411697682411, 6.30490194156339, 6.23591979381374,
6.54324631274467, 6.69393500338639, 6.89041829514468, 7.17507154906183,
7.18466565037316, 7.17069075506272, 7.09749953887751, 6.99963625007418,
6.90682321485407, 6.818675464992, 6.72319228655747, 6.35779144412249,
6.07675755691038, 6.71116104273407, 6.84049060491686, 6.97924607466303,
7.05864889503111, 6.59454969605345, 6.27016677264436, 6.19033669905758,
6.37773429443436, 6.58723774958637, 6.68536624180541, 6.76713349215161,
6.63576249672338, 6.51308895413708, 6.43262161144123, 6.42705851495013,
6.37935821335848, 6.37095186987229, 6.36245251810731, 6.35384217814464,
6.34823344504427, 6.3568189796202, 6.40093531258066, 6.45018168045593,
6.50106303671575, 6.55364891437324, 6.60801264620102, 6.61714449186606,
6.55356637339973, 6.47759629080218, 6.34852444038794, 6.26379776807968,
6.23209002392533, 6.22130259164532, 6.21289851462945, 6.20309983882547,
6.18744917387348, 6.10894055296196, 6.25148917581075, 6.17293529625129,
6.15075529839679, 6.12848960212141, 6.1326318372733, 6.13833262010545,
6.16395466665727, 6.05567879845981, 6.25186104520219, 6.44281768293323,
6.65324901403219, 6.72950091493087, 7.03322640156021, 6.88309899890625,
6.70207702789022, 6.54079770390583, 6.38549695194146, 6.18839711001894,
6.01102800783953, 5.87133691513539, 5.77306022102593, 5.67740669749789,
5.58788061886448, 5.50325665174288, 5.42363100637011, 5.35172803418452,
5.28384694735869, 5.21822380115916, 5.14919582516762, 5.07712142295026,
5.0106828132089, 4.95786444890208, 4.90608940527112, 4.85658988565748,
4.80812641990368, 4.77955320604372, 4.75486746882052, 4.73784287455848,
4.72038983462283, 4.70251299307059, 4.68421637386314, 4.66550330220186,
4.64637629394034, 4.64370313837652, 4.65439969630172, 4.67177327419859,
4.74790550644152, 4.82753161090928, 4.91100496765981, 4.9987159528997
), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L)), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L,
48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L,
61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L, 73L,
74L, 75L, 76L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L,
87L, 88L, 89L, 90L, 91L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 99L,
100L, 101L, 102L, 103L, 131L, 132L, 133L, 134L, 135L, 136L, 137L,
138L, 139L, 140L, 141L, 142L, 143L, 144L, 145L, 146L, 147L, 148L,
149L, 150L, 151L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L,
160L, 161L, 162L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L,
171L, 172L), class = "data.frame")

First group by id, then fit a model to each group's data and use broom::augment to extract the fitted values and, as a bonus, the residuals. This preserves the inputs x and y; it will be easier to plot x vs .fitted for example.
library("tidyverse")
test %>%
group_by(id) %>%
group_modify(
# .x refers to the subset of rows that belong to a group.
# It's a smaller data frame with the same columns as the input
# but fewer rows.
~ loess(y ~ x, span = .5, data = .x) %>% broom::augment()
)
#> # A tibble: 145 × 5
#> # Groups: id [3]
#> id y x .fitted .resid
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.658 -8.10 0.834 -0.176
#> 2 1 0.738 -8.06 0.897 -0.159
#> 3 1 0.846 -8.01 0.989 -0.142
#> 4 1 0.906 -7.96 0.975 -0.0688
#> 5 1 0.964 -7.91 0.982 -0.0180
#> 6 1 0.971 -7.90 0.983 -0.0121
#> 7 1 0.962 -7.90 0.980 -0.0179
#> 8 1 0.953 -7.91 0.981 -0.0289
#> 9 1 0.943 -7.91 0.983 -0.0404
#> 10 1 0.928 -7.92 0.977 -0.0487
#> # … with 135 more rows
The original data frame might have more columns than x and y that are not used in the loess fit. To keep those columns, pass the group data .x to the augment function as well.
test %>%
mutate(
# Extra columns that we don't need for the loess fit but we want to keep.
z = rnorm(n()),
w = rnorm(n())
) %>%
group_by(id) %>%
group_modify(
# Now `broom::augment` appends .fitted and .resid to the original columns.
~ loess(y ~ x, span = .5, data = .x) %>% broom::augment(.x)
)
#> # A tibble: 145 × 7
#> # Groups: id [3]
#> id x y z w .fitted .resid
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 -8.10 0.658 1.31 -0.199 0.834 -0.176
#> 2 1 -8.06 0.738 0.395 -1.74 0.897 -0.159
#> 3 1 -8.01 0.846 -1.34 0.382 0.989 -0.142
#> 4 1 -7.96 0.906 0.376 0.231 0.975 -0.0688
#> 5 1 -7.91 0.964 -0.224 0.327 0.982 -0.0180
#> 6 1 -7.90 0.971 0.678 -0.504 0.983 -0.0121
#> 7 1 -7.90 0.962 0.736 -0.0186 0.980 -0.0179
#> 8 1 -7.91 0.953 0.368 0.0313 0.981 -0.0289
#> 9 1 -7.91 0.943 -1.35 1.02 0.983 -0.0404
#> 10 1 -7.92 0.928 0.280 -0.471 0.977 -0.0487
#> # … with 135 more rows
Created on 2022-03-08 by the reprex package (v2.0.1)

The input data in predict is going to be of different length than the original data I would suggest to save the output in list and then use unnest to get one long dataframe.
library(dplyr)
library(tidyr)
test %>%
arrange(id, x) %>%
group_by(id) %>%
summarise(Loess = list(predict(loess(y ~ x, span = .5, data=cur_data()),
data.frame(x = seq(min(x), max(x), 1))))) %>%
unnest(Loess)
# id Loess
# <int> <dbl>
# 1 1 0.784
# 2 2 7.20
# 3 2 6.78
# 4 2 6.34
# 5 3 7.01
# 6 3 6.73
# 7 3 6.43
# 8 3 6.11
# 9 3 5.73
#10 3 5.31
#11 3 4.82

Related

How to calculate Cohen's D across 50 points in R

I have the following DF:
structure(list(AgeGroup = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Young",
"Old"), class = "factor"), variable = structure(c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L,
30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L), .Label = c("Point.1", "Point.2", "Point.3", "Point.4",
"Point.5", "Point.6", "Point.7", "Point.8", "Point.9", "Point.10",
"Point.11", "Point.12", "Point.13", "Point.14", "Point.15", "Point.16",
"Point.17", "Point.18", "Point.19", "Point.20", "Point.21", "Point.22",
"Point.23", "Point.24", "Point.25", "Point.26", "Point.27", "Point.28",
"Point.29", "Point.30", "Point.31", "Point.32", "Point.33", "Point.34",
"Point.35", "Point.36", "Point.37", "Point.38", "Point.39", "Point.40",
"Point.41", "Point.42", "Point.43", "Point.44", "Point.45", "Point.46",
"Point.47", "Point.48", "Point.49", "Point.50"), class = "factor"),
value = c(0.714518666666667, 0.723876630952381, 0.728961368421053,
0.735228897233202, 0.701283807017544, 0.71396457312253, 0.663229964912281,
0.68923661660079, 0.613014666666667, 0.652671079051383, 0.547104,
0.602951166007905, 0.504106245614035, 0.558832648221344,
0.487034052631579, 0.515752438735178, 0.451825245614035,
0.476300007905138, 0.442370175438596, 0.441173656126482,
0.438668315789474, 0.435859173913043, 0.450059526315789,
0.434047494071146, 0.478947649122807, 0.450561841897233,
0.481134438596491, 0.461228027667984, 0.446763543859649,
0.451031316205534, 0.396206754385965, 0.406836889328063,
0.357049368421053, 0.368716249011858, 0.343943631578947,
0.368048932806324, 0.376060403508772, 0.398834193675889,
0.413613877192982, 0.434683889328063, 0.434964894736842,
0.448746023715415, 0.451208631578947, 0.450663276679842,
0.470569192982456, 0.473143399209486, 0.515300736842105,
0.502499193675889, 0.543379719298246, 0.507495533596838,
0.550050701754386, 0.498506288537549, 0.541725807017544,
0.482379664031621, 0.517293315789474, 0.458068636363636,
0.485205245614035, 0.423109671936759, 0.438844403508772,
0.385925747035573, 0.39522349122807, 0.362403612648221, 0.374209192982456,
0.350889750988142, 0.354036315789474, 0.336213118577075,
0.340668122807018, 0.327800648221344, 0.326388666666667,
0.322577146245059, 0.328114842105263, 0.319440624505929,
0.342721666666667, 0.323974818181818, 0.357620473684211,
0.335501339920949, 0.372856842105263, 0.343831292490119,
0.377362315789474, 0.361571442687747, 0.393890736842105,
0.377489727272727, 0.419330684210526, 0.38274228458498, 0.419797666666667,
0.387899881422925, 0.423127684210526, 0.385955055335968,
0.42140750877193, 0.377730351778656, 0.403711631578947, 0.366319122529644,
0.390753140350877, 0.355189754940711, 0.373226596491228,
0.347452173913044, 0.348689877192982, 0.340376324110672,
0.329466947368421, 0.344867375494071)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
which a subset looks like:
A tibble: 100 x 3
AgeGroup variable value
<fct> <fct> <dbl>
1 Young Point.1 0.715
2 Old Point.1 0.724
3 Young Point.2 0.729
4 Old Point.2 0.735
5 Young Point.3 0.701
6 Old Point.3 0.714
7 Young Point.4 0.663
8 Old Point.4 0.689
9 Young Point.5 0.613
10 Old Point.5 0.653
I have an output using:
Cho_D <- DF %>%
rstatix::cohens_d(value ~ variable, var.equal = TRUE)
But this provides me with a lot of unnecessary calculations like Point.1 and Point.3, Point.1 and Point.4, etc.
I would like to calculate Cohen's D for each successive points. So for example:
Point.1:Point.2, Point.2:Point.3, etc. The end goal is to plot D values on the Y-axis and Points 1 through 50 on the X-axis.
In base R you can accomplish the same using embed + apply:
L <- split(DF$value, DF$variable)
mat <- embed(names(L), 2)[,2:1]
res <- apply(mat, 1, function(x) rstatix::cohens_d(stack(L[x]), values~ind))
do.call(rbind, res)
A tibble: 49 x 7
.y. group1 group2 effsize n1 n2 magnitude
<chr> <chr> <chr> <dbl> <int> <int> <ord>
1 values Point.1 Point.2 -2.29 2 2 large
2 values Point.2 Point.3 3.46 2 2 large
3 values Point.3 Point.4 2.17 2 2 large
4 values Point.4 Point.5 1.83 2 2 large
5 values Point.5 Point.6 1.69 2 2 large
6 values Point.6 Point.7 1.11 2 2 large
7 values Point.7 Point.8 0.973 2 2 large
8 values Point.8 Point.9 1.98 2 2 large
9 values Point.9 Point.10 1.82 2 2 large
10 values Point.10 Point.11 2.95 2 2 large
# ... with 39 more rows
if you can use the effsize::cohen.d then this function will be faster than all the options given so far:
my_cohen <- function(data){
L <- split(data$value, data$variable)
mat <- embed(names(L), 2)
res <- apply(mat, 1, function(x)
effsize::cohen.d(L[[x[2]]], L[[x[1]]])$estimate)
data.frame(mat, res)
}
my_cohen(DF)
X1 X2 res
1 Point.2 Point.1 -2.29025540
2 Point.3 Point.2 3.45998958
3 Point.4 Point.3 2.16986489
4 Point.5 Point.4 1.82991671
5 Point.6 Point.5 1.68816593
6 Point.7 Point.6 1.11414226
We could nest the 'value', get the lead of the list column, and apply cohen.d by looping over the two list
library(dplyr)
library(effsize)
library(purrr)
out <- DF %>%
select(-AgeGroup) %>%
nest(data = value) %>%
mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
slice(-n()) %>%
mutate(cohen_d = map2_dbl(data, data_lead,
~ cohen.d(.x$value, .y$value)$estimate))
-output
head(out, 2)
# A tibble: 2 x 5
variable data variable_lead data_lead cohen_d
<fct> <list> <fct> <list> <dbl>
1 Point.1 <tibble [2 × 1]> Point.2 <tibble [2 × 1]> -2.29
2 Point.2 <tibble [2 × 1]> Point.3 <tibble [2 × 1]> 3.46
comparing with OP's filtered output
Cho_D %>%
slice(seq(1, n(), by = 49)) %>%
slice_head(n = 2)
# A tibble: 2 x 7
# .y. group1 group2 effsize n1 n2 magnitude
# <chr> <chr> <chr> <dbl> <int> <int> <ord>
#1 value Point.1 Point.2 -2.29 2 2 large
#2 value Point.2 Point.3 3.46 2 2 large
Benchmarks
With the number of comparisons reduced, the timings are below
system.time({Cho_D <- DF %>%
rstatix::cohens_d(value ~ variable, var.equal = TRUE)
})
# user system elapsed
# 16.316 0.060 16.330
system.time({out <- DF %>%
select(-AgeGroup) %>%
nest(data = value) %>%
mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
slice(-n()) %>%
mutate(cohen_d = map2_dbl(data, data_lead,
~ cohen.d(.x$value, .y$value)$estimate))})
# user system elapsed
# 0.031 0.005 0.037

Error in if (start > end) stop("'start' cannot be after 'end'") : missing value where TRUE/FALSE needed in R when try perform minutes forecast

I want perform minutes forecast for variable sales
timeseries=structure(list(Data = structure(c(10L, 14L, 18L, 22L, 26L, 29L,
32L, 35L, 38L, 1L, 4L, 7L, 11L, 15L, 19L, 23L, 27L, 30L, 33L,
36L, 39L, 2L, 5L, 8L, 12L, 16L, 20L, 24L, 28L, 31L, 34L, 37L,
40L, 3L, 6L, 9L, 13L, 17L, 21L, 25L), .Label = c("01.01.2018",
"01.01.2019", "01.01.2020", "01.02.2018", "01.02.2019", "01.02.2020",
"01.03.2018", "01.03.2019", "01.03.2020", "01.04.2017", "01.04.2018",
"01.04.2019", "01.04.2020", "01.05.2017", "01.05.2018", "01.05.2019",
"01.05.2020", "01.06.2017", "01.06.2018", "01.06.2019", "01.06.2020",
"01.07.2017", "01.07.2018", "01.07.2019", "01.07.2020", "01.08.2017",
"01.08.2018", "01.08.2019", "01.09.2017", "01.09.2018", "01.09.2019",
"01.10.2017", "01.10.2018", "01.10.2019", "01.11.2017", "01.11.2018",
"01.11.2019", "01.12.2017", "01.12.2018", "01.12.2019"), class = "factor"),
client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Horns", "Kornev"), class = "factor"), stuff = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("chickens",
"hooves", "Oysters"), class = "factor"), Sales = c(374L,
12L, 120L, 242L, 227L, 268L, 280L, 419L, 12L, 172L, 336L,
117L, 108L, 150L, 90L, 117L, 116L, 146L, 120L, 211L, 213L,
67L, 146L, 118L, 152L, 122L, 201L, 497L, 522L, 65L, 268L,
441L, 247L, 348L, 445L, 477L, 62L, 226L, 476L, 306L)), .Names = c("Data",
"client", "stuff", "Sales"), class = "data.frame", row.names = c(NA,
-40L))
Update input data. Now they are write
So i run my script and i get the error
Error in if (start > end) stop("'start' cannot be after 'end'") :
missing value where TRUE/FALSE needed
# Perform forecast
library("lubridate")
# first the grouping variable
timeseries$group <- paste0(timeseries$client,timeseries$stuff)
# determine all groups
groups <- unique(timeseries$group)
# find starting date per group and save them as a list of elements c('YEAR','Month')
timeseries$date <- as.Date(as.character(timeseries$Data), '%dd.%mm.%YY.hh:mm:ss')
timeseries <- timeseries[order(timeseries$date),]
start_dates <- format(timeseries$date[match(groups, timeseries$group)], "%Y %m %d hh:mm")
start_dates <- strsplit(start_dates, ' ')
# Back to your code
# now the list
listed <- split(timeseries,timeseries$group)
str(listed)
# Edited the lapply funcion in order to consider the starting dates
# to have a smaller output, I post the str(listed)
library("forecast")
library("lubridate")
listed_ts <- lapply(seq_along(listed),
function(k) ts(listed[[k]][["Sales"]], start = as.integer(start_dates[[k]])) )
listed_ts
listed_arima <- lapply(listed_ts,function(x) auto.arima(x,allowmean = F ))
#Now the forecast for each arima:
listed_forecast <- lapply(listed_arima,function(x) forecast(x,1440) ) # forecast on 1440 minutes ahead
listed_forecast
#If you need to flat it down to a data.frame, do.call and rbind help:
do.call(rbind,listed_forecast)
# get fprecast of initial value
lapply(listed_arima, fitted)
What does mean this error?
Most likely I’m doing something wrong, indicating the time format.
How for each group to get a per-minute forecast for 1440 minutes?
I.E. expected format
Data client stuff Sales
10.11.2017 5:15 Horns chickens 336
10.11.2017 5:16 Horns chickens 336
10.11.2017 5:17 Horns chickens 336
10.11.2017 5:18 Horns chickens 336
max, not sure if the ARIMA results are good here but hopefully this will give you enough info to get unstuck as it shows how to apply a model to groups:
library(dplyr)
library(magrittr)
library(data.table)
temp <- rbindlist(listed, fill=TRUE)
temp %>% group_by(group) %>% do(data.frame(v_hat = forecast(auto.arima(.$Sales), h=100)))
# A tibble: 300 x 6
# Groups: group [3]
group v_hat.Point.Forecast v_hat.Lo.80 v_hat.Hi.80 v_hat.Lo.95 v_hat.Hi.95
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Hornschickens 224. 51.0 397. -40.5 488.
2 Hornschickens 224. 51.0 397. -40.5 488.
3 Hornschickens 224. 51.0 397. -40.5 488.
4 Hornschickens 224. 51.0 397. -40.5 488.
5 Hornschickens 224. 51.0 397. -40.5 488.
6 Hornschickens 224. 51.0 397. -40.5 488.
7 Hornschickens 224. 51.0 397. -40.5 488.
8 Hornschickens 224. 51.0 397. -40.5 488.
9 Hornschickens 224. 51.0 397. -40.5 488.
10 Hornschickens 224. 51.0 397. -40.5 488.
....

Calculate mean for column grouped by values of two other columns [duplicate]

This question already has answers here:
How to group by two columns in R
(4 answers)
Closed 4 years ago.
I have a dataframe with 5 columns. I know how to calculate the mean for one column grouped by another column. However, i need to group it by two columns. For example, I want to calculate the mean for column 5 grouped by column 1 and column 2.
df <- structure(list(Country = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L), .Label = c("AT", "CH", "DE"), class = "factor"),
Occupation = c(1L, 3L, 5L, 3L, 1L, 2L, 5L, 3L, 5L, 3L, 1L,
2L, 1L, 5L, 3L, 3L, 1L, 3L, 2L, 5L, 5L, 1L, 2L, 1L, 3L),
Age = c(20L, 46L, 30L, 12L, 73L, 53L, 19L, 43L, 65L, 53L,
19L, 34L, 76L, 25L, 45L, 39L, 18L, 59L, 37L, 24L, 19L, 60L,
51L, 32L, 29L), Gender = structure(c(1L, 1L, 2L, 2L, 2L,
1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L), .Label = c("female", "male"), class = "factor"),
Income = c(100L, 80L, 78L, 29L, 156L, 56L, 95L, 104L, 87L,
56L, 203L, 45L, 112L, 78L, 56L, 140L, 99L, 67L, 89L, 109L,
43L, 145L, 30L, 101L, 77L)), class = "data.frame", row.names = c(NA,
-25L))
head(df)
Country Occupation Age Gender Income
1 AT 1 20 female 100
2 AT 3 46 female 80
3 AT 5 30 male 78
4 AT 3 12 male 29
5 AT 1 73 male 156
6 AT 2 53 female 56
So what I want to to is calculate the mean for column ‘income’, grouped by country and occupation. E.g., I want to calculate the mean of ‘income’ for all those people living in country ‘AT’ with occupation ‘3’, the mean of ‘income’ for all those living in country ‘CH’ with occupation ‘1’ and so on.
(1) base method (aggregate)
mean.df <- aggregate(Income ~ Country + Occupation, df, mean)
names(mean.df)[3] <- "Income_Mean"
merge(df, mean.df)
(2) base method (tapply)
mean.df1 <- tapply(df$Income, list(df$Country, df$Occupation), mean)
mean.df2 <- as.data.frame(as.table(mean.df1))
names(mean.df2) <- c("Country", "Occupation", "Income_Mean")
merge(df, mean.df2)
(3) stats method (ave)
df2 <- df
df2$Income_Mean <- ave(df$Income, df$Country, df$Occupation)
(4) dplyr method
df %>% group_by(Country, Occupation) %>%
mutate(Income_Mean = mean(Income))
Output :
Country Occupation Age Gender Income Income_Mean
<fct> <int> <int> <fct> <int> <dbl>
1 AT 1 20 female 100 128
2 AT 3 46 female 80 71
3 AT 5 30 male 78 86.5
4 AT 3 12 male 29 71
5 AT 1 73 male 156 128
6 AT 2 53 female 56 56
7 AT 5 19 male 95 86.5
8 AT 3 43 male 104 71
9 CH 5 65 male 87 82.5
10 CH 3 53 female 56 84
# ... with 15 more rows
Using sqldf:
sqldf("select Country,Occupation,Age,Gender,avg(Income) from df group by Country,Occupation")
OR
Using data.table:
library(data.table)
df=data.table(df)
df[, mean(Income), by = list(Country,Occupation)]
Output:
Country Occupation Age Gender avg(Income)
1 AT 1 73 male 128.0
2 AT 2 53 female 56.0
3 AT 3 43 male 71.0
4 AT 5 19 male 86.5
5 CH 1 18 female 138.0
6 CH 2 34 male 45.0
7 CH 3 39 male 84.0
8 CH 5 25 female 82.5
9 DE 1 32 female 123.0
10 DE 2 51 female 59.5
11 DE 3 29 male 72.0
12 DE 5 19 male 76.0

factor season has new levels 4 , when performing Arima by group in R

Here example of my dataset
ts=structure(list(Data = structure(c(10L, 14L, 18L, 22L, 26L, 29L,
32L, 35L, 38L, 1L, 4L, 7L, 11L, 15L, 19L, 23L, 27L, 30L, 33L,
36L, 39L, 2L, 5L, 8L, 12L, 16L, 20L, 24L, 28L, 31L, 34L, 37L,
40L, 3L, 6L, 9L, 13L, 17L, 21L, 25L), .Label = c("01.01.2018",
"01.01.2019", "01.01.2020", "01.02.2018", "01.02.2019", "01.02.2020",
"01.03.2018", "01.03.2019", "01.03.2020", "01.04.2017", "01.04.2018",
"01.04.2019", "01.04.2020", "01.05.2017", "01.05.2018", "01.05.2019",
"01.05.2020", "01.06.2017", "01.06.2018", "01.06.2019", "01.06.2020",
"01.07.2017", "01.07.2018", "01.07.2019", "01.07.2020", "01.08.2017",
"01.08.2018", "01.08.2019", "01.09.2017", "01.09.2018", "01.09.2019",
"01.10.2017", "01.10.2018", "01.10.2019", "01.11.2017", "01.11.2018",
"01.11.2019", "01.12.2017", "01.12.2018", "01.12.2019"), class = "factor"),
client = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Horns", "Kornev"), class = "factor"), stuff = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("chickens",
"hooves", "Oysters"), class = "factor"), Sales = c(374L,
12L, 120L, 242L, 227L, 268L, 280L, 419L, 12L, 172L, 336L,
117L, 108L, 150L, 90L, 117L, 116L, 146L, 120L, 211L, 213L,
67L, 146L, 118L, 152L, 122L, 201L, 497L, 522L, 65L, 268L,
441L, 247L, 348L, 445L, 477L, 62L, 226L, 476L, 306L)), .Names = c("Data",
"client", "stuff", "Sales"), class = "data.frame", row.names = c(NA,
-40L))
I want perform time series using Arima models by group
#if using dummy
fun_tslm <- function(x, start = "2017-01-04", freq = 12){
tsw <- ts(x[["Sales"]], start = decimal_date(as.Date(start)), frequency = freq)
#View(tsw)
mytslm <- tslm(tsw ~ trend + season)
mytslm
}
fun_forecast <- function(x, h = 14){
residarima1 <- auto.arima(x[["residuals"]])
residualsArimaForecast <- forecast(residarima1, h = h)
residualsF <- as.numeric(residualsArimaForecast$mean)
regressionForecast <- forecast(x, h = h)
regressionF <- as.numeric(regressionForecast$mean)
forecastR <- regressionF + residualsF
forecastR
}
tslm_list <- lapply(group_list, fun_tslm)
fore_list <- lapply(tslm_list, fun_forecast)
When I run this script
I got the error
Error in model.frame.default(Terms, newdata, na.action = na.action,
xlev = object$xlevels) : factor season has new levels 4
But indeed I want to get output with Arima metrics where I can see
1.forecast initial value
2.forecast for 14 monthes with CI
outputs for initial values and forecasted values should be in two different data.frame.
How to do it?
Some parts are not too much clear in your script and your data, so I can try to give you a partial answer, to see how to get the result you want:
# I called your dataset in this way, because ts is a function
timeseries
Now, the idea is to convert to a list your data frame, each component of the list is a group, that is a time series. I imagined that each group is client + stuff, but we can manage it in different way:
# first the grouping variable
timeseries$group <- paste0(timeseries$client,timeseries$stuff)
# EDIT here you convert the Data class as class(date)
library(lubridate)
timeseries$Data <- dmy(timeseries$Data)
# now the list
listed <- split(timeseries,timeseries$group)
Now we have to define each component of the list as a time series, using lapply and ts function:
# I do not understand why all your ts start with "2017-01-04", when in the example they are not (probably because it's an example)
# EDIT: convert the start date
listed_ts <- lapply(listed,
function(x) ts(x[["Sales"]], start = ymd("2017-01-04"), frequency = 12) )
listed_ts
$`Hornschickens`
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
17170 374 12 120 242 227 268 280 419 12 172 336
$Hornshooves
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
17170 497 522 65 268 441 247 348 445 477 62 226 476
17171 306
$KornevOysters
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
17170 117 108 150 90 117 116 146 120 211 213 67 146
17171 118 152 122 201
The next step is to auto.arima each time series, with the lapply logic:
library(forecast)
listed_arima <- lapply(listed_ts,function(x) auto.arima(x) )
# partial result
> listed_arima
$`Hornschickens`
Series: x
ARIMA(0,0,0) with non-zero mean
Coefficients:
mean
223.8182
s.e. 38.7707
sigma^2 estimated as 18188: log likelihood=-69.03
AIC=142.06 AICc=143.56 BIC=142.86
...
Now the forecast for each arima:
listed_forecast <- lapply(listed_arima,function(x) forecast(x,1) )
If you need to flat it down to a data.frame, do.call and rbind help:
do.call(rbind,listed_forecast)
method model level mean lower upper x series fitted residuals
Hornschickens "ARIMA(0,0,0) with non-zero mean" List,18 Numeric,2 223.8182 Numeric,2 Numeric,2 Integer,11 "x" Numeric,11 Numeric,11
Hornshooves "ARIMA(0,0,0) with non-zero mean" List,18 Numeric,2 336.9231 Numeric,2 Numeric,2 Integer,13 "x" Numeric,13 Numeric,13
KornevOysters "ARIMA(0,0,0) with non-zero mean" List,18 Numeric,2 137.125 Numeric,2 Numeric,2 Integer,16 "x" Numeric,16 Numeric,16
I think you can twist it a bit more to have a better result. And if you are wondering why for this example, if you put more than 1 in the auto.arima function to predict, but the result is a constant, the answer is here, also pointed out by the method column on the output.

replacement missing values by groups in R

How can i replace the missing values for each group separately?
The reproducible example:
mydata=structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), group.2 = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L), x1 = c(20L, 4L, 91L, NA, 94L, 69L, 38L,
NA, 29L, 69L, 55L, 86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA,
49L, 90L), x2 = c(44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L,
NA, 54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L)), class = "data.frame", row.names = c(NA,
-22L))
Now i found how to replace the missing values without groups.
library(dplyr)
mydata %>% mutate_at(vars(starts_with("x1")), funs(ifelse(is.na(.) & is.numeric(.) ,mean(., na.rm = TRUE),.)))
But i need to replace for each groups (group1,group2) separately.
edit to small dataset
structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L), group.2 = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L), x1 = c(63L, 67L, 57L, NA, 65L, 75L, 57L, 80L, 42L,
NA, 35L, 80L), x2 = c(46L, 1L, NA, 41L, 80L, NA, 74L, 73L, NA,
13L, 83L, NA)), class = "data.frame", row.names = c(NA, -12L))
mydata=structure(list(group1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), group2 = c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L), x1 = c(20L, 4L, 91L, NA, 94L, 69L, 38L,
NA, 29L, 69L, 55L, 86L, 81L, 11L, NA, 12L, 65L, 90L, 74L, NA,
49L, 90L), x2 = c(44L, 94L, NA, 1L, 67L, NA, 73L, 22L, 44L, 24L,
NA, 54L, 70L, 65L, 97L, 10L, 97L, NA, 74L, 97L, 34L, 29L)), class = "data.frame", row.names = c(NA,
-22L))
library(tidyverse)
mydata %>%
unite(group, group1, group2) %>% # combine groups
mutate(id = row_number()) %>% # add the row number as an id (useful when reshaping)
gather(var, value, -group, -id) %>% # reshape data
group_by(group, var) %>% # for each group combination and variable
mutate(value = ifelse(is.na(value), mean(value, na.rm = T), value)) %>% # replace NAs with mean
spread(var, value) %>% # reshape again
arrange(id) %>% # keep order of original dataset
select(-id) %>% # remove id
ungroup() %>% # forget the grouping
separate(group, c("group1","group2")) # split the groups again
# # A tibble: 22 x 4
# group1 group2 x1 x2
# <chr> <chr> <dbl> <dbl>
# 1 1 1 20 44
# 2 1 2 4 94
# 3 1 1 91 61.3
# 4 1 2 36.5 1
# 5 1 1 94 67
# 6 1 2 69 39
# 7 1 1 38 73
# 8 1 2 36.5 22
# 9 2 1 29 44
# 10 2 2 69 24
# # ... with 12 more rows

Resources