Related
I have the following data which looks like:
Pza_de_Espana Escuelas_Aguirre Av_Ramon_y_Cajal Arturo_Soria C_Farolillo Casa_de_Campo Barajas
1 12 29 26 27 19 4 31
2 40 42 55 49 41 25 53
3 51 51 73 57 56 51 60
4 53 52 65 56 64 64 56
5 46 46 59 50 53 34 65
6 30 34 39 39 34 20 50
7 31 39 40 28 37 28 37
I can run the following to get the first stage linear regression fitted values for each combination of the columns.
# Fit one simple regression per pair of columns (first column excluded):
# the first name of each pair is the predictor, the second the response.
firstStage <- combn(
  names(data)[-1], 2,
  simplify = FALSE,
  FUN = function(pair) {
    lm(reformulate(pair[1], response = pair[2]), data = data)
  }
)
library(dplyr)
library(purrr)
# Pull the fitted values out of every model.
firstStagePreds <- map(firstStage, function(model) pluck(model, "fitted.values"))
# Bind them side by side; the list is unnamed, so the columns come out
# auto-named (...1, ...2, ...).
firstStageFittedValues <- bind_cols(firstStagePreds)
Which gives me:
# A tibble: 10 × 171
...1 ...2 ...3 ...4 ...5 ...6 ...7 ...8 ...9 ...10 ...11 ...12 ...13 ...14 ...15 ...16 ...17 ...18
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 33.1 25.7 19.1 1.93 35.6 28.7 35.1 25.0 19.5 22.8 30.1 14.8 23.2 81.6 19.2 5.92 14.3 74.6
2 53.4 43.8 43.0 31.7 50.1 47.3 57.3 47.1 37.9 42.5 46.0 32.8 43.8 81.7 42.4 23.8 29.8 74.9
3 67.5 56.4 59.6 52.2 60.1 60.2 72.7 62.4 50.6 56.2 57.0 45.3 58.1 81.7 58.4 36.2 40.6 75.2
4 69.0 57.8 61.4 54.5 61.2 61.6 74.4 64.1 52.0 57.7 58.2 46.6 59.7 81.7 60.2 37.6 41.8 75.2
5 59.7 49.4 50.4 40.8 54.5 53.0 64.2 53.9 43.5 48.6 50.9 38.3 50.2 81.7 49.5 29.3 34.6 75.0
Where the column names are ...1 ...2 ...3 etc. I would like to use the combn(names(data)... to rename the lists in the firstStage part of the code. Then I can use the bind_cols(.id = "myListNames") to have more meaningful column names. So how can I use the output of the combn store them and rename the list?
Data:
data <- structure(list(Pza_de_Espana = c(12, 40, 51, 53, 46, 30, 31,
30, 26, 47), Escuelas_Aguirre = c(29, 42, 51, 52, 46, 34, 39,
31, 39, 41), Av_Ramon_y_Cajal = c(26, 55, 73, 65, 59, 39, 40,
49, 47, 56), Arturo_Soria = c(27, 49, 57, 56, 50, 39, 28, 25,
35, 50), C_Farolillo = c(19, 41, 56, 64, 53, 34, 37, 22, 25,
50), Casa_de_Campo = c(4, 25, 51, 64, 34, 20, 28, 7, 9, 38),
Barajas = c(31, 53, 60, 56, 65, 50, 37, 41, 36, 54), Pza_del_Carmen = c(24,
46, 59, 63, 54, 35, 43, 39, 40, 47), Moratalaz = c(35, 62,
76, 69, 67, 48, 37, 39, 47, 66), Cuatro_Caminos = c(19, 40,
65, 64, 52, 33, 50, 37, 33, 51), Barrio_de_Pilar = c(23,
40, 53, 52, 40, 29, 28, 19, 31, 41), Vallecas = c(21, 44,
55, 56, 54, 37, 28, 26, 33, 47), Mendez_Alvaro = c(24, 44,
55, 59, 51, 32, 51, 42, 31, 51), Retiro = c(13, 31, 44, 50,
36, 22, 30, 22, 21, 37), Ensanche_Vallecas = c(21, 47, 55,
56, 62, 34, 32, 29, 32, 45), Plaza_Eliptica = c(81.989743981193,
81.9187518755066, 81.8477597698195, 81.7767676641325, 81.7057755584454,
81.6347834527583, 81.5637913470712, 81.4927992413841, 81.421807135697,
81.3508150300099), Sanchinarro = c(16, 52, 61, 56, 46, 30,
28, 25, 33, 48), El_Pardo = c(7, 28, 30, 42, 31, 16, 12,
8, 13, 29), Parque_Juan_Carlos_1 = c(17, 36, 41, 41, 35,
27, 18, 12, 20, 32), Tres_Olivos = c(76.7710873995529, 76.3531164480543,
75.935145496561, 75.5171745450677, 75.0992035935744, 74.6812326420812,
74.2632616905879, 73.8452907390946, 73.4273197876013, 73.009348836108
)), row.names = c(NA, 10L), class = "data.frame")
You can return only the fitted values from combn.
library(dplyr)
# For every pair of columns (first column excluded), regress the second name
# on the first and return the fitted values as a one-column tibble labelled
# "<first> vs <second>"; finally bind all columns together.
combn(
  names(data)[-1], 2,
  simplify = FALSE,
  FUN = function(pair) {
    fit <- lm(reformulate(pair[1], response = pair[2]), data = data)
    col_label <- paste(pair, collapse = " vs ")
    tibble(!!col_label := fit$fitted.values)
  }
) %>%
  bind_cols()
I want to group daily data from Google Trends into weekly observations and smooth them by 7-day centered moving average? How can I do this? In which order?
Should I first group data? Or should I use centered moving average on daily data?
This is my data:
dput(multiTimeline)
structure(list(day = structure(c(1598400000, 1598486400, 1598572800,
1598659200, 1598745600, 1598832000, 1598918400, 1599004800, 1599091200,
1599177600, 1599264000, 1599350400, 1599436800, 1599523200, 1599609600,
1599696000, 1599782400, 1599868800, 1599955200, 1600041600, 1600128000,
1600214400, 1600300800, 1600387200, 1600473600, 1600560000, 1600646400,
1600732800, 1600819200, 1600905600, 1600992000, 1601078400, 1601164800,
1601251200, 1601337600, 1601424000, 1601510400, 1601596800, 1601683200,
1601769600, 1601856000, 1601942400, 1602028800, 1602115200, 1602201600,
1602288000, 1602374400, 1602460800, 1602547200, 1602633600, 1602720000,
1602806400, 1602892800, 1602979200, 1603065600, 1603152000, 1603238400,
1603324800, 1603411200, 1603497600, 1603584000, 1603670400, 1603756800,
1603843200, 1603929600, 1604016000, 1604102400, 1604188800, 1604275200,
1604361600, 1604448000, 1604534400, 1604620800, 1604707200, 1604793600,
1604880000, 1604966400, 1605052800, 1605139200, 1605225600, 1605312000,
1605398400, 1605484800, 1605571200, 1605657600, 1605744000, 1605830400,
1605916800, 1606003200, 1606089600), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), football = c(36, 36, 41, 60, 45, 38, 38, 39,
43, 49, 70, 49, 44, 46, 50, 62, 71, 92, 96, 61, 51, 45, 50, 58,
87, 81, 54, 50, 43, 49, 58, 97, 84, 55, 48, 41, 51, 56, 94, 83,
51, 47, 46, 49, 62, 97, 84, 51, 55, 51, 47, 52, 96, 79, 51, 49,
42, 44, 52, 100, 82, 49, 45, 41, 42, 50, 89, 73, 48, 40, 21,
29, 36, 75, 69, 45, 37, 39, 45, 51, 87, 69, 47, 48, 43, 37, 45,
79, 66, 46)), row.names = c(NA, -90L), class = c("tbl_df", "tbl",
"data.frame"))
Data is from 2020-08-26 to 2020-11-23.
I allowed myself to use the packages dplyr, to make data manipulation easier, and lubridate, which makes date manipulation easy.
The code is:
library(dplyr)
library(lubridate)
# Weekly mean of the daily series. `multiTimeline` is the data frame from the
# question (columns `day` and `football`).
# lubridate::week() counts complete 7-day periods since January 1st, so the
# groups are consecutive 7-day blocks, not Monday-to-Sunday calendar weeks.
df2 <- multiTimeline %>%
  mutate(week = week(day)) %>%
  group_by(week) %>%
  summarise(average = mean(football))
The only function I used from lubridate there was week(), if you're interested.
What I did was: first, I created another column (could have been the same one, though) that states the week. Note that this only works because your column was already in date-time format (though just date would have worked too, maybe even better). From that, I grouped by week and took the average. I hope I understood your question correctly and this will help.
It worked; this was the output:
> df2
# A tibble: 13 x 2
week average
<dbl> <dbl>
1 35 42
2 36 48.6
3 37 69
4 38 60.7
5 39 62
6 40 60.4
7 41 63.4
8 42 60.7
9 43 59.1
10 44 54.7
11 45 44.6
12 46 55.1
13 47 52.7
You can use rollmean from zoo package to do all this as a one-liner.
# 7-day centred moving average (zoo::rollmean is centre-aligned by default).
# `fill = NA` pads the three days at each end that lack a full window; it is
# the supported replacement for the deprecated `na.pad = TRUE`.
multiTimeline$rolling <- zoo::rollmean(multiTimeline$football, 7, fill = NA)
multiTimeline
#> # A tibble: 90 x 3
#> day football rolling
#> <dttm> <dbl> <dbl>
#> 1 2020-08-26 00:00:00 36 NA
#> 2 2020-08-27 00:00:00 36 NA
#> 3 2020-08-28 00:00:00 41 NA
#> 4 2020-08-29 00:00:00 60 42
#> 5 2020-08-30 00:00:00 45 42.4
#> 6 2020-08-31 00:00:00 38 43.4
#> 7 2020-09-01 00:00:00 38 44.6
#> 8 2020-09-02 00:00:00 39 46
#> 9 2020-09-03 00:00:00 43 46.6
#> 10 2020-09-04 00:00:00 49 47.4
#> # ... with 80 more rows
If you want to pick out the smoothed average for each week from Saturday to Friday, just use filter to select only Tuesdays. This will give you the 7-day average from the previous Saturday to the following Friday.
# Keep only Tuesdays: wday() == 3 when weeks start on Sunday (lubridate's default).
multiTimeline %>% filter(lubridate::wday(day) == 3)
#> # A tibble: 12 x 3
#> day football rolling
#> <dttm> <dbl> <dbl>
#> 1 2020-09-01 00:00:00 38 44.6
#> 2 2020-09-08 00:00:00 46 56
#> 3 2020-09-15 00:00:00 51 64.7
#> 4 2020-09-22 00:00:00 50 60.3
#> 5 2020-09-29 00:00:00 48 61.7
#> 6 2020-10-06 00:00:00 47 61.7
#> 7 2020-10-13 00:00:00 55 62.4
#> 8 2020-10-20 00:00:00 49 59
#> 9 2020-10-27 00:00:00 45 58.4
#> 10 2020-11-03 00:00:00 40 48
#> 11 2020-11-10 00:00:00 37 51.6
#> 12 2020-11-17 00:00:00 48 53.7
To show this is what you want, we can plot your data and the averaged line using ggplot:
# Daily series as a solid line, with the Tuesday-centred 7-day averages
# overlaid as a dashed red line.
ggplot(multiTimeline, aes(day, football)) +
  geom_line() +
  geom_line(
    data = multiTimeline %>% filter(lubridate::wday(day) == 3),
    aes(y = rolling), col = "red", lty = 2,
    # `linewidth` controls line thickness; `size` is deprecated for line
    # geoms since ggplot2 3.4.0.
    linewidth = 1.5
  )
This question already has an answer here:
Pipe a data frame to a function whose argument pipes a dot
(1 answer)
Closed 3 years ago.
Using piping in R (with %>%), how can one pass specific vector elements from a function's output to feed the next function's arguments?
I've tried using the dot operator with position in braces (i.e., .[1], .[2]) to no avail.
The only way that was working for me was with sapply(), but I'm wondering whether there's a simpler solution I'm missing.
Example
#I have a vector containing a sequence of numbers, with some duplicates and gaps,
#and I want to use its start and end points to create an analogous consecutive sequence.
original_sequence <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
88, 89, 90, 91, 92, 93, 94, 95, 96, 98, 98, 99, 100, 101,
102, 103, 104, 105, 106, 107, 108, 109, 110)
## unsuccessful attempt #1
original_sequence %>%
range() %>%
seq()
[1] 1 2 ## this result is equivalent to the output of `seq(2)`,
## but what I want is to compute `seq(1 ,110)`.
## unsuccessful attempt #2
original_sequence %>%
range() %>%
seq(.[1]), .[2])
Error: unexpected ',' in:
" range() %>%
seq(.[1]),"
## attempt #3: somewhat successful but I wonder whether there's a better way
original_sequence %>%
range() %>%
sapply(., seq)
[[1]]
[1] 1
[[2]]
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
[39] 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
[77] 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110
Bottom line -- I was able to do it with sapply but I hope to figure out a solution in the spirit of my second attempt, because it's more handy to know a universal way to cherry-pick the specific vector elements you want to pass to the next function's arguments.
One way would be to use {} and pass input arguments to seq
library(dplyr)
# The braces stop %>% from inserting the range as seq()'s first argument, so
# both endpoints can be picked out of `.` explicitly.
original_sequence %>%
  range() %>%
  {
    seq(from = .[1], to = .[2])
  }
#[1] 1 2 3 4 5 6 7 8 9 10 11 12......
Or we can mix it with base R do.call
# Turn the two-element range into an argument list and splat it into seq().
original_sequence %>%
  range() %>%
  {
    do.call(seq, as.list(.))
  }
Or as #Ozan147 mentioned if your sequence always starts with 1 we can use max
# seq() on a single number n counts from 1 up to n.
original_sequence %>%
  max() %>%
  seq()
We can use reduce
library(tidyverse)
# reduce() folds the two range endpoints with seq(), i.e. seq(min, max).
original_sequence %>%
  range() %>%
  reduce(seq)
#[1] 1 2 3 4 ...
Summary (tldr)
I need to perform a rolling regression on an irregular time series (i.e. the interval may not even be periodic and go from 0, 1, 2, 3... to ...7, 20, 24, 28...) that's simple numeric and does not necessarily require date/time, but the rolling window needs be by time. So if I have a timeseries that is irregularly sampled for 600 seconds and the window is 30, the regression is performed every 30 seconds, and not every 30 samples.
I've read examples, and while I could replicate doing rolling sums and medians by time, I can't seem to figure it out for regression.
The problem
First of all, I have read some of the other questions with regards to performing rolling functions on irregular time series data, such as this: optimized rolling functions on irregular time series with time-based window, and this: Rolling window over irregular time series.
The issue is that the examples provided, so far, are simple for equations like sum or median, but I have not yet figured out how to perform a simple rolling regression, i.e. using lm, that is still based on the same caveat that the window is based on an irregular time series. Also, my timeseries is much, much simpler; no date is necessary, it's simply time "elapsed".
Anyway, getting this right is important to me because with irregular time - for example, a skip in the time interval - may give an over- or underestimate of the coefficients in the rolling regression, as the sample window will include additional time.
So I was wondering if anyone can help me with creating a function that does this in the simplest way? The dataset is based on measuring a variable over time i.e. 2 variables: time, and response. Time is measured every x time elapsed units (seconds, minutes, so not date/time formatted), but once in a while it becomes irregular.
For every row in the function, it should perform a linear regression based on a width of n time units. The width should never exceed n units, but may be floored (i.e. reduced) to accommodate irregular time sampling. So for example, if the width is specified at 20 seconds, but time is sampled every 6 seconds, then the window will be rounded to 18, not 24 seconds.
I have looked at the question here: How to calculate the average slope within a moving window in R, and I tested that code on an irregular time series, but it looks like it's based on regular time series.
Sample data:
sample <-
structure(list(x = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
29, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 47, 48,
49), y = c(50, 49, 48, 47, 46, 47, 46, 45, 44, 43, 44, 43, 42,
41, 40, 41, 40, 39, 38, 37, 38, 37, 36, 35, 34, 35, 34, 33, 32,
31, 30, 29, 28, 29, 28, 27, 26, 25, 26, 25, 24, 23, 22, 21, 20,
19)), .Names = c("x", "y"), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -46L))
My current code (based on a previous question I referred to). I know it's not subsetting by time:
library(zoo)
# Coefficients (intercept and slope) of y ~ x fitted to one window of rows.
clm <- function(z) coef(lm(y ~ x, data = as.data.frame(z)))
# Right-aligned window of 10 *rows* (not 10 time units); positions without a
# full window are filled with NA.
rollme <- rollapplyr(zoo(sample), 10, clm, by.column = FALSE, fill = NA)
The expected output (manually calculated) is below. The output is different from a regular rolling regression -- the numbers are different as soon as the time interval skips at 29 (secs):
NA
NA
NA
NA
NA
NA
NA
NA
NA
-0.696969697
-0.6
-0.551515152
-0.551515152
-0.6
-0.696969697
-0.6
-0.551515152
-0.551515152
-0.6
-0.696969697
-0.6
-0.551515152
-0.551515152
-0.6
-0.696969697
-0.6
-0.551515152
-0.551515152
-0.6
-0.696969697
-0.605042017
-0.638888889
-0.716981132
-0.597560976
-0.528301887
-0.5
-0.521008403
-0.642857143
-0.566666667
-0.551515152
-0.551515152
-0.6
-0.696969697
-0.605042017
-0.638888889
-0.716981132
I hope I'm providing enough information, but let me know (or give me a guide to a good example somewhere) for me to try this?
Other things I have tried:
I've tried converting the time to POSIXct format but I don't know how to perform lm on that:
require(lubridate)
x <- as.POSIXct(strptime(sample$x, format = "%S"))
Update : Added tldr section.
Try this:
# Build, for every position i of `x`, the vector of row offsets
# (-period, ..., 0) that rollapplyr() should use as that row's window, so the
# window spans at most `sz * tint` time units instead of a fixed row count.
#   x    numeric vector of elapsed times
#   sz   window size, in multiples of `tint`
#   tint time-interval unit (default 1)
# Returns a list with one offset vector per element of `x`.
window_offsets <- function(x, sz, tint = 1) {
  lapply(seq_along(x), function(i) {
    if (i < sz) {
      # Offsets reach before the start of the series here; presumably this
      # is what makes rollapplyr() return the `fill` value for early rows.
      period <- sz
    } else {
      # Rows whose time lies in (x[i] - sz * tint, x[i]], excluding row i.
      period <- sum(x > (x[i] - sz * tint) & x <= x[i]) - 1
    }
    seq(-period, 0)
  })
}

# time interval is 1
sz <- 10
pl2 <- window_offsets(sample$x, sz)

# update for time interval > 1
sz <- 10
tint <- 1
pl2 <- window_offsets(sample$x, sz, tint)

rollme3 <- rollapplyr(zoo(sample), pl2, clm, by.column = FALSE, fill = NA)
> tail(rollme3)
(Intercept) x
41 47.38182 -0.5515152
42 49.20000 -0.6000000
43 53.03030 -0.6969697
44 49.26050 -0.6050420
45 50.72222 -0.6388889
46 54.22642 -0.7169811
For the sake of completeness, here is an answer which uses data.table to aggregate in a non-equi join.
Although there are many similar questions, e.g., r calculating rolling average with window based on value (not number of rows or date/time variable), this question deserves an answer on its own as the OP is looking for the coefficients of a rolling regression.
library(data.table)
ws <- 10 # size of sliding window in time units
# Non-equi self-join: each row is matched with all rows whose x lies in the
# half-open interval (x - ws, x], and one regression is fitted per matched
# group (by = .EACHI). as.list() spreads the coefficients over columns.
# NOTE(review): `x.x` is presumably the x column of the matched rows of
# `sample`; confirm against data.table's join column-prefix rules.
setDT(sample)[.(start = x - ws, end = x), on = .(x > start, x <= end),
as.list(coef(lm(y ~ x.x))), by = .EACHI]
x x (Intercept) x.x
1: -10 0 50.00000 NA
2: -9 1 50.00000 -1.0000000
3: -8 2 50.00000 -1.0000000
4: -7 3 50.00000 -1.0000000
5: -6 4 50.00000 -1.0000000
6: -5 5 49.61905 -0.7142857
7: -4 6 49.50000 -0.6428571
8: -3 7 49.50000 -0.6428571
9: -2 8 49.55556 -0.6666667
10: -1 9 49.63636 -0.6969697
11: 0 10 49.20000 -0.6000000
12: 1 11 48.88485 -0.5515152
13: 2 12 48.83636 -0.5515152
14: 3 13 49.20000 -0.6000000
15: 4 14 50.12121 -0.6969697
16: 5 15 49.20000 -0.6000000
17: 6 16 48.64242 -0.5515152
18: 7 17 48.59394 -0.5515152
19: 8 18 49.20000 -0.6000000
20: 9 19 50.60606 -0.6969697
21: 10 20 49.20000 -0.6000000
22: 11 21 48.40000 -0.5515152
23: 12 22 48.35152 -0.5515152
24: 13 23 49.20000 -0.6000000
25: 14 24 51.09091 -0.6969697
26: 15 25 49.20000 -0.6000000
27: 16 26 48.15758 -0.5515152
28: 17 27 48.10909 -0.5515152
29: 18 28 49.20000 -0.6000000
30: 19 29 51.57576 -0.6969697
31: 22 32 49.18487 -0.6050420
32: 23 33 50.13889 -0.6388889
33: 24 34 52.47170 -0.7169811
34: 25 35 48.97561 -0.5975610
35: 26 36 46.77358 -0.5283019
36: 27 37 45.75000 -0.5000000
37: 28 38 46.34454 -0.5210084
38: 29 39 50.57143 -0.6428571
39: 30 40 47.95556 -0.5666667
40: 31 41 47.43030 -0.5515152
41: 32 42 47.38182 -0.5515152
42: 33 43 49.20000 -0.6000000
43: 34 44 53.03030 -0.6969697
44: 37 47 49.26050 -0.6050420
45: 38 48 50.72222 -0.6388889
46: 39 49 54.22642 -0.7169811
x x (Intercept) x.x
Please note that rows 10 to 30 where the time series is regularly spaced are identical to OP's rollme.
The call to as.list() forces the result of coef(lm(...)) to appear in separate columns.
The code above uses a right aligned rolling window. However, the code can be easily adapted to support a left aligned window as well:
# left aligned window
# Same non-equi join with the interval shifted forward: each row is matched
# with the rows whose x lies in [x, x + ws).
setDT(sample)[.(start = x, end = x + ws), on = .(x >= start, x < end),
as.list(coef(lm(y ~ x.x))), by = .EACHI]
With runner one can apply any R function to an irregular time series. The user passes the data to the x argument and a vector of dates to the idx argument (to make the windows time-dependent). The window width k can be an integer (k = 30) or a character string as in seq.POSIXt (k = "30 secs").
First example shows how to obtain both parameters from lm function - output will be a matrix
library(runner)
# Fit y ~ x on every time-based window (width "30 secs", positioned by the
# `datetime` index) and return both coefficients per window.
runner(
  sample,
  f = function(window) coefficients(lm(y ~ x, data = window)),
  k = "30 secs",
  idx = sample$datetime
)
Or one can execute runner separately for each parameter
library(runner)
# Rolling intercept: first coefficient of y ~ x on each 30-second window.
sample$intercept <- runner(
  sample,
  f = function(window) coefficients(lm(y ~ x, data = window))[1],
  k = "30 secs",
  idx = sample$datetime
)
# Rolling slope: second coefficient of the same windowed regression.
sample$slope <- runner(
  sample,
  f = function(window) coefficients(lm(y ~ x, data = window))[2],
  k = "30 secs",
  idx = sample$datetime
)
head(sample, 15)
# datetime x y intercept slope
# 1 2020-04-13 09:27:20 0 50 50.00000 NA
# 2 2020-04-13 09:27:21 1 49 50.00000 -1.0000000
# 3 2020-04-13 09:27:25 2 48 50.00000 -1.0000000
# 4 2020-04-13 09:27:29 3 47 50.00000 -1.0000000
# 5 2020-04-13 09:27:29 4 46 50.00000 -1.0000000
# 6 2020-04-13 09:27:32 5 47 49.61905 -0.7142857
# 7 2020-04-13 09:27:34 6 46 49.50000 -0.6428571
# 8 2020-04-13 09:27:38 7 45 49.50000 -0.6428571
# 9 2020-04-13 09:27:38 8 44 49.55556 -0.6666667
# 10 2020-04-13 09:27:41 9 43 49.63636 -0.6969697
# 11 2020-04-13 09:27:44 10 44 49.45455 -0.6363636
# 12 2020-04-13 09:27:47 11 43 49.38462 -0.6153846
# 13 2020-04-13 09:27:48 12 42 49.38462 -0.6153846
# 14 2020-04-13 09:27:49 13 41 49.42857 -0.6263736
# 15 2020-04-13 09:27:50 14 40 49.34066 -0.6263736
Data with datetime column
# Sample data with an irregularly spaced `datetime` index.
# The numeric vector holds the gaps (in seconds) between consecutive
# observations; cumsum() turns them into non-decreasing offsets from
# Sys.time(), so `datetime` is sorted, as a window index has to be.
sample <- structure(
  list(
    datetime = cumsum(
      c(3, 1, 4, 4, 0, 3, 2, 4, 0, 3, 3, 3, 1, 1, 1, 3, 0, 2, 4, 2, 2,
        3, 0, 1, 2, 4, 0, 1, 4, 4, 1, 2, 1, 3, 0, 4, 4, 1, 3, 0, 0, 2,
        1, 0, 2, 0)
    ) + Sys.time(),
    x = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
          20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 32, 33, 34, 35, 36, 37, 38,
          39, 40, 41, 42, 43, 44, 47, 48, 49),
    y = c(50, 49, 48, 47, 46, 47, 46, 45, 44, 43, 44, 43, 42, 41, 40, 41, 40, 39,
          38, 37, 38, 37, 36, 35, 34, 35, 34, 33, 32, 31, 30, 29, 28, 29, 28, 27,
          26, 25, 26, 25, 24, 23, 22, 21, 20, 19)
  ),
  # `.Names` must list all three columns, in list order.
  .Names = c("datetime", "x", "y"),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -46L)
)
I have to look up some scores and assign percentile value based on a fixed look-up table.
I've tried to solve this problem for some time now, and I have read this and this SO thread, but without solving my problem. My problem is that the raw score can be bigger than the values in the look-up table; in such cases the biggest percentile value is prescribed.
I have a look-up table like this,
lookup <- structure(list(Percentile = c(99, 95, 90, 85, 80, 75, 70, 65, 60, 55, 50, 45, 40, 35, 30, 25, 20, 15, 10, 5, 1), ACB = c(24, 19, 18, 17, 16, NA, 15, NA, 14, NA, NA, 13, NA, NA, NA, 12, NA, 11, 10, 9, 7), DFG = c(49, 39, 36, 33, 31, 30, 29, 28, 27, 26, 25, NA, 24, 23, 22, 21, 20, 19, 17, 14, 12), EIH = c(35, 30, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, NA, 14, NA, 13, 12, NA), GKJ = c(49, 39, 36, 33, 31, 30, 29, 28, 27, 26, 25, NA, 24, 23, 22, 21, 19, 18, 17, 15, 14), Total = c(112, 99, 91, 86, 82, 79, 76, 75, 73, 71, 69, 67, 66, 65, 63, 61, 59, 55, 51, 46, 39)), .Names = c("Percentile", "ACB", "DFG", "EIH", "GKJ", "Total"), row.names = c("99+", "95", "90", "85", "80", "75", "70", "65", "60", "55", "50", "45", "40", "35", "30", "25", "20", "15", "10", "5", "1"), class = "data.frame")
lookup
Percentile ACB DFG EIH GKJ Total
99+ 99 24 49 35 49 112
95 95 19 39 30 39 99
90 90 18 36 27 36 91
85 85 17 33 26 33 86
80 80 16 31 25 31 82
75 75 NA 30 24 30 79
70 70 15 29 23 29 76
65 65 NA 28 22 28 75
60 60 14 27 21 27 73
55 55 NA 26 20 26 71
50 50 NA 25 19 25 69
45 45 13 NA 18 NA 67
40 40 NA 24 17 24 66
35 35 NA 23 16 23 65
30 30 NA 22 15 22 63
25 25 12 21 NA 21 61
20 20 NA 20 14 19 59
15 15 11 19 NA 18 55
10 10 10 17 13 17 51
5 5 9 14 12 15 46
1 1 7 12 NA 14 39
and some raw data that looks like this,
rawS_1 <- structure(list(ACB = 28, DFG = 39, EIH = 31, GKJ = NA_real_, Total = NA_real_), .Names = c("ACB", "DFG", "EIH", "GKJ", "Total"), row.names = "RawScore for ID 1", class = "data.frame")
rawS_1
ACB DFG EIH GKJ Total
RawScore for ID 1 28 39 31 NA NA
rawS_2 <- structure(list(ACB = 29, DFG = 51, EIH = 56, GKJ = 60, Total = 169), .Names = c("ACB", "DFG", "EIH", "GKJ", "Total"), row.names = "RawScore for ID 2", class = "data.frame")
rawS_2
ACB DFG EIH GKJ Total
RawScore for ID 2 29 51 56 60 169
and, this is what I would like to do,
ACB DFG EIH GKJ Total
RawScore for ID 1 12 39 19 NA NA
Percentile, ID 1 25 95 50 NA NA
ACB DFG EIH GKJ Total
RawScore for ID 2 29 51 56 60 169
Percentile, ID 2 99 99 99 99 99
I've tried to merge() with all.x = TRUE and suffixes = c(".x",".y")), but I keep getting what I don't want and help would be appreciated.
Rather than thinking of this as merging, I think you're better off thinking it as a problem of creating a function: you want a function that when given the raw value of (e.g.) ACB returns the percentile. Luckily R has a function designed to make a function from a table of numbers: approxfun.
The following code uses lapply to create a function for each column, and then shows how to call the the new functions:
# One step function per score column (every column except the first,
# `Percentile`): it maps a raw score to a percentile from the lookup table.
vars <- names(lookup)[-1]
lookup_funs <- setNames(
  lapply(vars, function(var) {
    # Use only rows where both the score and the percentile are present.
    keep <- !is.na(lookup[[var]]) & !is.na(lookup$Percentile)
    # rule = 2 clamps scores outside the table to the nearest end value.
    approxfun(
      lookup[[var]][keep], lookup$Percentile[keep],
      method = "constant", rule = 2
    )
  }),
  vars
)
lookup_funs$ACB(c(12, 29))
lookup_funs$Total(169)
Basic strategy is to use !is.na(vec) to index both the value and the percentile vectors. Here's a look at a single case. Which one would you prefer for input of 11 for ACB?
# Drop the NA rows first and reverse afterwards, so every percentile stays
# paired with its own score (the mask is in table order, not reversed order).
> rev(lookup$Percentile[!is.na(lookup$ACB)])[
findInterval( 11, c(-Inf,rev(lookup$ACB[!is.na(lookup$ACB)]), Inf))]
[1] 25
> rev(lookup$Percentile[!is.na(lookup$ACB)])[
findInterval( 11, c(-Inf,rev(lookup$ACB[!is.na(lookup$ACB)]), Inf))-1]
[1] 15
This gets you most of the way there for one row of data:
# For each score column: print the raw score, then the percentile of the
# highest table score that does not exceed it. The NA mask is applied before
# reversing so percentiles and scores stay aligned.
> for (i in names(rawS_1)) {
    keep <- !is.na(lookup[[i]])
    print(rawS_1[i])
    print(rev(lookup$Percentile[keep])[findInterval(rawS_1[[i]], rev(lookup[[i]][keep]))])
  }
ACB
RawScore for ID 1 28
[1] 99
DFG
RawScore for ID 1 39
[1] 95
EIH
RawScore for ID 1 31
[1] 95
GKJ
RawScore for ID 1 NA
[1] NA
Total
RawScore for ID 1 NA
[1] NA
You do get into indexing overruns with the subtraction of 1 from indices at the high end of the scale, so you probably ought to add an extra element on the lookup vector after you decide what result you want to see.
# Same lookup for the second subject. The NA mask is applied before reversing
# so percentiles and scores stay aligned; every score exceeds the table
# maximum here, so each one maps to the top percentile.
for (i in names(rawS_2)) {
  keep <- !is.na(lookup[[i]])
  print(rawS_2[i])
  print(rev(lookup$Percentile[keep])[findInterval(rawS_2[[i]], rev(lookup[[i]][keep]))])
}
ACB
RawScore for ID 2 29
[1] 99
DFG
RawScore for ID 2 51
[1] 99
EIH
RawScore for ID 2 56
[1] 99
GKJ
RawScore for ID 2 60
[1] 99
Total
RawScore for ID 2 169
[1] 99