Iterate through groups of variables in a survey - R

As indicated here, if you want to calculate proportions of a categorical variable with the amazing srvyr package, you first have to group by the variables as factors and then call an empty srvyr::survey_mean(), as in this example.
My goal is to iterate over the second grouping variable (cname and sch.wide in turn) while keeping the first grouping variable stype, to avoid duplicating the code.
library(survey)
library(srvyr)

data(api)

df <- apiclus1 %>%
  mutate(cname = as.factor(cname)) %>%
  select(pw, stype, cname, sch.wide) %>%
  as_survey_design(weights = pw)

# proportions of sch.wide
df %>%
  group_by(stype, sch.wide) %>%
  summarise(prop = srvyr::survey_mean())
#> # A tibble: 6 x 4
#> stype sch.wide prop prop_se
#> <fct> <fct> <dbl> <dbl>
#> 1 E No 0.0833 0.0231
#> 2 E Yes 0.917 0.0231
#> 3 H No 0.214 0.110
#> 4 H Yes 0.786 0.110
#> 5 M No 0.32 0.0936
#> 6 M Yes 0.68 0.0936
# proportions of cname
df %>%
  group_by(stype, cname) %>%
  summarise(prop = srvyr::survey_mean())
#> # A tibble: 33 x 4
#> stype cname prop prop_se
#> <fct> <fct> <dbl> <dbl>
#> 1 E Alameda 0.0556 0.0191
#> 2 E Fresno 0.0139 0.00978
#> 3 E Kern 0.00694 0.00694
#> 4 E Los Angeles 0.0833 0.0231
#> 5 E Mendocino 0.0139 0.00978
#> 6 E Merced 0.0139 0.00978
#> 7 E Orange 0.0903 0.0239
#> 8 E Plumas 0.0278 0.0137
#> 9 E San Diego 0.347 0.0398
#> 10 E San Joaquin 0.208 0.0339
#> # ... with 23 more rows
Created on 2019-11-28 by the reprex package (v0.3.0)
Maybe the way to go here is to create lists that keep the first grouping variable and split the data by another group of variables, and then calculate the proportions.
I would like to find a solution that involves purrr::map or the tidyverse.
Thanks in advance for the help, or for pointing to the answer!

There are multiple ways. If we pass the variable names as strings, one option is to make use of group_by_at(), which takes strings as arguments:
library(purrr)
library(dplyr)
library(survey)
library(srvyr)
map(c('sch.wide', 'cname'), ~
  df %>%
    group_by_at(vars("stype", .x)) %>%
    summarise(prop = srvyr::survey_mean()))
#[[1]]
# A tibble: 6 x 4
# stype sch.wide prop prop_se
# <fct> <fct> <dbl> <dbl>
#1 E No 0.0833 0.0231
#2 E Yes 0.917 0.0231
#3 H No 0.214 0.110
#4 H Yes 0.786 0.110
#5 M No 0.32 0.0936
#6 M Yes 0.68 0.0936
#[[2]]
# A tibble: 30 x 4
# stype cname prop prop_se
# <fct> <fct> <dbl> <dbl>
# 1 E Alameda 0.0556 0.0191
# 2 E Fresno 0.0139 0.00978
# 3 E Kern 0.00694 0.00694
# 4 E Los Angeles 0.0833 0.0231
# 5 E Mendocino 0.0139 0.00978
# 6 E Merced 0.0139 0.00978
# 7 E Orange 0.0903 0.0239
# 8 E Plumas 0.0278 0.0137
# 9 E San Diego 0.347 0.0398
#10 E San Joaquin 0.208 0.0339
# … with 20 more rows
Or another option is to wrap the bare names with quos() to create a quosure list and evaluate (!!) each element inside group_by():
map(quos(sch.wide, cname), ~
  df %>%
    group_by(stype, !!.x) %>%
    summarise(prop = srvyr::survey_mean()))
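A side note: group_by_at() is superseded in recent dplyr. A minimal sketch of the same loop using across(all_of()), assuming dplyr >= 1.0 and a srvyr version that supports it:
map(c('sch.wide', 'cname'), ~
  df %>%
    # across(all_of(.x)) lets group_by() accept the variable name as a string
    group_by(stype, across(all_of(.x))) %>%
    summarise(prop = srvyr::survey_mean()))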

Related

R: List of multiple svytables to data frames

I have multiple svytable objects in a list, and from that list I want to turn them into separate data frames while keeping the same data structure.
For example:
library(survey)

data(api)
x <- apiclus1
dclus1 <- svydesign(id = ~dnum, weights = ~pw, data = x, fpc = ~fpc)

n <- c("sch.wide", "cname")
for (k in seq_along(n)) {
  assign(paste0(n[[k]], "_1"),
         svytable(as.formula(paste0("~", n[[k]], "+stype")),
                  design = dclus1, na.action = na.pass))
}

tabs <- list(sch.wide_1, cname_1)  # avoid naming a variable `list`
result <- lapply(tabs, function(x) prop.table(x, margin = 2) * 100)
How can I turn the tables in the result list into separate data frames?
Edit: here is a simplified approach, modifying your for loop and using the janitor package (pivot_wider() comes from tidyr):
library(tidyr)    # pivot_wider()
library(janitor)  # adorn_percentages(), adorn_pct_formatting()

for (k in seq_along(n)) {
  assign(paste0(n[[k]], "_1"),
         svytable(as.formula(paste0("~", n[[k]], "+stype")),
                  design = dclus1, na.action = na.pass) %>%
           as.data.frame() %>%
           pivot_wider(names_from = stype, values_from = Freq) %>%
           adorn_percentages("col") %>%
           adorn_pct_formatting())
}
Now you get:
> sch.wide_1
sch.wide E H M
No 8.3% 21.4% 32.0%
Yes 91.7% 78.6% 68.0%
> cname_1
cname E H M
Alameda 5.6% 7.1% 8.0%
Fresno 1.4% 7.1% 4.0%
Kern 0.7% 0.0% 4.0%
Los Angeles 8.3% 0.0% 12.0%
Mendocino 1.4% 7.1% 4.0%
Merced 1.4% 7.1% 4.0%
Orange 9.0% 0.0% 12.0%
Plumas 2.8% 28.6% 4.0%
San Diego 34.7% 14.3% 12.0%
San Joaquin 20.8% 21.4% 16.0%
Santa Clara 13.9% 7.1% 20.0%
You can explore the janitor package further and tweak the percentage formatting, totals, etc. to get your desired output.
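For instance, a small sketch of those knobs; adorn_totals() and the digits argument of adorn_pct_formatting() are both part of janitor, and dclus1 is the design object from the question:
svytable(~sch.wide + stype, design = dclus1) %>%
  as.data.frame() %>%
  pivot_wider(names_from = stype, values_from = Freq) %>%
  adorn_percentages("col") %>%
  adorn_totals("row") %>%            # append a totals row (100% per column)
  adorn_pct_formatting(digits = 0)   # e.g. "8%" instead of "8.3%"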
I'm not sure whether you're going to do it one by one or you need a loop; here's one way to get them separately:
a <- data.frame(result[1]) %>%
  pivot_wider(names_from = stype, values_from = Freq)
> a
# A tibble: 2 × 4
sch.wide E H M
<fct> <dbl> <dbl> <dbl>
1 No 8.33 21.4 32
2 Yes 91.7 78.6 68
b <- data.frame(result[2]) %>%
  pivot_wider(names_from = stype, values_from = Freq)
b
# A tibble: 11 × 4
cname E H M
<fct> <dbl> <dbl> <dbl>
1 Alameda 5.56 7.14 8
2 Fresno 1.39 7.14 4
3 Kern 0.694 0 4
4 Los Angeles 8.33 0 12
5 Mendocino 1.39 7.14 4
6 Merced 1.39 7.14 4
7 Orange 9.03 0 12
8 Plumas 2.78 28.6 4
9 San Diego 34.7 14.3 12
10 San Joaquin 20.8 21.4 16
11 Santa Clara 13.9 7.14 20
Want to make a loop for it?
for (ii in seq_along(result)) {
  assign(paste0("df_", ii),
         as.data.frame(result[[ii]]) %>%
           pivot_wider(names_from = stype, values_from = Freq))
}
Now you have df_1 and df_2:
> df_1
# A tibble: 2 × 4
sch.wide E H M
<fct> <dbl> <dbl> <dbl>
1 No 8.33 21.4 32
2 Yes 91.7 78.6 68
> df_2
# A tibble: 11 × 4
cname E H M
<fct> <dbl> <dbl> <dbl>
1 Alameda 5.56 7.14 8
2 Fresno 1.39 7.14 4
3 Kern 0.694 0 4
4 Los Angeles 8.33 0 12
5 Mendocino 1.39 7.14 4
6 Merced 1.39 7.14 4
7 Orange 9.03 0 12
8 Plumas 2.78 28.6 4
9 San Diego 34.7 14.3 12
10 San Joaquin 20.8 21.4 16
11 Santa Clara 13.9 7.14 20
There might be a shortcut for it, but this is how I've been doing it so far. Good luck!
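(A side note, not from the original answers: a minimal sketch that keeps the reshaped tables in one named list instead of creating objects with assign(); the name dfs is illustrative.)
dfs <- lapply(result, function(tab) {
  as.data.frame(tab) %>%
    pivot_wider(names_from = stype, values_from = Freq)
})
names(dfs) <- n   # n from the question: c("sch.wide", "cname")
dfs$sch.wide      # access each reshaped table by name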

Filter in dplyr interval of dates

I have the following simulated dataset in R:
library(tidyverse)

A <- seq(from = as.Date("2021/1/1"), to = as.Date("2022/1/1"), length.out = 252)
length(A)
x <- rnorm(252)
d <- tibble(A, x); d
that looks like:
# A tibble: 252 × 2
A x
<date> <dbl>
1 2021-01-01 0.445
2 2021-01-02 -0.793
3 2021-01-03 -0.367
4 2021-01-05 1.64
5 2021-01-06 -1.15
6 2021-01-08 0.276
7 2021-01-09 1.09
8 2021-01-11 0.443
9 2021-01-12 -0.378
10 2021-01-14 0.203
# … with 242 more rows
It is one year of 252 trading days. Let's say I have a date of interest, which is:
start <- as.Date("2021-05-23"); start
I want to filter the dataset so that the result is a new dataset starting from this start date and covering the next 20 index dates (trading days), NOT calendar days, and then to find the total number of index dates the new dataset contains.
For example, from the starting date onwards I have:
d1 <- d %>%
  dplyr::filter(A > start) %>%
  dplyr::summarise(n())
d1
# A tibble: 1 × 1
`n()`
<int>
1 98
but I want only the next 20 trading days from the starting date onwards. How can I do that? Any help?
Perhaps a brute-force attempt:
d %>%
  filter(between(A, start, max(head(sort(A[A > start]), 20))))
# # A tibble: 20 x 2
# A x
# <date> <dbl>
# 1 2021-05-23 -0.185
# 2 2021-05-24 0.102
# 3 2021-05-26 0.429
# 4 2021-05-27 -1.21
# 5 2021-05-29 0.260
# 6 2021-05-30 0.479
# 7 2021-06-01 -0.623
# 8 2021-06-02 0.982
# 9 2021-06-04 -0.0533
# 10 2021-06-05 1.08
# 11 2021-06-07 -1.96
# 12 2021-06-08 -0.613
# 13 2021-06-09 -0.267
# 14 2021-06-11 -0.284
# 15 2021-06-12 0.0851
# 16 2021-06-14 0.355
# 17 2021-06-15 -0.635
# 18 2021-06-17 -0.606
# 19 2021-06-18 -0.485
# 20 2021-06-20 0.255
If you have duplicate dates, you may prefer to use head(sort(unique(A[A > start])), 20), depending on what "20 index dates" means.
And to find the number of indices, you can summarise or count as needed.
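For example, a minimal counting sketch built on the same filter:
d %>%
  filter(between(A, start, max(head(sort(A[A > start]), 20)))) %>%
  count()   # one-row tibble with the number of trading days kept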
You could first sort by the date, filter for days greater than the given date, and then pull the top 20 records.
d1 <- d %>%
  arrange(A) %>%
  filter(A > start) %>%
  head(20)
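Equivalently, with dplyr's own verb (a stylistic alternative; slice_head() requires dplyr >= 1.0):
d1 <- d %>%
  arrange(A) %>%
  filter(A > start) %>%
  slice_head(n = 20)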

Coalesce column values based on mapping to a vector of values

If I have the following two objects:
> set.seed(100)
> lookup <- sample(1:3, 20, replace=T)
> lookup
[1] 2 3 2 3 1 2 2 3 2 2 3 2 2 3 3 3 3 2 1 3
and
> tb <- tibble(A=runif(20,0,1), B=runif(20,0,1), C= runif(20,0,1))
> tb
# A tibble: 20 × 3
A B C
<dbl> <dbl> <dbl>
1 0.770 0.780 0.456
2 0.882 0.884 0.445
3 0.549 0.208 0.245
4 0.278 0.307 0.694
5 0.488 0.331 0.412
6 0.929 0.199 0.328
7 0.349 0.236 0.573
8 0.954 0.275 0.967
9 0.695 0.591 0.662
10 0.889 0.253 0.625
11 0.180 0.123 0.857
12 0.629 0.230 0.775
13 0.990 0.598 0.834
14 0.130 0.211 0.0915
15 0.331 0.464 0.460
16 0.865 0.647 0.599
17 0.778 0.961 0.920
18 0.827 0.676 0.983
19 0.603 0.445 0.0378
20 0.491 0.358 0.578
How do I use lookup to select the value of the corresponding row/column from tb?
i.e.
if the first element of lookup = 1 then I would like to select the value in A from the first row of tb
if the second element of lookup = 2 then I would like to select the value in B from the second row of tb
So I should end up with a 1d vector that is the same size as lookup. It will look like this:
> new_data
> [1] 0.780 0.445 0.208 0.694 0.488 ... 0.578
Thanks!
data.frame (but not tibble or data.table) supports indexing by a matrix, so with this data,
set.seed(42)
lookup <- sample(1:3, 20, replace=T)
lookup
# [1] 1 1 1 1 2 2 2 1 3 3 1 1 2 2 2 3 3 1 1 3
tb <- tibble(A=runif(20,0,1), B=runif(20,0,1), C= runif(20,0,1))
head(tb)
# # A tibble: 6 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 0.514 0.958 0.189
# 2 0.390 0.888 0.271
# 3 0.906 0.640 0.828
# 4 0.447 0.971 0.693
# 5 0.836 0.619 0.241
# 6 0.738 0.333 0.0430
We can do
as.data.frame(tb)[cbind(seq_along(lookup), lookup)]
# [1] 0.514211784 0.390203467 0.905738131 0.446969628 0.618838207 0.333427211 0.346748248 0.388108283 0.479398564
# [10] 0.197410342 0.832916080 0.007334147 0.171264330 0.261087964 0.514412935 0.581604003 0.157905208 0.037431033
# [19] 0.973539914 0.775823363
A less-efficient method can be done without as.data.frame:
mapply(`[[`, list(tb), seq_along(lookup), lookup)
# [1] 0.514211784 0.390203467 0.905738131 0.446969628 0.618838207 0.333427211 0.346748248 0.388108283 0.479398564
# [10] 0.197410342 0.832916080 0.007334147 0.171264330 0.261087964 0.514412935 0.581604003 0.157905208 0.037431033
# [19] 0.973539914 0.775823363
## also works with `list(as.data.table(tb))`
Though it does take a big hit in performance (not a surprise):
bench::mark(
  sindri_baldur1 = unlist(tb, use.names = FALSE)[seq_along(lookup) + (lookup - 1L) * nrow(tb)],
  sindri_baldur2 = unlist(tb)[seq_along(lookup) + (lookup - 1L) * nrow(tb)],
  base = as.data.frame(tb)[cbind(seq_along(lookup), lookup)],
  mapply = mapply(`[[`, list(tb), seq_along(lookup), lookup),
  paulsmith2 = {
    tb %>%
      mutate(lookup = lookup) %>%
      rowwise() %>%
      mutate(new = c_across(A:C)[lookup]) %>%
      pull(new)
  },
  check = FALSE)
# # A tibble: 5 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 sindri_baldur1 4.5us 5.3us 159430. 736B 15.9 9999 1 62.7ms <NULL> <Rprof~ <benc~ <tibb~
# 2 sindri_baldur2 13.2us 14.7us 56723. 1.44KB 0 10000 0 176.3ms <NULL> <Rprof~ <benc~ <tibb~
# 3 base 78.3us 91.6us 7334. 944B 8.59 3414 4 465.5ms <NULL> <Rprof~ <benc~ <tibb~
# 4 mapply 612.4us 779.45us 942. 720B 6.39 442 3 469.4ms <NULL> <Rprof~ <benc~ <tibb~
# 5 paulsmith2 4.37ms 5.85ms 147. 20.3KB 6.51 68 3 461.1ms <NULL> <Rprof~ <benc~ <tibb~
(I have to use check = FALSE because of the names introduced in sindri_baldur2; otherwise all results are numerically identical.)
You could compute flat column-major indices directly (the value in row i of column j sits at position i + (j - 1) * nrow(tb) in the unlisted vector):
unlist(tb, use.names = FALSE)[seq_along(lookup) + (lookup - 1L)*nrow(tb)]
# [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907 0.23569430 0.96699908 0.59132105
# [10] 0.25339065 0.85665304 0.22990589 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
# [19] 0.60332436 0.57793740
You could also keep use.names = TRUE and use the names to keep track of the original location:
unlist(tb)[seq_along(lookup) + (lookup - 1L)*nrow(tb)] |> head()
# B1 C2 B3 C4 A5 B6
# 0.7803585 0.4454140 0.2077139 0.6943507 0.4883060 0.1986791
A base R solution:
tb$lookup <- lookup
# apply() passes each row as a vector: x[4] is the lookup value,
# which then indexes back into the row (works because all columns are numeric)
tb$new <- apply(tb, 1, function(x) x[x[4]])
new <- tb$new
new
#> [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907
#> [7] 0.23569430 0.96699908 0.59132105 0.25339065 0.85665304 0.22990589
#> [13] 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
#> [19] 0.60332436 0.57793740
Another possible solution, based on tidyverse:
library(tidyverse)

set.seed(100)
lookup <- sample(1:3, 20, replace = TRUE)
tb <- tibble(A = runif(20, 0, 1), B = runif(20, 0, 1), C = runif(20, 0, 1))

tb %>%
  mutate(lookup = lookup) %>%
  rowwise() %>%
  mutate(new = c_across(A:C)[lookup]) %>%
  pull(new)
#> [1] 0.78035851 0.44541398 0.20771390 0.69435071 0.48830599 0.19867907
#> [7] 0.23569430 0.96699908 0.59132105 0.25339065 0.85665304 0.22990589
#> [13] 0.59757529 0.09151028 0.45952549 0.59939816 0.91972191 0.67639817
#> [19] 0.60332436 0.57793740

Regression by group and retain all the columns in R

I am doing a linear regression by group and want to extract the residuals of the regression.
library(dplyr)
library(broom)  # for augment()

set.seed(124)
dat <- data.frame(ID = sample(111:503, 18576, replace = TRUE),
                  ID2 = sample(11:50, 18576, replace = TRUE),
                  ID3 = sample(1:14, 18576, replace = TRUE),
                  yearRef = sample(1998:2014, 18576, replace = TRUE),
                  value = rnorm(18576))

resid <- dat %>%
  dplyr::group_by(ID3) %>%
  do(augment(lm(value ~ yearRef, data = .))) %>%
  ungroup()
How do I retain ID and ID2 as well in resid? At the moment, it only retains ID3 in the final data frame.
Use group_split(), then loop through each group with map_dfr(), binding ID and ID2 to the augment() output using bind_cols():
library(dplyr)
library(purrr)
library(broom)

dat %>%
  group_split(ID3) %>%
  map_dfr(~bind_cols(select(.x, ID, ID2),
                     augment(lm(value ~ yearRef, data = .x))),
          .id = "ID3")
# A tibble: 18,576 x 12
ID3 ID ID2 value yearRef .fitted .se.fit .resid .hat .sigma .cooksd
<chr> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 196 16 -0.385 2009 -0.0406 0.0308 -0.344 1.00e-3 0.973 6.27e-5
2 1 372 47 -0.793 2012 -0.0676 0.0414 -0.726 1.81e-3 0.973 5.05e-4
3 1 470 15 -0.496 2011 -0.0586 0.0374 -0.438 1.48e-3 0.973 1.50e-4
4 1 242 40 -1.13 2010 -0.0496 0.0338 -1.08 1.21e-3 0.973 7.54e-4
5 1 471 34 1.28 2006 -0.0135 0.0262 1.29 7.26e-4 0.972 6.39e-4
6 1 434 35 -1.09 1998 0.0586 0.0496 -1.15 2.61e-3 0.973 1.82e-3
7 1 467 45 -0.0663 2011 -0.0586 0.0374 -0.00769 1.48e-3 0.973 4.64e-8
8 1 334 27 -1.37 2003 0.0135 0.0305 -1.38 9.86e-4 0.972 9.92e-4
9 1 186 25 -0.0195 2003 0.0135 0.0305 -0.0331 9.86e-4 0.973 5.71e-7
10 1 114 34 1.09 2014 -0.0857 0.0500 1.18 2.64e-3 0.973 1.94e-3
# ... with 18,566 more rows, and 1 more variable: .std.resid <dbl>
Taking the "many models" approach, you can nest the data on ID3 and use purrr::map to create a list-column of the broom::augment data frames. The data list-column has all the original columns aside from ID3; map into that and select just the ones you want. Here I'm assuming you want to keep any column that starts with "ID", but you can change this. Then unnest both the data and the augment data frames.
library(dplyr)
library(tidyr)

dat %>%
  group_by(ID3) %>%
  nest() %>%
  mutate(aug = purrr::map(data, ~broom::augment(lm(value ~ yearRef, data = .))),
         data = purrr::map(data, select, starts_with("ID"))) %>%
  unnest(c(data, aug))
#> # A tibble: 18,576 x 12
#> # Groups: ID3 [14]
#> ID3 ID ID2 value yearRef .fitted .se.fit .resid .hat .sigma
#> <int> <int> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 11 431 15 0.619 2002 0.0326 0.0346 0.586 1.21e-3 0.995
#> 2 11 500 21 -0.432 2000 0.0299 0.0424 -0.462 1.82e-3 0.995
#> 3 11 392 28 -0.246 1998 0.0273 0.0515 -0.273 2.67e-3 0.995
#> 4 11 292 40 -0.425 1998 0.0273 0.0515 -0.452 2.67e-3 0.995
#> 5 11 175 36 -0.258 1999 0.0286 0.0468 -0.287 2.22e-3 0.995
#> 6 11 419 23 3.13 2005 0.0365 0.0273 3.09 7.54e-4 0.992
#> 7 11 329 17 -0.0414 2007 0.0391 0.0274 -0.0806 7.57e-4 0.995
#> 8 11 284 23 -0.450 2006 0.0378 0.0268 -0.488 7.25e-4 0.995
#> 9 11 136 28 -0.129 2006 0.0378 0.0268 -0.167 7.25e-4 0.995
#> 10 11 118 17 -1.55 2013 0.0470 0.0470 -1.60 2.24e-3 0.995
#> # … with 18,566 more rows, and 2 more variables: .cooksd <dbl>,
#> # .std.resid <dbl>
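(A side note, not from the original answers: with dplyr >= 1.0 the same idea can be sketched without purrr via nest_by(), which returns a rowwise data frame.)
library(dplyr)
library(tidyr)

dat %>%
  nest_by(ID3) %>%
  # for each group, bind the ID columns to the augment() output
  mutate(aug = list(bind_cols(select(data, ID, ID2),
                              broom::augment(lm(value ~ yearRef, data = data))))) %>%
  select(ID3, aug) %>%
  ungroup() %>%
  unnest(aug)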

Use dplyr to add a new column based on max row value?

I've got a large database that has a series of columns with numerical values. I would like to use dplyr's mutate to add a new column whose values are the name of the column that holds the maximum value in each row. So, for the example below
set.seed(123)
df <- data_frame(  # data_frame() is deprecated; tibble() is the modern equivalent
  bob = rnorm(10),
  sam = rnorm(10),
  dick = rnorm(10)
)
# A tibble: 5 x 3
bob sam dick
<dbl> <dbl> <dbl>
1 -0.560 1.72 1.22
2 -0.230 0.461 0.360
3 1.56 -1.27 0.401
4 0.0705 -0.687 0.111
5 0.129 -0.446 -0.556
the new column would be equal to c('sam', 'sam', 'bob', 'dick', 'bob'), because those columns hold the maximum values in each row of the dataset. Any thoughts?
This will work fine:
df$result = names(df)[apply(df, 1, which.max)]
More verbose, but tidyverse-friendly:
df %>%
  # tidying
  mutate(id = row_number()) %>%
  gather(name, amount, -id) %>%
  group_by(id) %>%
  arrange(id, desc(amount)) %>%
  # workhorse
  mutate(top.value = head(name, 1)) %>%
  # pivot
  spread(name, amount)
# A tibble: 10 x 5
# Groups: id [10]
id top.value bob dick sam
<int> <chr> <dbl> <dbl> <dbl>
1 1 sam -0.560 -1.07 1.22
2 2 sam -0.230 -0.218 0.360
3 3 bob 1.56 -1.03 0.401
4 4 sam 0.0705 -0.729 0.111
5 5 bob 0.129 -0.625 -0.556
6 6 sam 1.72 -1.69 1.79
7 7 dick 0.461 0.838 0.498
8 8 dick -1.27 0.153 -1.97
9 9 sam -0.687 -1.14 0.701
10 10 dick -0.446 1.25 -0.473
If you don't feel like using tidy data, try:
df %>%
  mutate(max.name = names(.)[max.col(.)])
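Note that max.col() breaks ties at random by default; pass ties.method = "first" if you need deterministic output:
df %>%
  mutate(max.name = names(.)[max.col(., ties.method = "first")])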
A data.table version for those who land on this question looking for a data.table alternative:
library(data.table)
setDT(df)
df[, m := names(df)[apply(.SD, 1, which.max)]]
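The row-wise apply() can also be replaced with vectorised max.col(); a sketch assuming all columns of .SD are numeric:
# max.col() coerces .SD to a matrix and returns the column index of each row maximum
df[, m := names(.SD)[max.col(.SD, ties.method = "first")]]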
