Trying to repeat the same code for multiple dataframes - r

I have the follwing code that takes a dataframe called dft1 and then produces a resulting dataframe called dfb1. I want to repeat the same code for multiple input dataframes such as dft1, dft2 all indexed by a number towards the end and then store the results using the same pattern i.e. dfb1, dfb2, ....
I have tried many methods such as using dapply or for loops but given the nature of the code inside I wasn't able to get the intended results.
#define the function for rolling
window <- 24
rolling_lm <-
rollify(.f = function(R_excess, MKT_RF, SMB, HML) {
lm(R_excess ~ MKT_RF + SMB + HML)
}, window = window, unlist = FALSE)
#rolling over the variable
dfb1 <-
dft1 %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)

Add the command you want to apply to each dataframe in a function
apply_fun <- function(df) {
df %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
}
Now apply the function to each dataframe and store the results in a list
n <- 10
out <- setNames(lapply(mget(paste0("dft", 1:n)), apply_fun), paste0("dfb", 1:n))
Assuming you have input dataframes like dft1, dft2...this will output a list of dataframes which you can now access doing out[['dfb1']], out[['dfb2']] and so on. Change the value of n based on number of dft dataframes you have.
If the data is already present in a list we can avoid mget by doing
setNames(lapply(result, apply_fun), paste0("dfb", 1:n))

Related

Creating indices using means of variables in R

I am trying to create an index of a set of variables by taken the mean of the selected variables using the following code:
data <- data %>%
group_by(country) %>%
# Standardize each component/measure
mutate(
std_var1 = standardize(var1, Z),
std_var2 = standardize(var2, Z),
std_var3 = standardize(var3, Z),
std_var4 = standardize(var4, Z)
) %>%
ungroup() %>%
dplyr::select(std_var1,
std_var2,
std_var3,
std_var4) %>%
# Average all z scores for an individual
mutate(index = pmap_dbl(., ~ mean(c(...), na.rm = T))) %>%
cbind(data, .) %>% unnest() %>%
I also use the idx_mean package that takes the following syntax:
mutate(data, idx_var = idx_mean(std_var1, std_var2, std_var3, std_var4))
and obtain similar but not exactly the same index values (not just a matter of rounding).
Is there one approach that seems more accurate here?
The 4th and 5th columns display index values created by the idx function (4th column) and the other approach (5th column.)

Passing arguments dynamically in Expss tables with user-defined functions

I have a (new) question related to expss tables. I wrote a very simple UDF (that relies on few expss functions), as follows:
library(expss)
z_indices <- function(x, m_global, std_global, weight=NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (w_mean(x, weight)-m_global)/std_global
indices <- 100+(z*100)
return(indices)
}
Reproducible example, based on infert dataset (plus a vector of arbitrary weights):
data(infert)
infert$w <- as.vector(x=rep(2, times=nrow(infert)), mode='numeric')
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(infert$age, infert$w),std_global=w_sd(infert$age, infert$w))
}) %>%
tab_pivot(stat_position="inside_columns")
The table is computed and the output for the first line is (almost) as expected.
Then things go messy for the second line, since both arguments of z_indices explicitely refer to infert$age, where infert$parity is expected.
My question: is there a way to dynamically pass the variables of tab_cells as function argument within tab_stat_fun to match the variable being processed? I guess this happens inside function declaration but have not clue how to proceed...
Thanks!
EDIT April 28th 2020:
Answer from #Gregory Demin works great in the scope of infert dataset, although for better scalability to larger dataframes I wrote the following loop:
var_df <- data.frame("age"=infert$age, "parity"=infert$parity)
tabZ=infert
for(each in names(var_df)){
tabZ = tabZ %>%
tab_cells(var_df[each]) %>%
tab_cols(total(), education) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(var_df[each], infert$w),std_global=w_sd(var_df[each], infert$w))
})
}
tabZ = tabZ %>% tab_pivot()
Hope this inspires other expss users in the future!
There is no universal solution for this case. Function in the tab_stat_fun is always calculated inside cell so you can't get global values in it.
However, in your case we can calculate z-index before summarizing. Not so flexible solution but it works:
# function for weighted z-score
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
tab_cells(age = w_z_index(age, w), parity = w_z_index(parity, w)) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")
UPDATE.
A little more scalable approach:
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
w_z_index_df = function(df, weight = NULL){
df[] = lapply(df, w_z_index, weight = weight)
df
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
# we process a lot of variables at once
tab_cells(w_z_index_df(data.frame(age, parity))) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")

How to calculate p.value of each column in a data frame with NA values using shapiro.test in r?

This is what I have tried so far. It works, but it only tells me the p.value of the data that has no NA's. Much of my data has NA values in a few places up to 1/3rd of the data.
normal <- apply(cor_phys, 2, function(x) shapiro.test(x)$p.value)
I want to try adding na.rm to the function, but it's not working. Help?
#calculate the correlations between all variables
corres <- cor_phys %>% #cor_phys is my data
as.matrix %>%
cor(use="complete.obs") %>% #complete.obs does not use NA
as.data.frame %>%
rownames_to_column(var = 'var1') %>%
gather(var2, value, -var1)
#removes duplicates correlations
corres <- corres %>%
mutate(var_order = paste(var1, var2) %>%
strsplit(split = ' ') %>%
map_chr( ~ sort(.x) %>%
paste(collapse = ' '))) %>%
mutate(cnt = 1) %>%
group_by(var_order) %>%
mutate(cumsum = cumsum(cnt)) %>%
filter(cumsum != 2) %>%
ungroup %>%
select(-var_order, -cnt, -cumsum) #removes unneeded columns
I did not write this myself, but it is the answer that I used and worked for my needs. The link to the page I used is: How to compute correlations between all columns in R and detect highly correlated variables

Save intermediate list output in dplyr pipeline and map it back to another list further down the pipeline - R

I am running pcas on groups in a data set using dplyr pipelines. I am starting with group_split, so am working with a list. In order to run the prcomp() function, only the numeric columns of each list can be included, but I would like the factor column brought back in for plotting at the end. I have tried saving an intermediate output using {. ->> temp} partway through the pipeline, but since it is a list, I don't know how to index the grouping column when plotting.
library(tidyverse)
library(ggbiplot)
iris %>%
group_split(Species, keep = T) %>% #group by species, one pca per species
{. ->> temp} %>% # save intermediate output to preserve species column for use in plotting later
map(~.x %>% select_if(is.numeric) %>% select_if(~var(.) != 0) %>%
prcomp(scale. = TRUE))%>% #run pca on numeric columns only
map(~ggbiplot(.x), label=temp$Species)#plot each pca, labeling points as species names form the temporary object
This works to produce one pca plot for each species in the irisdata set, but since temp$species = NULL, the points are not labelled.
If you use map2() and pass the .y argument as the species list you can get the result I think you want. Note that in your original code the labels argument was outside the ggbiplot() function and was ignored.
library(tidyverse)
library(ggbiplot)
iris %>%
group_split(Species, keep = T) %>%
{. ->> temp} %>%
map(~.x %>%
select_if(is.numeric) %>%
select_if(~var(.) != 0) %>%
prcomp(scale. = TRUE)) %>%
map2(map(temp, "Species"), ~ggbiplot(.x, labels = .y))
In response to your comment, if you wanted to add a third argument you could use pmap() instead of map2(). In the example below, pmap() is being passed a (nested) list of the data for the ggbiplot() arguments. Note I've changed the new variable so that it's a factor and not constant across groups.
iris %>%
mutate(new = factor(sample(1:3, 150, replace = TRUE))) %>%
group_split(Species, keep = T) %>%
{. ->> temp} %>%
map(~.x %>%
select_if(is.numeric) %>%
select_if(~var(.) != 0) %>%
prcomp(scale. = TRUE)) %>%
list(map(temp, "Species"), map(temp, "new")) %>%
pmap(~ ggbiplot(pcobj = ..1, labels = ..2, groups = ..3))
One option is to use split and imap
library(tidyverse)
library(ggbiplot)
iris %>%
split(.$Species) %>% # save intermediate output to preserve species column for use in plotting later
map(~.x %>% select_if(is.numeric) %>% select_if(~var(.) != 0) %>%
prcomp(scale. = TRUE)) %>%
imap(~ggbiplot(.x, labels = .y))

R Many Models Multiple Functions

I've written a routine that extracts information from lmer models to compute the ICC and get the LRT from lmerTest's ranova function. What I have below works but I suspect it could be improved by (a) combining the two functions into one and returning a list, but I can't seem to access the list elements with purrr's map function, and (b) using multiple mutate/purrr lines to get all the needed data in one place rather than having to join later. My code follows using the "Peet" dataset provided in Hox (2002) and available at the UCLA IDRE site:
library(foreign)
library(lme4)
library(tidyverse)
library(purrr)
#Peet family data described and used in Hox
peet.dat<-read.dta("https://stats.idre.ucla.edu/stat/stata/examples/mlm_ma_hox/peetmis.dta")
names(peet.dat)
#convert to long format
peet.long.dat <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
arrange(type)
names(peet.long.dat)
#need two functions, one for the MLM estimates and the other for
#ranova p-test for variance--merge later by type
aov_model <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
}
aov_test <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
ll.test <- lmerTest::ranova(lmr.model)
}
#get the model estimates
models <- peet.long.dat %>%
nest(-type) %>%
mutate(aov_obj = map(data, aov_model),
summaries = map(aov_obj, broom.mixed::tidy)) %>%
unnest(summaries, .drop = T) %>%
select(type, effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`, residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
models
#get the ranova LRTs
tests <- peet.long.dat %>%
nest(-type) %>%
mutate(test_obj = map(data, aov_test),
test_summaries = map(test_obj, broom.mixed::tidy)) %>%
unnest(test_summaries, .drop = T) %>%
filter(!is.na(LRT))
#join estimates with LRT p values
models %>% left_join(tests[c("type","p.value")])
Any help greatly appreciated.
I think the key here is to split() your data.frame based on the variable type:
# convert to list by type
peet.ls <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
split(.$type)
# map to fit models on subsets and return summaries
peet.ls %>%
map(function(df.x) {
# fit the model
lmr_model <- lmerTest::lmer(score~ 1 + (1|family), data = df.x)
#get the model estimates
mlm_est <- lmr_model %>%
broom.mixed::tidy() %>%
select(effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`,
residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
# get the ranova LRTs & add to other estimates
mlm_est$p.value <- lmr_model %>%
lmerTest::ranova() %>%
broom.mixed::tidy() %>%
filter(!is.na(LRT)) %>%
pull(p.value)
# return summaries
mlm_est
}) %>%
# combine data.frames and add the variable 'type'
bind_rows(.id = "type") %>%
select(type, everything())

Resources