I've written a routine that extracts information from lmer models to compute the ICC and get the LRT from lmerTest's ranova() function. What I have below works, but I suspect it could be improved by (a) combining the two functions into one and returning a list, though I can't seem to access the list elements with purrr's map() function, and (b) using multiple mutate/purrr lines to get all the needed data in one place rather than having to join later. My code follows, using the "Peet" dataset provided in Hox (2002) and available at the UCLA IDRE site:
library(foreign)
library(lme4)
library(tidyverse)
library(purrr)
#Peet family data described and used in Hox
peet.dat<-read.dta("https://stats.idre.ucla.edu/stat/stata/examples/mlm_ma_hox/peetmis.dta")
names(peet.dat)
#convert to long format
peet.long.dat <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
arrange(type)
names(peet.long.dat)
#need two functions, one for the MLM estimates and the other for
#ranova p-test for variance--merge later by type
aov_model <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
}
aov_test <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
ll.test <- lmerTest::ranova(lmr.model)
}
#get the model estimates
models <- peet.long.dat %>%
nest(-type) %>%
mutate(aov_obj = map(data, aov_model),
summaries = map(aov_obj, broom.mixed::tidy)) %>%
unnest(summaries, .drop = T) %>%
select(type, effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`, residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
models
#get the ranova LRTs
tests <- peet.long.dat %>%
nest(-type) %>%
mutate(test_obj = map(data, aov_test),
test_summaries = map(test_obj, broom.mixed::tidy)) %>%
unnest(test_summaries, .drop = T) %>%
filter(!is.na(LRT))
#join estimates with LRT p values
models %>% left_join(tests[c("type","p.value")])
Any help greatly appreciated.
I think the key here is to split() your data.frame based on the variable type:
# convert to list by type
peet.ls <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
split(.$type)
# map to fit models on subsets and return summaries
peet.ls %>%
map(function(df.x) {
# fit the model
lmr_model <- lmerTest::lmer(score~ 1 + (1|family), data = df.x)
#get the model estimates
mlm_est <- lmr_model %>%
broom.mixed::tidy() %>%
select(effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`,
residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
# get the ranova LRTs & add to other estimates
mlm_est$p.value <- lmr_model %>%
lmerTest::ranova() %>%
broom.mixed::tidy() %>%
filter(!is.na(LRT)) %>%
pull(p.value)
# return summaries
mlm_est
}) %>%
# combine data.frames and add the variable 'type'
bind_rows(.id = "type") %>%
select(type, everything())
Using the slidify() function from the timetk package to create lm_roll(), how do I retrieve the coefficients calculated by lm_roll() and calculate the slope? In the code below, the last mutate() doesn't work. Thanks!
library(timetk)
library(tidyverse)
library(tidyquant)
lm_roll <- slidify(~lm(.x ~ .y), .period = 90, .unlist = FALSE, .align = "right")
FB <- FANG %>% filter(symbol == "FB")
FB %>%
drop_na() %>%
mutate(numeric_date = as.numeric(date)) %>%
mutate(rolling_lm = lm_roll(adjusted,numeric_date)) %>%
filter(!is.na(rolling_lm)) %>%
mutate(intercept = coef(rolling_lm)[1],
numeric_date_index = coef(rolling_lm)[2],
slope = coef(rolling_lm)[1]/coef(rolling_lm)[2])
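Not a definitive answer, but one way to pull the coefficients out of the list column (a sketch, assuming lm_roll() returns a list of lm fits as defined above) is to map over rolling_lm instead of calling coef() on the whole column:
# sketch: extract per-window coefficients with purrr, then form the same ratio as in the question
FB %>%
drop_na() %>%
mutate(numeric_date = as.numeric(date)) %>%
mutate(rolling_lm = lm_roll(adjusted, numeric_date)) %>%
filter(!is.na(rolling_lm)) %>%
mutate(intercept = purrr::map_dbl(rolling_lm, ~ coef(.x)[1]),
numeric_date_index = purrr::map_dbl(rolling_lm, ~ coef(.x)[2]),
slope = intercept/numeric_date_index)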
I need to fit many loess splines by the grouping variable (Animal) across multiple numeric columns (Var1, Var2), and extract these values.
I found code to do this task one variable at a time:
# Create dataframe 1
OneVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))))
library(dplyr)
library(tidyr)
library(purrr)
# Get fitted values from each model
Models <- OneVarDF %>%
tidyr::nest(-Animal) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = Var1 ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
# Create prediction column
Results <- Models %>%
dplyr::select(-m) %>%
tidyr::unnest()
This "Results" dataframe is essential for downstream tasks (detrending many non-parametric distributions).
How can we achieve this with a dataframe with multiple numeric columns (code below), and extract a "Results" dataframe? Thank you.
# Create dataframe 2
TwoVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))),
Var2 = c(c(replicate(1,sample(22:27,100,rep=TRUE))), c(replicate(1,sample(29:35,100,rep=TRUE)))))
We can get the data in long format using pivot_longer(), group_by Animal and the column name, and apply loess to each combination.
library(dplyr)
library(tidyr)
TwoVarDF %>%
pivot_longer(cols = starts_with('Var')) %>%
group_by(Animal, name) %>%
mutate(model = loess(value~Day, span = 0.3)$fitted)
Include a gather() step to proceed similarly to your previous code.
Models2 <- TwoVarDF %>%
gather(varName, varVal, 3:4) %>%
tidyr::nest(-Animal, -varName) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = varVal ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
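To extract a Results-style dataframe from Models2, the same unnest step as in your single-variable code should work (a sketch mirroring that earlier code; Results2 is just an illustrative name):
# drop the model objects and unnest the fitted values, as before
Results2 <- Models2 %>%
dplyr::select(-m) %>%
tidyr::unnest()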
I'm trying to do a Wilcoxon test on long-formatted data. I want to use dplyr::group_by() to specify the subsets I'd like to do the test on.
The final result would be a new column with the p-value of the Wilcoxon test appended to the original data frame. All of the techniques I have seen require summarizing the data frame. I DO NOT want to summarize the data frame.
Please see an example reformatting the iris dataset to mimic my data, and finally my attempts to perform the task.
I am getting close, but I want to preserve all of my original data from before the Wilcoxon test.
# Reformatting Iris to mimic my data.
library(tidyverse)  # for gather(), mutate(), str_extract(), etc.
long_format <- iris %>%
gather(key = "attribute", value = "measurement", -Species) %>%
mutate(descriptor =
case_when(
str_extract(attribute, pattern = "\\.(.*)") == ".Width" ~ "Width",
str_extract(attribute, pattern = "\\.(.*)") == ".Length" ~ "Length")) %>%
mutate(Feature =
case_when(
str_extract(attribute, pattern = "^(.*?)\\.") == "Sepal." ~ "Sepal",
str_extract(attribute, pattern = "^(.*?)\\.") == "Petal." ~ "Petal"))
# Removing no longer necessary column.
cleaned_up <- long_format %>% select(-attribute)
# Attempt using do(), but I lose important info like "measurement"
cleaned_up %>%
group_by(Species, Feature) %>%
do(w = wilcox.test(measurement~descriptor, data=., paired=FALSE)) %>%
mutate(Wilcox = w$p.value)
# This is an attempt with the dplyr experimental group_map function. If only I could just make this a new column appended to the original df in one step.
cleaned_up %>%
group_by(Species, Feature) %>%
group_map(~ wilcox.test(measurement~descriptor, data=., paired=FALSE)$p.value)
Thanks for your help.
The model object can be wrapped in a list:
library(tidyverse)
cleaned_up %>%
group_by(Species, Feature) %>%
nest %>%
mutate(model = map(data, ~
.x %>%
transmute(w = list(wilcox.test(measurement~descriptor,
data=., paired=FALSE)))))
Or another option is group_split() into a list, then map through the list elements, creating the 'pval' column after applying the model.
cleaned_up %>%
group_split(Species, Feature) %>%
map_dfr(~ .x %>%
mutate(pval = wilcox.test(measurement~descriptor,
data=., paired=FALSE)$p.value))
Another option is to avoid the data argument entirely. The wilcox.test function only requires a data argument when the variables being tested aren't in the calling scope, but functions called within mutate have all the columns from the data frame in scope.
cleaned_up %>%
group_by(Species, Feature) %>%
mutate(pval = wilcox.test(measurement~descriptor, paired=FALSE)$p.value)
Same as akrun's output (thanks to his correction in the comments above)
akrun <-
cleaned_up %>%
group_split(Species, Feature) %>%
map_dfr(~ .x %>%
mutate(pval = wilcox.test(measurement~descriptor,
data=., paired=FALSE)$p.value))
me <-
cleaned_up %>%
group_by(Species, Feature) %>%
mutate(pval = wilcox.test(measurement~descriptor, paired=FALSE)$p.value)
all.equal(akrun, me)
# [1] TRUE
I have the following code that takes a dataframe called dft1 and produces a resulting dataframe called dfb1. I want to repeat the same code for multiple input dataframes such as dft1, dft2, all indexed by a number towards the end, and then store the results using the same pattern, i.e. dfb1, dfb2, ....
I have tried several methods, such as apply-family functions or for loops, but given the nature of the code inside I wasn't able to get the intended results.
#define the function for rolling
library(tibbletime)  # rollify() comes from tibbletime
library(tidyverse)   # mutate(), map(), unnest(), etc.
library(broom)       # tidy()
window <- 24
rolling_lm <-
rollify(.f = function(R_excess, MKT_RF, SMB, HML) {
lm(R_excess ~ MKT_RF + SMB + HML)
}, window = window, unlist = FALSE)
#rolling over the variable
dfb1 <-
dft1 %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
Wrap the commands you want to apply to each dataframe in a function:
apply_fun <- function(df) {
df %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
}
Now apply the function to each dataframe and store the results in a list
n <- 10
out <- setNames(lapply(mget(paste0("dft", 1:n)), apply_fun), paste0("dfb", 1:n))
Assuming you have input dataframes like dft1, dft2, ..., this will output a list of dataframes which you can then access via out[['dfb1']], out[['dfb2']], and so on. Change the value of n based on the number of dft dataframes you have.
If the data is already present in a list we can avoid mget by doing
setNames(lapply(result, apply_fun), paste0("dfb", 1:n))
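For what it's worth, a purrr equivalent of that last line (purely a style choice; it assumes the list is called result as above) would be:
# sketch: same result with purrr instead of base lapply()/setNames()
out <- purrr::map(result, apply_fun) %>%
purrr::set_names(paste0("dfb", seq_along(result)))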
I am looking to create a clean dataframe with reordered columns out of linear model (lm) results; my eventual aim is to write the dataframe to Excel to chart and audit model residuals. First, the sample data:
df1 <- cbind.data.frame(dt = seq.Date(as.Date('2019-01-01'),
as.Date('2019-01-10'),
by = 'day' ),
depVar = rnorm(10,2,1),
indepVar1 = rnorm(10,4,3),
indepVar2 = rnorm(10,7,2)
)
Now run the model:
modRes <- lm(depVar~ indepVar1, data=df1)
avf1 <- broom::augment(modRes)
library(dplyr)
avf1 <- avf1 %>%
# drop what we don't need
select(-c(.se.fit, .hat, .sigma, .cooksd, .std.resid)) %>%
cbind(df1)
The above runs well, but I want to add another pipe %>% with select(dt, everything()) so I can reorder the columns. The code below returns an error:
avf1 <- avf1 %>%
select(-c(.se.fit, .hat, .sigma, .cooksd, .std.resid)) %>% # drop what we don't need
cbind(df1) %>%
select(dt, everything())
Error: Can't bind data because some arguments have the same name
Call `rlang::last_error()` to see a backtrace
Why is this failing?
The cbind() likely trips up the later select() because it leaves duplicated column names (the predictors appear in both the augment() output and df1); bind_cols() repairs duplicate names, so the reordering select() then works:
avf1 <- modRes %>%
broom::augment() %>%
select(-c(.se.fit, .hat, .sigma, .cooksd, .std.resid)) %>%
bind_cols(df1) %>%
select(dt, everything())