Join across/within nested dataframes - r

I am pulling information out of a model for eventual plotting. My desired plots are jittered original data with an overlay of mean +/- STDERR and text groupings. The model outputs put the groupings and estimates in separate dataframes within a list. I'm using map to extract those and it works, but I'm stuck with the step of joining them together.
I want to join two nested list-cols into a single table and nest that result as a new column. Best I can do currently is to unnest, join tables, nest again, and join back to original nested table.
library(agricolae)
library(tidyverse)
fitHSD2<- function(d) HSD.test(aov(mpg ~ cyl, data= d), trt = "cyl") # anova with Tukey HSD
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
)
carnestdf
I can unnest each one and join them, but I can't nest and join them back. I can in fact... just need to define the join key otherwise it tries to join based upon the nested DF and that doesn't work without the hashing below.
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "estgrp") %>%
full_join(carnestdf, ., by = "gear")
I found this: R: Join two tables (tibbles) by *list* columns
But it doesn't seem to work, I get the same error when using the hash to join. It does work, needed to define the .key in nest so it wasn't "data". Would still prefer to join without unnesting... :/
nestmerge <-
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "mergedestgrp") %>%
mutate_all(funs(hash = map_chr(., digest::digest)))
carnestdf %>%
mutate_all(funs(hash = map_chr(., digest::digest))) %>%
full_join(., nestmerge) %>%
select(-ends_with("hash"))

The answer apparently is map2:
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
, estgrp = map2(estimates, grouping, ~full_join(.x, .y, by = "trt"))
)
carnestdf
This does a full join on the two tables by "trt" and makes a new list column with the result.

Related

not getting all variables after merging three data sets

I am merging three data sets ( data frames) in R as follows:
Prv_mnth_so3 has state, Product_Number , Quantity_On_Hand , Category and Lc_Amount
Prv_mnth_soqty <- Prv_mnth_so3 %>%
filter( Category== "ESC") %>%
group_by (state,Product_Number) %>%
summarise(qty = sum(Quantity_On_Hand))
#arrange(state,Product_Number)
Prv_mnth_so_esc_amt <- Prv_mnth_so3 %>%
filter( Category== "ESC") %>%
group_by (state,Product_Number) %>%
summarise(esc = sum(as.numeric(Lc_Amount))) %>%
arrange(state,Product_Number)
Prv_mnth_so_lom_amt <- Prv_mnth_so3 %>%
filter( Category== "LOM") %>%
group_by (state,Product_Number) %>%
summarise(lom = sum(Lc_Amount))%>%
arrange(state,Product_Number)
Prv_mnth_si <- merge(Prv_mnth_soqty, Prv_mnth_so_esc_amt , Prv_mnth_so_lom_amt,
by.x = c("state","Product_Number") , by.y = c("state","Product_Number"), by.z = c("state","Product_Number"), all = TRUE) ```
in out come (Prv_mnth_si ) I expect 5 variables as - State, Product_number), qty, esc and lom but I am not gettig lom in outcome, though in Prv_mnth_so_lom_amt, I can see lom variables is there
Since you are using the tidyverse you can use one of the join functions provided by the dplyr package.
Change the last line from:
Prv_mnth_si <- merge(Prv_mnth_soqty,
Prv_mnth_so_esc_amt ,
Prv_mnth_so_lom_amt,
by.x = c("state","Product_Number") ,
by.y = c("state","Product_Number"),
by.z = c("state","Product_Number"),
all = TRUE)
to:
Prv_mnth_si <- full_join(Prv_mnth_soqty, Prv_mnth_so_esc_amt) %>%
full_join(Prvmnth_so_lom_amt)

Fit loess smoothers for multiple groups across multiple numeric variables

I need to fit many loess splines by the grouping variable (Animal) across multiple numeric columns (Var1, Var2), and extract these values.
I found code to do this task one variable at a time;
# Create dataframe 1
OneVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))))
library(dplyr)
library(tidyr)
library(purrr)
# Get fitted values from each model
Models <- OneVarDF %>%
tidyr::nest(-Animal) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = Var1 ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
# Create prediction column
Results <- Models %>%
dplyr::select(-m) %>%
tidyr::unnest()
This "Results" dataframe is essential for downstream tasks (detrending many non-parametric distributions).
How can we achieve this with a dataframe with multiple numeric columns (code below), and extract a "Results" dataframe? Thank you.
# Create dataframe 2
TwoVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))),
Var2 = c(c(replicate(1,sample(22:27,100,rep=TRUE))), c(replicate(1,sample(29:35,100,rep=TRUE)))))
We can get the data in long format using. pivot_longer, group_by Animal and column name and apply loess to each combinaton.
library(dplyr)
library(tidyr)
TwoVarDF %>%
pivot_longer(cols = starts_with('Var')) %>%
group_by(Animal, name) %>%
mutate(model = loess(value~Day, span = 0.3)$fitted)
Include a gather() function to proceed as similar to your previous code.
Models2 <- TwoVarDF %>%
gather(varName, varVal, 3:4) %>%
tidyr::nest(-Animal, -varName) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = varVal ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)

Trying to repeat the same code for multiple dataframes

I have the follwing code that takes a dataframe called dft1 and then produces a resulting dataframe called dfb1. I want to repeat the same code for multiple input dataframes such as dft1, dft2 all indexed by a number towards the end and then store the results using the same pattern i.e. dfb1, dfb2, ....
I have tried many methods such as using dapply or for loops but given the nature of the code inside I wasn't able to get the intended results.
#define the function for rolling
window <- 24
rolling_lm <-
rollify(.f = function(R_excess, MKT_RF, SMB, HML) {
lm(R_excess ~ MKT_RF + SMB + HML)
}, window = window, unlist = FALSE)
#rolling over the variable
dfb1 <-
dft1 %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
Add the command you want to apply to each dataframe in a function
apply_fun <- function(df) {
df %>%
mutate(rolling_ff =
rolling_lm(R_excess,
MKT_RF,
SMB,
HML)) %>%
mutate(tidied = map(rolling_ff,
tidy,
conf.int = T)) %>%
unnest(tidied) %>%
slice(-1:-23) %>%
select(date, term, estimate, conf.low, conf.high) %>%
filter(term != "(Intercept)") %>%
rename(beta = estimate, factor = term) %>%
group_by(factor)
}
Now apply the function to each dataframe and store the results in a list
n <- 10
out <- setNames(lapply(mget(paste0("dft", 1:n)), apply_fun), paste0("dfb", 1:n))
Assuming you have input dataframes like dft1, dft2...this will output a list of dataframes which you can now access doing out[['dfb1']], out[['dfb2']] and so on. Change the value of n based on number of dft dataframes you have.
If the data is already present in a list we can avoid mget by doing
setNames(lapply(result, apply_fun), paste0("dfb", 1:n))

R Many Models Multiple Functions

I've written a routine that extracts information from lmer models to compute the ICC and get the LRT from lmerTest's ranova function. What I have below works but I suspect it could be improved by (a) combining the two functions into one and returning a list, but I can't seem to access the list elements with purrr's map function, and (b) using multiple mutate/purrr lines to get all the needed data in one place rather than having to join later. My code follows using the "Peet" dataset provided in Hox (2002) and available at the UCLA IDRE site:
library(foreign)
library(lme4)
library(tidyverse)
library(purrr)
#Peet family data described and used in Hox
peet.dat<-read.dta("https://stats.idre.ucla.edu/stat/stata/examples/mlm_ma_hox/peetmis.dta")
names(peet.dat)
#convert to long format
peet.long.dat <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
arrange(type)
names(peet.long.dat)
#need two functions, one for the MLM estimates and the other for
#ranova p-test for variance--merge later by type
aov_model <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
}
aov_test <- function(df) {
lmr.model <- lmerTest::lmer(score~ 1 + (1|family), data=df)
ll.test <- lmerTest::ranova(lmr.model)
}
#get the model estimates
models <- peet.long.dat %>%
nest(-type) %>%
mutate(aov_obj = map(data, aov_model),
summaries = map(aov_obj, broom.mixed::tidy)) %>%
unnest(summaries, .drop = T) %>%
select(type, effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`, residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
models
#get the ranova LRTs
tests <- peet.long.dat %>%
nest(-type) %>%
mutate(test_obj = map(data, aov_test),
test_summaries = map(test_obj, broom.mixed::tidy)) %>%
unnest(test_summaries, .drop = T) %>%
filter(!is.na(LRT))
#join estimates with LRT p values
models %>% left_join(tests[c("type","p.value")])
Any help greatly appreciated.
I think the key here is to split() your data.frame based on the variable type:
# convert to list by type
peet.ls <- peet.dat %>%
tidyr::gather(type, score, -family,-sex,-person) %>%
split(.$type)
# map to fit models on subsets and return summaries
peet.ls %>%
map(function(df.x) {
# fit the model
lmr_model <- lmerTest::lmer(score~ 1 + (1|family), data = df.x)
#get the model estimates
mlm_est <- lmr_model %>%
broom.mixed::tidy() %>%
select(effect, estimate, term) %>%
filter(effect != "fixed") %>%
mutate(variance = estimate^2) %>%
select(-estimate, -effect) %>%
spread(term, variance) %>%
rename(group.var = `sd__(Intercept)`,
residual = `sd__Observation`) %>%
mutate(ICC = group.var/(group.var+residual))
# get the ranova LRTs & add to other estimates
mlm_est$p.value <- lmr_model %>%
lmerTest::ranova() %>%
broom.mixed::tidy() %>%
filter(!is.na(LRT)) %>%
pull(p.value)
# return summaries
mlm_est
}) %>%
# combine data.frames and add the variable 'type'
bind_rows(.id = "type") %>%
select(type, everything())

Transpose data frame variables and add null, unique counts in [r]

I am trying to build a summary table of a data frame like DataProfile below.
The idea is to transform each column into a row and add variables for count, nulls, not nulls, unique, and add additional mutations of those variables.
It seems like there should be a better faster way to do this. Is there a function that does this?
#trying to write the functions within dplyr & magrittr framework
library(tidyverse)
mtcars[2,2] <- NA # Add a null to test completeness
#
total <- mtcars %>% summarise_all(funs(n())) %>% melt
nulls <- mtcars %>% summarise_all(funs(sum(is.na(.)))) %>% melt
filled <- mtcars %>% summarise_all(funs(sum(!is.na(.)))) %>% melt
uniques <- mtcars %>% summarise_all(funs(length(unique(.)))) %>% melt
mtcars %>% summarise_all(funs(n_distinct(.))) %>% melt
#Build a Data Frame from names of mtcars and add variables with mutate
DataProfile <- as.data.frame(names(mtcars))
DataProfile <- DataProfile %>% mutate(Total = total$value,
Nulls = nulls$value,
Filled = filled $value,
Complete = Filled/Total,
Cardinality = uniques$value,
Uniqueness = Cardinality/Total,
Distinctness = Cardinality/Filled)
DataProfile
#These are other attempts with Base R, but they are harder to read and don't play well with summarise_all
sapply(mtcars, function(x) length(unique(x[!is.na(x)]))) %>% melt
rapply(mtcars,function(x)length(unique(x))) %>% melt
The summarise_all() function can process more than one function at a time, so you can consolidate code by doing it in one pass then formatting your data to get to the type of "profile" per variable that you want.
library(tidyverse)
mtcars[2,2] <- NA # Add a null to test completeness
DataProfile <- mtcars %>%
summarise_all(funs("Total" = n(),
"Nulls" = sum(is.na(.)),
"Filled" = sum(!is.na(.)),
"Cardinality" = length(unique(.)))) %>%
melt() %>%
separate(variable, into = c('variable', 'measure'), sep="_") %>%
spread(measure, value) %>%
mutate(Complete = Filled/Total,
Uniqueness = Cardinality/Total,
Distinctness = Cardinality/Filled)
DataProfile

Resources