Related
I have a dataset with 29 variables and I have tried to see how they are correlated using
cor().
This has given me a 29X29 matrix with the p-value for each product pair. Most of these correlations are insignificant, and I want only to retain the instances where the p-value is significant for 2 specific variables.
Here is a toy example, suppose that I want to maintain only the variables there are significant correlated with mpg, i.g., cor_pmat(mpg, other_variables) < 0.05).
library(ggcorrplot)
p.mat <- cor_pmat(mtcars)
corr <- round(cor(mtcars), 2)
Any hint on how can I do that?
Here is a function to select on the data frame:
library(dplyr)
library(rlang)
library(broom)
select_via_cor_sig <- function(.data, x, p.value, ...) {
x <- rlang::ensym(x)
.data %>%
dplyr::select(-dplyr::all_of(x)) %>%
names() %>%
lapply(function(candidate) {
c(rlang::as_string(x), candidate)
}) -> ls_pairs
ls_pairs %>%
lapply(function(vec_pair) {
x <- .data[[vec_pair[1]]]
y <- .data[[vec_pair[2]]]
cor.test(x, y, ...) %>%
broom::tidy() %>%
dplyr::mutate(v1 = vec_pair[1], v2 = vec_pair[2]) %>%
dplyr::select(v1, v2, dplyr::everything())
}) %>%
dplyr::bind_rows() -> tbl_tidy_cor_test
tbl_tidy_cor_test %>%
dplyr::filter(p.value < {{p.value}}) %>%
dplyr::pull(v2) %>%
c(rlang::as_string(x), .) -> keepers
.data %>%
dplyr::select(dplyr::all_of(keepers))
}
# use it like so:
select_via_cor_sig(mtcars, mpg, 0.001)
If you want the p-value matrix, you could run it on the subset data frame produced by this function.
I have the following working code that utilizes two lists to produce simulation output:
strategy_list <- list("s_Win","s_WinH1", "s_WinH2", "s_WinH1F1", "s_WinH2F2", "s_WinDerEx")
function_list <- list(s_win, s_winH1, s_winH2, s_winH1F1, s_winH2F2, s_winDerEx)
l <- list(strategy_list, function_list)
simulation <- pmap_dfc(l, ~ df %>%
transmute(!! .x := .y(entries, skill, field, win_payoff, wager_amt, Winner, exacta_payoff))) %>%
bind_cols(df, .)
Now I would like to run the simulation at several different skill levels, so I have added a loop and have tried to replace the skill input with i from the loop to create several variations of the simulation:
for (i in seq(from = 0.15, to=0.30, by=0.05)){
skill_list <- list(i, i, i, i, i, i)
strategy_list <- list("s_Win","s_WinH1", "s_WinH2", "s_WinH1F1", "s_WinH2F2", "s_WinDerEx")
function_list <- list(s_win, s_winH1, s_winH2, s_winH1F1, s_winH2F2, s_winDerEx)
l <- list(skill_list, strategy_list, function_list)
simulation <- pmap_dfc(l, ~ df %>%
transmute(!! .w !! .x := .y(entries, i, field, win_payoff, wager_amt, Winner, exacta_payoff))) %>%
bind_cols(df, .)
}
Unfortunately, this is producing an error. I have tried several variations, but can't seem to get the code to work.
EDIT:
Based upon Atem's post below, I have updated my code as follows:
for (i in seq(from = 0.15, to=0.30, by=0.05)){
strategy_list <- list("s_Win","s_WinH1", "s_WinH2", "s_WinH1F1", "s_WinH2F2", "s_WinDerEx") %>% stringr::str_c(i)
function_list <- list(s_win, s_winH1, s_winH2, s_winH1F1, s_winH2F2, s_winDerEx)
skill_list <- list(i, i, i, i, i, i)
l <- list(strategy_list, function_list, skill_list)
simulation <- pmap_dfc(l, ~ df %>%
transmute(!! ..1 := ..2 (entries, ..3, field, win_payoff, wager_amt, Winner, exacta_payoff))) %>%
bind_cols(df, .) %>%
Unfortunately, this is still producing an error. The problem appears to be with ..2 as this does not receive the same syntax highlighting as ..1 and ..3.
EDIT 2:
To make this a bit simpler I have put together a simplified version of my question and included a reprex. Simulation1 with two lists works fine. Simulation2 with three lists and the loop fails with the error message: could not find function "..2".
``` r
library(tidyverse)
z <- 5
df <- tibble(x=1:10, y=1:10)
s_win <- function(x,y,z){
a <-rnorm(x) + x + y + 1 +z
a
}
s_win1 <- function(x,y,z){
b <- rnorm(x) + x + y + 2 + z
b
}
s_win2 <- function(x,y,z){
c <- rnorm(x) + x + y + 3 +z
c
}
# Simulation1 with two list works.
strategy_list <- list("s_Win","s_Win1", "s_Win2")
function_list <- list(s_win, s_win1, s_win2)
l <- list(strategy_list, function_list)
simulation1 <- pmap_dfc(l, ~ df %>%
transmute(!! .x := .y (x, y, z))) %>%
bind_cols(df, .) %>%
pivot_longer(
cols = starts_with("s_"),
names_to = "Strategy",
names_prefix = "s_",
values_to = "Value",
values_drop_na = TRUE
)
View(simulation1)
# Simulation 2 with thre list does not work. Error message = could not find function "..2"
for (i in seq(from = 5, to=20, by=5)){
strategy_list <- list("s_Win","s_Win1", "s_Win2") %>% stringr::str_c(i)
function_list <- list(s_win, s_win1, s_win2)
skill_list <- list(i, i, i)
l <- list(strategy_list, function_list, skill_list)
simulation2 <- pmap_dfc(l, ~ df %>%
transmute(!! ..1 := ..2 (x, y, ..3))) %>%
bind_cols(df, .) %>%
pivot_longer(
cols = starts_with("s_"),
names_to = "Strategy",
names_prefix = "s_",
values_to = "Value",
values_drop_na = TRUE
)
}
#> Error: Problem with `mutate()` input `s_Win5`.
#> x could not find function "..2"
#> i Input `s_Win5` is `..2(x, y, ..3)`.
View(simulation2)
#> Error in as.data.frame(x): object 'simulation2' not found
```
Created on 2020-11-25 by the reprex package (v0.3.0)
The column names are stored in strategy_list, which is where you will want to incorporate i:
strategy_list <- list("s_Win","s_WinH1", "s_WinH2",
"s_WinH1F1", "s_WinH2F2", "s_WinDerEx") %>%
stringr::str_c(i)
Because you now have three lists in l, you will also want to switch to using ..1, ..2, etc. instead of .x and .y (which are only appropriate for two sets of arguments):
simulation <- pmap_dfc(l, ~ df %>%
transmute(!! ..1 := rlang::exec(..2, entries, ..3, field, win_payoff,
wager_amt, Winner, exacta_payoff))) %>%
bind_cols(df, .)
Minor note: The !! operator is known as "unquoting". Without it, transmute would create a column called .x instead of using the names stored in .x. Here's an example that demonstrates the difference:
x <- "result"
mtcars %>% transmute( x = "Hello World" )
# x
# 1 Hello World
# 2 Hello World
# ...
mtcars %>% transmute( !!x := "Hello World" )
# result
# 1 Hello World
# 2 Hello World
# ...
EDIT to address the ..2 issue: For some reason, pmap has issues with interpreting ..2 as containing a function. A simple workaround is to use rlang::exec to execute that function with the given arguments:
simulation2 <- pmap_dfc(l, ~ df %>%
transmute(!! ..1 := rlang::exec(..2, x, y, ..3))) %>%
# ... as before
I updated the original answer above as well.
I have a data.frame called fd with 406 rows and 48 columns. For each row in fd I want to compute ntiles (sixtiles). I do this the following way:
quant <- apply(fd, 1, function(x) quantile(t(x), probs = c(1/6, 2/6, 0.5, 4/6, 5/6), na.rm = TRUE ))
What I now want to do, is split my original data into 6 new dataframes, i.e. fd1 to fd6, where in fd1 I have all the observations of the first sixtile, in fd2 I have all the observations of the second sixtile and so on. Again, I want to do this rowwise. Meaning, I want my algorithm/function to look at each row of fd and do the following:
Take all the observations of the first sixtile, in the first row of fd, and store them into the first row of fd1, then take the first sixtile of the second row of fd and store them in the second row of fd1.
Important to note: I do not have observations for each row and column, so in some I have missing data (NA)
Could anybody give hints on how I can achieve this?
Thanks in advance.
Old school solution using matrix, list and nested loops.
# some artifical data with missings
set.seed(123)
fd <- data.frame(matrix(rnorm(406*48), nrow = 406, ncol = 48))
diag(fd) <- NA
# quant
quant <- apply(fd, 1, function(x)
quantile(t(x), probs = (0:6)/6, na.rm = TRUE, type = 6)
)
#matrix with selection
res <- list()
for (i in 1:6) {
mm <- matrix(NA, nrow = nrow(fd), ncol = ncol(fd)/6)
for (j in 1:nrow(fd)) {
lwr <- (quant[(i),j] < fd[j,])
upr <- (fd[j,] <= quant[(i+1),j])
if (i == 1)
z_j <- fd[j,][ upr ]
else
z_j <- fd[j,][ lwr & upr ]
z_j <- z_j[!is.na(z_j)]
mm[j,1:length(z_j)] <- sort(z_j)
}
res[[i]] <- mm
}
rm(i, mm, j, lwr, upr)
fd1 <- res[[1]]
Here is a relatively shorter way of achieving this using purrr and dplyr packages:
library(dplyr)
library(purrr)
# some random example
df <- data.frame(matrix(runif(48),405,48))
df[3,5] <- NA
df[10,25:26] <- NA
quant <- apply(df, 1, function(x) aa <- quantile(t(x), probs = c(1/6, 2/6, 3/6, 4/6, 5/6), na.rm = TRUE ))
aa <- as.data.frame(t(df))
fd1 <- map2(quant[1,],aa,function(x,y) y[y <= x] %>% .[!is.na(.)]) %>%
do.call(rbind,.)%>% as.data.frame(.)
fd2 <- pmap(list(quant[1,],quant[2,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd3 <- pmap(list(quant[2,],quant[3,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd4 <- pmap(list(quant[3,],quant[4,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd5 <- pmap(list(quant[4,],quant[5,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd6 <- map2(quant[5,],aa,function(x,y) y[y > x & y <= max(y)] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
NB: There are some duplicate values in the final fd1 - fd6 data frames (which is not, by the way, the best format to store values for this type of problem) but you can always filter them out by using for example unique.
Hope this helps. Any modification to the answer is welcomed.
I have an issue where after replicating data for a training and testing set, I'm showing a large amount of memory allocated to my user in Rstudio, but not being used in my R session. I've created a small example to reproduce my situation :)
This code runs a bunch of model, based on different formulas, algorithms, and parameter sets that I give it. It is a function, but I've created a simple script for reprex.
library(dplyr)
library(purrr)
library(modelr)
library(tidyr)
library(pryr)
# set my inputs
data <- mtcars
formulas <- c(test1 = mpg ~ cyl + wt + hp,
test2 = mpg ~ cyl + wt)
params = list()
methods <- "lm"
n <- 20 # num of cv splits
mult <- 10 # number of times I want to replicate some of the data
frac <- .25 # how much I want to cut down other data (fractional)
### the next few chunks get the unique combos of the inputs.
if (length(params) != 0) {
cross_params <- params %>%
map(cross) %>%
map_df(enframe, name = "param_set", .id = "method") %>%
list
} else cross_params <- NULL
methods_df <- tibble(method = methods) %>%
list %>%
append(cross_params) %>%
reduce(left_join, by = "method") %>%
split(1:nrow(.))
# wrangle formulas into a split dataframe
formulas_df <- tibble(formula = formulas,
name = names(formulas)) %>%
split(.$name)
# split out the data into n random train-test combos
cv_data <- data %>%
crossv_kfold(n) %>% # rsample?
mutate_at(vars(train:test), ~map(.x, as_tibble))
# sample out if needed
cv_data_samp <- cv_data %>%
mutate(train = modify(train,
~ .x %>%
split(.$gear == 4) %>%
# take a sample of the non-vo data
modify_at("FALSE", sample_frac, frac) %>%
# multiply out the vo-on data
modify_at("TRUE", function(.df) {
map_df(seq_along(1:mult), ~ .df)
}) %>%
bind_rows))
# get all unique combos of formula and method
model_combos <- list(cv = list(cv_data_samp),
form = formulas_df,
meth = methods_df) %>%
cross %>%
map_df(~ bind_cols(nest(.x$cv), .x$form, .x$meth)) %>%
unnest(data, .preserve = matches("formula|param|value")) %>%
{if ("value" %in% names(.)) . else mutate(., value = list(NULL))}
# run the models
model_combos %>%
# put all arguments into a single params column
mutate(params = pmap(list(formula = formula, data = train), list)) %>%
mutate(params = map2(params, value, ~ append(.x, .y))) %>%
mutate(params = modify(params, discard, is.null)) %>%
# run the models
mutate(model = invoke_map(method, params))
mem_change(rm(data, cv_data, cv_data_samp))
mem_used()
Now after I do this, my mem_used comes out to 77.3mb, but I'm seeing roughly double that (160Mb) allocated to my R user. This really explodes when my data is 3 Gb, which is my real-life case. I end up using 100Gb and tying up a whole server :(.
What is going on and how can I optimize?
Any help appreciated!!!
I figured this out! The issue was that I was converting my series of modelr resample objects to tibbles and that was exploding the memory even though I was subsequently sampling them down. The solution? Write methods for dealing with resample objects so that I never have to convert resample objects to tibble. These looked like:
# this function just samples the indexes instead of the data
sample_frac.resample <- function(data, frac) {
data$idx <- sample(data$idx, frac * length(data$idx))
data
}
# this function replicates the indexes. I should probably call it something else.
augment.resample <- function(data, n) {
data$idx <- unlist(map(seq_along(1:n), ~ data$idx))
data
}
# This function does simple splitting (logical only) of resample obejcts
split.resample <- function(data, .p) {
pos <- list(data = data$data, idx = which(.p, 1:nrow(data$data)))
neg <- list(data = data$data, idx = which(!.p, 1:nrow(data$data)))
class(pos) <- "resample"
class(neg) <- "resample"
list("TRUE" = pos,
"FALSE" = neg)
}
# This function takes the equivalent of a `bind_rows` for resample objects.
# Since bind rows does not call `useMethod` I had to call it something else
bind <- function(data) {
out <- list(data = data[[1]]$data, idx = unlist(map(data, pluck, "idx")))
class(out) <- "resample"
out
}
Then I just converted to a tibble in the same purrr closure in which my model is run for that CV. Problem solved! My memory usage is VERY low now.
Could you please help me do the filtering in the last command below, using dplyr instead of apply?
I was trying to solve the problem posted here
library(gtools)
n <- 8
dt <- permutations(n+1,6,v=0:n,repeats.allowed=TRUE)
SmplMode <- function(x) {
tabSmpl <- tabulate(x)
SmplMode <- which(tabSmpl == max(tabSmpl))
if (sum(tabSmpl == max(tabSmpl)) > 1)
SmplMode <- 0
return(SmplMode)
}
res <- dt[apply(dt,1,function(x) {
y <- rep(c(1,2,3,4,5,6),c(x[1],x[2],x[3],x[4],x[5],x[6]))
return(mean(y)==3 & diff(range(y))==4 & median(y)==3.5 & SmplMode(y)==4)
}),]
Operations with rowwise is slow, so filtering out SmplMode(y), mean(y), diff(range(y)) conditions early on with the help of row-wise operations from matrixStats package speeds the things up nicely. Following runs about 0.4 sec on my laptop, while both your original solution and #shadow's solution runs about 70secs.
library(dplyr)
library(matrixStats)
df <- data.frame(dt)
df$m <- rowMaxs(dt) #for SmplMode(y)
S <- matrix(1:6, ncol=ncol(dt), nrow=nrow(dt), byrow=T)
Z <- S*(dt!=0)
Z[Z==0] <- NA
df$Range <- rowMaxs(Z, na.rm=TRUE)-rowMins(Z, na.rm=TRUE) #for diff(rang(y))
df$Mean <- rowSums(S*dt)/rowSums(dt) #for mean(y)
res <- df %>%
filter(X4 == m, (X1==m)+(X2==m)+(X3==m)+(X4==m)+(X5==m)+(X6==m)==1,
Range == 4, # range condition here
Mean == 3) %>% #mean condition here
rowwise() %>%
mutate(Med = median(rep(c(1,2,3,4,5,6), c(X1, X2, X3, X4, X5, X6)))) %>%
filter(Med == 3.5) %>% #median condition here
select(-m, -Range, -Mean, -Med) %>% # get rid of newcols
as.matrix
You can use rowwise to do rowwise operations. Then use mutate to determine if the condition is satisfied and filter to filter by the condition.
res <- dt %>%
data.frame %>% # convert to data.frame, so you can use dplyr
rowwise %>% # for rowwise calculations
mutate(cond = {y = rep(1:6, c(X1, X2, X3, X4, X5, X6)) # calculate condition
mean(y)==3 & diff(range(y))==4 & median(y)==3.5 & SmplMode(y)==4}) %>%
filter(cond) %>% # filter by condition
mutate(cond = NULL) %>% # remove condition
as.matrix # convert back to matrix