perform ttest on a data.frame - r

Trying to perform ttest (and to get p.value) from a data.frame, there's one column that includes the groups (good vs bad) and the rest of the columns are numeric.
I generated a toy dataset here:
W <- rep(letters[seq( from = 1, to = 2)], 25)
X <- rnorm(n=50, mean = 10, sd = 5)
Y <- rnorm(n=50, mean = 15, sd = 6)
Z <- rnorm(n=50, mean = 20, sd = 5)
test_data <- data.frame(W, X, Y, Z)
Then I transform the data into long format:
melt_testdata <- melt(test_data)
And performed the t.test
lapply(unique(melt_testdata$variable),function(x){
Good <- subset(melt_testdata, W == 'a' & variable ==x)$variable
Bad <- subset(melt_testdata, W == 'b' & variable ==x)$variable
t.test(Good,Bad)$p.value
})
But I instead of getting the t.test results, I got the following error messages:
Error in if (stderr < 10 * .Machine$double.eps * max(abs(mx), abs(my))) stop("data are essentially constant") :
missing value where TRUE/FALSE needed In addition: Warning messages:
1: In mean.default(x) : argument is not numeric or logical: returning NA
2: In var(x) :
Calling var(x) on a factor x is deprecated and will become an error.
Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
3: In mean.default(y) : argument is not numeric or logical: returning NA
4: In var(y) :
Calling var(x) on a factor x is deprecated and will become an error.
Use something like 'all(duplicated(x)[-1L])' to test for a constant vector.
Then I tried to write loops (first time..)
good <- matrix(,50)
bad <- matrix(,50)
cnt=3
out <- rep(0,cnt)
for (i in 2:4){
good[i] <- subset(test_data, W == 'a', select= test_data[,i])
bad[i] <- subset(test_data, W == 'b', select= test_data[,i])
out[i] <- print(t.test(good[[i]], bad[[i]])$p.value)
}
Still not getting p.values .......
This is the error messages
Error in x[j] : only 0's may be mixed with negative subscripts
I appreciate any help in any method, thanks!

I think you'll have better luck with the formula method of t.test. Try
library(broom)
library(magrittr)
library(dplyr)
W <- rep(letters[seq( from = 1, to = 2)], 25)
X <- rnorm(n=50, mean = 10, sd = 5)
Y <- rnorm(n=50, mean = 15, sd = 6)
Z <- rnorm(n=50, mean = 20, sd = 5)
test_data <- data.frame(W, X, Y, Z)
lapply(test_data[c("X", "Y", "Z")],
function(x, y) t.test(x ~ y),
y = test_data[["W"]]) %>%
lapply(tidy) %>%
do.call("rbind", .) %>%
mutate(variable = rownames(.))
Edit:
With stricter adherence to the dplyr philosophy, you can use the following: which is actually a bit cleaner looking.
library(broom)
library(dplyr)
library(tidyr)
W <- rep(letters[seq( from = 1, to = 2)], 25)
X <- rnorm(n=50, mean = 10, sd = 5)
Y <- rnorm(n=50, mean = 15, sd = 6)
Z <- rnorm(n=50, mean = 20, sd = 5)
test_data <- data.frame(W, X, Y, Z)
test_data %>%
gather(variable, value, X:Z) %>%
group_by(variable) %>%
do(., tidy(t.test(value ~ W, data = .)))

Here is a solution using dplyr and the formula argument to t.test. do works on each group defined by the group_by. glance extracts values from the t.test output and makes them into a data.frame.
library(tidyverse)
library(broom)
melt_testdata %>%
group_by(variable) %>%
do(glance(t.test(value ~ W, data = .)))

Related

argument "" is missing, with no default when using map_dfc in R

I am trying to modify Monte Carlo code from Reproduceable Finance with R to handle withdrawals and inflation of these withdrawals.
to do so I have changed a simulation accumulate function to subtract a withdrawl amount that has been increased do to inflation prior to calling the function.
simulation_accum_withdrawls <- function(init_value, N, mean, stdev, wAmt) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev),wAmt)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y, wAmt) x * y - wAmt)) %>%
select(growth)
}
the function is run from the following code
monte_carlo_sim <-
map_dfc(starts, simulation_accum_withdrawls,
N = sim_duration_months,
mean = geo_mean_port_return,
stdev = geo_stddev_port_return,
wAmt = inflated_withdrawls)
which throws off the following error.
Error in map():
ℹ In index: 1.
Caused by error in mutate():
! Problem while computing growth = accumulate(returns, function(x, y, wAmt) x * y - wAmt).
Caused by error in fn():
! argument "wAmt" is missing, with no default
Backtrace:
purrr::map_dfc(...)
purrr::accumulate(returns, function(x, y, wAmt) x * y - wAmt)
purrr:::reduce_impl(...)
fn(out, elt, ...)
Error in map(.x, .f, ...) :
The inflated_withdrawls data frame does exist and is not empty i.e. see below
str(inflated_withdrawls)
'data.frame': 38 obs. of 1 variable:
$ wAmt: num 1805391 0 0 0 0 ...
Any advice or insight on how to debug this would be appreciated.
Thanks
Here is an executable example per the request.
library(tidyverse)
library(highcharter)
library(tidyquant)
library(timetk)
library(broom)
library(highcharter)
library(purrr)
library(knitr)
library(readxl)
accum_inflation <- function(init_value, inflation=0.0) {
tibble(c(init_value, 1 + inflation)) %>%
`colnames<-`("acc_inf") %>%
mutate(acc_inf =
accumulate(acc_inf, function(x, y) x * y ))
}
accum_wd <- function(wAmt, acc_inf=0.0) {
growth = wAmt * acc_inf
}
simulation_accum_1 <- function(init_value, N, mean, stdev,inf_wd) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev), inf_wd)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y) x * y - inf_wd)) %>%
select(growth)
}
simulation_accum_withdrawls <- function(init_value, N, mean, stdev, wAmt=0) {
tibble(c(init_value, 1 + rnorm(N, mean, stdev),wAmt)) %>%
`colnames<-`("returns") %>%
mutate(growth =
accumulate(returns, function(x, y, wAmt) x * y + wAmt)) %>%
select(growth)
}
test_df <- data.frame(wAmt =c(100000,-180000,-180000,-180000),
date=seq.Date(as.Date("2024-01-01"), by = "year", length.out = 4)
)
test_df <- test_df %>% pad_by_time(date, .by = "month", .pad_value = 0)
inf_df <- rnorm(nrow(test_df),0.03/12,0.042/12)
inflated_returns <- accum_inflation(1, inf_df)
edatep1 <- ymd(edate)+months(1)
test_df<- rows_append(test_df,tibble(wAmt = 0, date = edatep1))
inflated_withdrawls <- accum_wd(test_df[1], inflated_returns)
nav=1000000
sim_duration_months = 40
total_sims = 10
geo_mean_port_return = 0.00443402454379282
geo_stddev_port_return = 0.0237813751473552
sims <- total_sims
starts <-
rep(nav, sims) %>%
set_names(paste("sim", 1:sims, sep = ""))
monte_carlo_sim <-
map_dfc(starts, simulation_accum_withdrawls,
N = sim_duration_months,
mean = geo_mean_port_return,
stdev = geo_stddev_port_return,
wAmt = inflated_withdrawls)

Using group_by in function

I have a function written to calculate the confidence interval of a ratio of averages between two vectors using jackknife standard errors
jackknife_CI = function(x, y, alpha = .05) {
xl = (sum(x,na.rm=T) - x) / (length(x) - 1)
yl = (sum(y,na.rm=T) - y) / (length(y) - 1)
n = length(x) + length(y)
jack_se = (sd(c(xl / mean(y,na.rm=T), mean(x,na.rm=T) / yl),na.rm=T) * (n - 1)) / sqrt(n)
mean(x, na.rm = T) / mean(y, na.rm = T) + jack_se * qnorm(c(alpha/2,1-alpha/2))
}
I want to then use it with the ToothGrowth dataset in the following way:
df1 =
ToothGrowth %>%
filter(supp == "OJ") %>%
rename(len_x = len) %>%
select(dose,len_x)
df2 =
ToothGrowth %>%
filter(supp == "VC") %>%
rename(len_y = len) %>%
select(dose, len_y)
df = cbind(df1,df2)
df = df[,-3]
jack_CI = df %>% group_by(dose) %>% jackknife_CI(x = len_x, y = len_y)
My problem is that the last line results in the error:
Error in jackknife_CI(., x = len_x, y = len_y) : object 'len_x' not found
How do I get around this?
The last line need to be:
jack_CI = jackknife_CI(x = df$len_x, y = df$len_y)
The way you are running it is being interpreted as follows:
jack_CI = jackknife_CI(group_by(df, dose), x = len_x, y = len_y)
Which is causing a couple issues:
jackknife_CI is not expecting the first argument to be the dataframe. (because of pipe operator)
len_x and len_y are not recognized outside of the dataframe.
If you want to run the function on each group you can do:
df %>% group_by(dose) %>%
do({
ci <- jackknife_CI(.$len_x, .$len_y)
tibble(low = ci[1], hi = ci[2])
})
I use do because the function returns two values. Otherwise you would be able to just use summarize. Each group is being passed to do which is then returning a tibble (note the last line in do) which are then being stacked to return the result. I am referring to each group inside of do with .$variable_name where the dot references the value being passed (in this case the dataframe for each group)

custom function does not work on column named "x" unless specified by .$x in summarise() dplyr R

I wanted to create a custom function to calculate confidence intervals of a column by creating two columns called lower.bound and upper.bound. I also wanted this function to be able to work within dplyr::summarize() function.
The function works as expected in all tested circumstances, but it does not when the column is named "x". When it is it draws a warning and returns NaN values. It only works when the column is specifically declared as .$x. Here is an example of the code. I don't understand the nuance... could you point me to the right direction in understanding this?
set.seed(12)
# creates random data frame
z <- data.frame(
x = runif(100),
y = runif(100),
z = runif(100)
)
# creates function to calculate confidence intervals
conf.int <- function(x, alpha = 0.05) {
sample.mean <- mean(x)
sample.n <- length(x)
sample.sd <- sd(x)
sample.se <- sample.sd / sqrt(sample.n)
t.score <- qt(p = alpha / 2,
df = sample.n - 1,
lower.tail = F)
margin.error <- t.score * sample.se
lower.bound <- sample.mean - margin.error
upper.bound <- sample.mean + margin.error
as.data.frame(cbind(lower.bound, upper.bound))
}
# This works as expected
z %>%
summarise(x = mean(y), conf.int(y))
# This does not
z %>%
summarise(x = mean(x), conf.int(x))
# This does
z %>%
summarise(x = mean(x), conf.int(.$x))
Thanks!
This is a "feature" in dplyr which makes the updated value of x (which has the mean value) is available when you pass it to conf.int function.
Possible options are -
Change the name of the variable to store the mean value
library(dplyr)
z %>% summarise(x1 = mean(x), conf.int(x))
# x1 lower.bound upper.bound
#1 0.4797154 0.4248486 0.5345822
Change the order
z %>% summarise(conf.int(x), x = mean(x))
# lower.bound upper.bound x
#1 0.4248486 0.5345822 0.4797154

Call the same function by varying parameters with tidyverse and apply family functions

This is my first question here on Stack Overflow, so I apologize in advance if I won't be clear enough.
I searched for similar questions, but I didn't find anything (I probably didn't search enough!)
Given a data.frame (or a data.table or a tibble) consisting of four sets of points divided into two groups:
df_points <- tibble(
x = c(rnorm(10000, mean = 0), rnorm(10000, mean = 1),
rnorm(10000, mean = 0), rnorm(10000, mean = 4)),
dist = c(rep("d1", 10000), rep("d2", 10000),
rep("d1", 10000), rep("d2", 10000)),
overlap = c(rep("o1", 20000), rep("o2", 20000))
)
my goal is to apply the density function, using different values of bw, from and to for the "o1" and "o2" groups.
I would like to solve this problem in an elegant way with both a tidyverse and a R-base-data.table approach (apply family functions).
For now I have managed to do this via tidyverse:
I define a common_dens function which applies density and returns a tibble of the x and y of the distribution
common_dens <- function(df, Bw, lower, upper) {
d <- density(df, n = 2048, bw = Bw, from = lower, to = upper)
df_d <- tibble(x = d$x, y = d$y)
return(df_d)
}
assuming that the values of upper, lower and bws are the following:
lower <- c(-5.050, -4.705)
upper <- c(6.445, 9.070)
bws <- c(0.1427, 0.1417)
I get the desired dataframe through the following for loop:
df_dens <- NULL
for (i in 1:2) {
df_t <- df_points %>%
filter(overlap == unique(df_points$overlap)[[i]]) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x, bws[i], lower[i], upper[i]))
df_dens <- rbind(df_dens, df_t)
}
Is there any way to remove the for loop?
Is there a way to do the same with apply family functions and data.table?
Thanks for your help!
The purrr::pmap function allows you to apply an arbitrary number of parameters to a function in succession. The pmap_dfr returns a data.frame bound by row:
Consider your parameters provided as a data.frame:
params <- data.frame(group = c("o1","o2"), bws, lower, upper)
group bws lower upper
1 o1 0.1427 -5.050 6.445
2 o2 0.1417 -4.705 9.070
The paramters are automatically assigned to the special symbols ..1, ..2, and so on:
library(purrr)
pmap_dfr(params, ~ df_points %>%
filter(overlap == ..1) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x,Bw = ..2, lower = ..3, upper = ..4)))
It can get confusing which ..# is which, so a trick is to use with(list(...), ):
pmap_dfr(params, ~ with(list(...), df_points %>%
filter(overlap == group) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x,Bw = bws, lower = lower, upper = upper))))
You could of course do the same with base R apply:
apply(params, 1, function(y){ df_points %>%
filter(overlap == y[1]) %>%
group_by(dist, overlap) %>%
summarise(common_dens(x, Bw = as.numeric(y[2]), lower = as.numeric(y[2]),
upper = as.numeric(y[4])))}) %>%
bind_rows()
However, because apply converts the types, you'll need to use as.numeric.

I'm getting Error in dat$y : $ operator is invalid for atomic vectors when trying to calculate the possible results using map() function

I am currently taking an online Data science: Machine learning course and we are asked to fit a lm 100 times and obtain the values of the mean (rmse) and sd(rmse) for data sets of different sizes n=c(100,500,1000,5000,10000).
we are asked to create a function that takes the size n and builds the dataset, then runs the loop made for fitting the 100 models, then set the seed and use a map() or sapply() function for applying our new function to the n different sizes.
The code I did is showing me "Error in dat$y : $ operator is invalid for atomic vectors" error when I run f1
This is my code:
library(MASS)
library(caret)
ff=function(n){
Sigma <- 9*matrix(c(1.0, 0.5, 0.5, 1.0), 2, 2)
dat <- MASS::mvrnorm(n, c(69, 69), Sigma)%>%data.frame() %>% setNames(c("x", "y"))
}
set.seed(1,sample.kind = "Rounding")
n=c(100,500,1000,5000,10000)
f1=map(n,function(dat){
rmse=replicate(100,{
y <- dat$y
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
fit <- lm(y ~ x, data = train_set)
y_hat <- fit$coef[1] + fit$coef[2]*test_set$x
sqrt(mean((y_hat - test_set$y)^2))
})
structure(c(mean(rmse),sd(rmse)))
})
Thank you for your help!!
I think you should use something like :
library(caret)
library(dplyr)
n=c(100,500,1000,5000,10000)
f1= purrr::map(n,function(x){
rmse=replicate(100,{
dat <- ff(x)
y <- 1:nrow(dat)
test_index <- createDataPartition(y, times = 1, p = 0.5, list = FALSE)
train_set <- dat %>% slice(-test_index)
test_set <- dat %>% slice(test_index)
fit <- lm(y ~ x, data = train_set)
y_hat <- fit$coef[1] + fit$coef[2]*test_set$x
sqrt(mean((y_hat - test_set$y)^2))
})
c(mean(rmse),sd(rmse))
})

Resources