I wrote a function to calculate odds ratios of two variables, CI's and bind everything together with the n and the name of one of the variables. My example including my function looks like this:
library(DescTools)
a <- as.numeric(replicate(1, sample(c(0:1), 100, replace = TRUE)))
b <- as.numeric(replicate(1, sample(c(0:1), 100, replace = TRUE)))
c <- as.numeric(replicate(1, sample(c(0:1), 100, replace = TRUE)))
x <- as.data.frame(cbind(a, b, c))
orr <- function (var1, var2){
con <- table(var1, var2)
o <- OddsRatio(con, conf.level = 0.95)
n <- sum(con[2, 1:2])
name <- deparse(substitute(var2))
df <- data.frame(rbind(o), n, "ind.varname" = name)
return(df)
}
a<-orr(x$b,x$a)
b<-orr(x$c,x$a)
rbind(a,b)
Now, in my example, I want to pass a list of arguments to the function (arg2) to have the odds calculated for several variables with the first argument staying the same. Thus, the final output would preferably be a data.frame that looks like the following, just with more lines (see above):
Does anybody have any tips on how to do that? Thanks in advance.
You can try this function :
orr <- function (data, var1, var2){
val1 <- data[[var1]]
val2 <- data[[var2]]
con <- table(val1, val2)
o <- OddsRatio(con, conf.level = 0.95)
n <- sum(con[2, 1:2])
df <- data.frame(rbind(o), n, ind.varname = var2, row.names = NULL)
return(df)
}
do.call(rbind, lapply(names(x[-1]), orr, data = x, var1 = "a"))
# odds.ratio lwr.ci upr.ci n ind.varname
#1 1.739130 0.7706866 3.924519 61 b
#2 1.519481 0.6704328 3.443777 61 c
Little shorter with purrr::map_df :
purrr::map_df(names(x[-1]), orr, data = x, var1 = "a")
Related
I am trying to apply fm.Choquet function (Rfmtool package) to my R data frame, but no success. The function works like this (ref. here):
# let x <- 0.6 (N = 1)
# and y <- c(0.3, 0.5). y elements are always 2 power N (here, 2)
# env<-fm.Init(1). env is propotional to N
# fm.Choquet(0.6, c(0.3, 0.5),env) gives a single value output
I have this sample data frame:
set.seed(123456)
a <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
b <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
c <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
df <- data.frame(a=a, b=b, c=c)
df$id <- seq_len(nrow(df))
I would like to apply fm.Choquet function to each row of my df such that, for each row (or ID), a is read as x, while b and c are read as y vector (N = 2), and add the function output as a new column for each row. However, I am getting the dimension error "The environment mismatches the dimension to the fuzzy measure.".
Here is my attempt.
df2 <- df %>% as_tibble() %>%
rowwise() %>%
mutate(ci = fm.Choquet(df$a,c(df[,2],df[,3]), env)) %>%
mutate(sum = rowSums(across(where(is.numeric)))) %>% # Also tried adding sum which works
as.matrix()
I am using dplyr::rowwise(), but I am open to looping or other suggestions. Can someone help me?
EDIT 1:
A relevant question is identified as a possible solution for the above question, but using one of the suggestions, by(), still throws the same error:
by(df, seq_len(nrow(df)), function(row) fm.Choquet(df$a,c(df$b,df$c), env))
set.seed(123456)
a <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
b <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
c <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
df <- data.frame(a = a, b = b, c = c)
df$id <- seq_len(nrow(df))
library(Rfmtool)
library(tidyverse)
env <- fm.Init(1)
map_dbl(
seq_len(nrow(df)),
~ {
row <- slice(df,.x)
fm.Choquet(
x = row$a,
v = c(row$b, row$c), env
)
}
)
Here is the original dataframe:
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is the function which can swap values:
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(sum(y == x)<3) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
toy is the new dataframe after swapping
toy <- toydata # Work with a copy
toy[] <- lapply(toydata, swapFun)
I want to compare the contingency tables of these two dataframe by the difference of sum, which means:
table1<-table(toydata$A,toydata$B)
table2<-table(toy$A,toy$B)
SUM1<-sum(abs(table1-table2))
table3<-table(toydata$A,toydata$C)
table4<-table(toy$A,toy$C)
SUM2<-sum(abs(table3-table4))
table5<-table(toydata$B,toydata$C)
table6<-table(toy$C,toy$C)
SUM3<-sum(abs(table5-table6))
SUM1+SUM2+SUM3 is what I want to have. Can I get it more conviniently because sometimes the dataframe has many columns.
How to solve it? Thanks.
library(dplyr)
# your function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>% # create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same column
mutate(SumAbs = f(X1,X2)) # apply function
# X1 X2 SumAbs
# 1 A B 14
# 2 A C 26
# 3 B C 22
Below is my data
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is my swapping function
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(all(y != x)) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
In the first case,I get the new data toy by swapping the entire dataframe. The code is below:
toydata<-as.matrix(toydata)
toy<-swapFun(toydata)
toy<-as.data.frame(toy)
In the second case, I get the new data toy by swapping each column respectively. Below is the code:
toydata<-as.data.frame(toydata)
toy2 <- toydata # Work with a copy
toy2[] <- lapply(toydata, swapFun)
toy<-toy2
Below is the function that can output the difference of contigency table after swapping.
# the function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>%# create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same
# column
mutate(SumAbs = f(X1,X2)) # apply function
In the second case, this mutate function works.
But in the first case, this mutatefunction does not work. It says:
+ filter(X1 != X2) %>% # exclude pairs of same column
+ mutate(SumAbs = f(X1,X2)) # apply function
Error in combn(x = names(toydata), y = names(toydata), 2) : n < m
However in the two cases, the toy data are all dataframes with the same dimension, the same row names and the same column names. I feel confused.
How can I fix it? Thanks.
I am working with a resampling procedure in R (just like a bootstrap). I have a matrix of response/explanatory variables and would like to make 999 samples of this matrix to calculate for each statistic I am working their mean, sd and confidence interval. So, I wrote a function to calculate and to return a list:
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(list(model1[[1]][[1]], model1[[1]][[2]]))
}
result <- as.numeric()
result <- replicate(99, myfun(mydata, 10))
Then, I have a matrix as my output in which the rows are the statistics and the columns are the samplings (nrow = 2 and ncol = 99). I need the mean and sd for each row, but when I try to use the apply function or even a loop the following message shows up:
In mean.default(newX[, i], ...) :
argument is not numeric or logical: returning NA
Moreover:
is.numeric(result)
[1] FALSE
I found it strange, because I never had such problem with similar procedures.
Any thoughts?
Use the following:
myfun <- function(dat, n){
dat1 <- dat[sample(n, replace = T),]
model1 <- lm(dat1[,1] ~ dat1[,2])
return(coef(model1))
}
replicate(99, myfun(mydata, 10))
The reason is the 'result' is a list of 198 elements with dimension attributes. We need to unlist the 'result' and provide the dimension attributes
result1 <- `dim<-`(unlist(result), dim(result))
and then use the apply
Just replace list() by c() in your myfun() function
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(c(model1[[1]][[1]], model1[[1]][[2]]))
}
result <- as.numeric()
result <- replicate(99, myfun(mydata, 10))
apply(result, FUN=mean, 1)
apply(result, FUN=sd, 1)
This worked for me:
mydata <- data.frame(a=rnorm(20, 1, 1), b = rnorm(20,1,1))
myfun <- function(data, n){
sample <- data[sample(n, replace = T),]
model1 <- lm(sample[,1]~sample[,2])
return(data.frame(v1 = model1[[1]][[1]], v2 = model1[[1]][[2]]))
}
result <- do.call("rbind",(replicate(99, myfun(mydata, 10), simplify = FALSE)))
I want to apply t-tests on a bunch of variables. Below is some mock data
d <- data.frame(var1=rnorm(10),
var2=rnorm(10),
group=sample(c(0,1), 10, replace=TRUE))
# Is there a way to do this in some sort of loop?
with(d, t.test(var1~group))
with(d, t.test(var2~group))
# I tried this but the loop did not give a result!?
varnames <- c('var1', 'var2')
for (i in 1:2) {
eval(substitute(with(d, t.test(variable~group)),
list(variable=as.name(varnames[i]))))
}
Also, is it possible to extract the values from the t-test's result (e.g. two group means, p-value) so that the loop will produce a neat balance table across the variables? In other words, the end result I want is not a bunch of t-tests upon one another, but a table like this:
Varname mean1 mean2 p-value
Var1 1.1 1.2 0.989
Var2 1.2 1.3 0.912
You can use formula and lapply like this
set.seed(1)
d <- data.frame(var1 = rnorm(10),
var2 = rnorm(10),
group = sample(c(0, 1), 10, replace = TRUE))
varnames <- c("var1", "var2")
formulas <- paste(varnames, "group", sep = " ~ ")
res <- lapply(formulas, function(f) t.test(as.formula(f), data = d))
names(res) <- varnames
If you want to extract your table, you can proceed like this
t(sapply(res, function(x) c(x$estimate, pval = x$p.value)))
mean in group 0 mean in group 1 pval
var1 0.61288 0.012034 0.098055
var2 0.46382 0.195100 0.702365
Here is a reshape/plyr solution:
The foo function is the workhorse, it runs the t-test and extract means and p-value.
d <- data.frame(var1=rnorm(10),
var2=rnorm(10),
group=sample(c(0,1), 10, replace=TRUE))
require(reshape2)
require(plyr)
dfm <- melt(d, id = 'group')
foo <- function(x) {
tt <- t.test(value ~ group, data = x)
out <- data.frame(mean1 = tt$estimate[1], mean2 = tt$estimate[2], P = tt$p.value)
return(out)
}
ddply(dfm, .(variable), .fun=foo)
# variable mean1 mean2 P
#1 var1 -0.2641942 0.3716034 0.4049852
#2 var2 -0.9186919 -0.2749101 0.5949053
Use sapply to apply t-test to all varnames and extract the necessary data by subsetting "estimate" and "p.value". Check names(with(d, t.test(var1~group))) if you want to extract other information
t(with(d, sapply(varnames, function(x) unlist(t.test(get(x)~group)[c("estimate", "p.value")]))))