Compare the contingency table of two dataframes - r

Here is the original dataframe:
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is the function which can swap values:
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(sum(y == x)<3) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
toy is the new dataframe after swapping
toy <- toydata # Work with a copy
toy[] <- lapply(toydata, swapFun)
I want to compare the contingency tables of these two dataframe by the difference of sum, which means:
table1<-table(toydata$A,toydata$B)
table2<-table(toy$A,toy$B)
SUM1<-sum(abs(table1-table2))
table3<-table(toydata$A,toydata$C)
table4<-table(toy$A,toy$C)
SUM2<-sum(abs(table3-table4))
table5<-table(toydata$B,toydata$C)
table6<-table(toy$C,toy$C)
SUM3<-sum(abs(table5-table6))
SUM1+SUM2+SUM3 is what I want to have. Can I get it more conviniently because sometimes the dataframe has many columns.
How to solve it? Thanks.

library(dplyr)
# your function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>% # create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same column
mutate(SumAbs = f(X1,X2)) # apply function
# X1 X2 SumAbs
# 1 A B 14
# 2 A C 26
# 3 B C 22

Related

Error: Unable to apply package function to each row in R

I am trying to apply fm.Choquet function (Rfmtool package) to my R data frame, but no success. The function works like this (ref. here):
# let x <- 0.6 (N = 1)
# and y <- c(0.3, 0.5). y elements are always 2 power N (here, 2)
# env<-fm.Init(1). env is propotional to N
# fm.Choquet(0.6, c(0.3, 0.5),env) gives a single value output
I have this sample data frame:
set.seed(123456)
a <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
b <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
c <- qnorm(runif(30,min=pnorm(0),max=pnorm(1)))
df <- data.frame(a=a, b=b, c=c)
df$id <- seq_len(nrow(df))
I would like to apply fm.Choquet function to each row of my df such that, for each row (or ID), a is read as x, while b and c are read as y vector (N = 2), and add the function output as a new column for each row. However, I am getting the dimension error "The environment mismatches the dimension to the fuzzy measure.".
Here is my attempt.
df2 <- df %>% as_tibble() %>%
rowwise() %>%
mutate(ci = fm.Choquet(df$a,c(df[,2],df[,3]), env)) %>%
mutate(sum = rowSums(across(where(is.numeric)))) %>% # Also tried adding sum which works
as.matrix()
I am using dplyr::rowwise(), but I am open to looping or other suggestions. Can someone help me?
EDIT 1:
A relevant question is identified as a possible solution for the above question, but using one of the suggestions, by(), still throws the same error:
by(df, seq_len(nrow(df)), function(row) fm.Choquet(df$a,c(df$b,df$c), env))
set.seed(123456)
a <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
b <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
c <- qnorm(runif(30, min = pnorm(0), max = pnorm(1)))
df <- data.frame(a = a, b = b, c = c)
df$id <- seq_len(nrow(df))
library(Rfmtool)
library(tidyverse)
env <- fm.Init(1)
map_dbl(
seq_len(nrow(df)),
~ {
row <- slice(df,.x)
fm.Choquet(
x = row$a,
v = c(row$b, row$c), env
)
}
)

Extracting conditional rows from a list of dataframes in a loop

In my split(w7, w7$study.name)[48] call below there are 4 rows for which variable control == FALSE.
But I'm wondering why ctlistG(split(w7, w7$study.name)[48]) returns only one of such rows?
ps. I suspect, instead of lapply() I should have used mapply() in ctlistG().
Reproducible R code:
ctlist <- function(List, cont=FALSE, pos=1, outcom=1){
if(!inherits(List, "list")) List <- list(List)
h <- setNames(lapply(List, function(i) i[i$control==cont & i$post == pos & i$outcome == outcom, , drop = FALSE]), names(List))
Filter(NROW, h) }
#====================
ctlistG <- function(m){
input <- setNames(lapply(m, function(i) rev(expand.grid(outcom = seq_len(max(i$outcome, na.rm = TRUE)), pos = seq_len(max(i$post, na.rm = TRUE))))), names(m))
lapply(input, function(i) ctlist(m, cont = FALSE, pos = i$pos, outcom = i$outcom)) }
#==================== EXAMPLE OF USE:
w7 <- read.csv('https://raw.githubusercontent.com/rnorouzian/m/master/w7.csv')
ctlistG(split(w7, w7$study.name)[48]) # I expect 4 rows not 1 below!
#$VanBe_Jng_KenA
#$VanBe_Jng_KenA$VanBe_Jng_KenA
# study.name YofPub group.name n d
#406 VanBe_Jng_KenA 2012 NA 34 NA
If we need 4 rows, based on the function, we may need Map instead of lapply
out <- do.call(rbind, lapply(input, function(inp)
do.call(rbind, Map(function(p, o)
do.call(rbind, lapply(m, function(m1)
m1[m1$control == FALSE & m1$post == p & m1$outcome ==o, , drop = FALSE])),
inp$pos, inp$outcom))))
data
lst1 <- split(w7, w7$study.name)
m <- lst1[48]

Split data by row in R in quantiles

I have a data.frame called fd with 406 rows and 48 columns. For each row in fd I want to compute ntiles (sixtiles). I do this the following way:
quant <- apply(fd, 1, function(x) quantile(t(x), probs = c(1/6, 2/6, 0.5, 4/6, 5/6), na.rm = TRUE ))
What I now want to do, is split my original data into 6 new dataframes, i.e. fd1 to fd6, where in fd1 I have all the observations of the first sixtile, in fd2 I have all the observations of the second sixtile and so on. Again, I want to do this rowwise. Meaning, I want my algorithm/function to look at each row of fd and do the following:
Take all the observations of the first sixtile, in the first row of fd, and store them into the first row of fd1, then take the first sixtile of the second row of fd and store them in the second row of fd1.
Important to note: I do not have observations for each row and column, so in some I have missing data (NA)
Could anybody give hints on how I can achieve this?
Thanks in advance.
Old school solution using matrix, list and nested loops.
# some artifical data with missings
set.seed(123)
fd <- data.frame(matrix(rnorm(406*48), nrow = 406, ncol = 48))
diag(fd) <- NA
# quant
quant <- apply(fd, 1, function(x)
quantile(t(x), probs = (0:6)/6, na.rm = TRUE, type = 6)
)
#matrix with selection
res <- list()
for (i in 1:6) {
mm <- matrix(NA, nrow = nrow(fd), ncol = ncol(fd)/6)
for (j in 1:nrow(fd)) {
lwr <- (quant[(i),j] < fd[j,])
upr <- (fd[j,] <= quant[(i+1),j])
if (i == 1)
z_j <- fd[j,][ upr ]
else
z_j <- fd[j,][ lwr & upr ]
z_j <- z_j[!is.na(z_j)]
mm[j,1:length(z_j)] <- sort(z_j)
}
res[[i]] <- mm
}
rm(i, mm, j, lwr, upr)
fd1 <- res[[1]]
Here is a relatively shorter way of achieving this using purrr and dplyr packages:
library(dplyr)
library(purrr)
# some random example
df <- data.frame(matrix(runif(48),405,48))
df[3,5] <- NA
df[10,25:26] <- NA
quant <- apply(df, 1, function(x) aa <- quantile(t(x), probs = c(1/6, 2/6, 3/6, 4/6, 5/6), na.rm = TRUE ))
aa <- as.data.frame(t(df))
fd1 <- map2(quant[1,],aa,function(x,y) y[y <= x] %>% .[!is.na(.)]) %>%
do.call(rbind,.)%>% as.data.frame(.)
fd2 <- pmap(list(quant[1,],quant[2,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd3 <- pmap(list(quant[2,],quant[3,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd4 <- pmap(list(quant[3,],quant[4,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd5 <- pmap(list(quant[4,],quant[5,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd6 <- map2(quant[5,],aa,function(x,y) y[y > x & y <= max(y)] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
NB: There are some duplicate values in the final fd1 - fd6 data frames (which is not, by the way, the best format to store values for this type of problem) but you can always filter them out by using for example unique.
Hope this helps. Any modification to the answer is welcomed.

Two same type of dataframes perform differently in a function

Below is my data
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is my swapping function
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(all(y != x)) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
In the first case,I get the new data toy by swapping the entire dataframe. The code is below:
toydata<-as.matrix(toydata)
toy<-swapFun(toydata)
toy<-as.data.frame(toy)
In the second case, I get the new data toy by swapping each column respectively. Below is the code:
toydata<-as.data.frame(toydata)
toy2 <- toydata # Work with a copy
toy2[] <- lapply(toydata, swapFun)
toy<-toy2
Below is the function that can output the difference of contigency table after swapping.
# the function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>%# create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same
# column
mutate(SumAbs = f(X1,X2)) # apply function
In the second case, this mutate function works.
But in the first case, this mutatefunction does not work. It says:
+ filter(X1 != X2) %>% # exclude pairs of same column
+ mutate(SumAbs = f(X1,X2)) # apply function
Error in combn(x = names(toydata), y = names(toydata), 2) : n < m
However in the two cases, the toy data are all dataframes with the same dimension, the same row names and the same column names. I feel confused.
How can I fix it? Thanks.

Standard evaluation for tidyr::complete - a function that completes by all non-numeric columns

I want to make a function that would apply tidyr::complete to all non-numeric columns of an R data.frame. Value zero should be inserted to the new value rows. I understand that this requires standard evaluation solution, but I've thus far had no success.
Here is what I have thus far:
completeDf <- function(df){
vars <- names(df)
chVars <- vars[!(sapply(df, is.numeric))]
nmVars <- vars[!(vars %in% chVars)]
quoChVars <- quos(chVars)
nmList <- vector("list", length(nmVars))
nmList <- setNames(lapply(nmList, function(x) x <- 0), nmVars)
quoNmVars <- quos(nmList)
df <- df %>%
complete(!!!quoChVars, fill = !!!quoNmVars)
}
Any idea of how to make this work?
1) rlang/tidyreval Use !!!syms(notnum_names) to insert the variable names as complete arguments. Fill is just an ordinary list and no rlang/tidyeval computations are needed for it.
library(dplyr)
library(tidyr)
library(rlang)
completeDF <- function(data) {
is_num <- sapply(data, is.numeric)
num_names <- names(data)[ is_num ]
notnum_names <- names(data)[ !is_num ]
fill <- Map(function(x) 0, num_names)
data %>% complete(!!!syms(notnum_names), fill = fill)
}
DF <- data.frame(a = c("A", "B", "B"), b = c("a", "a", "b"), c = 1:3) # test data
completeDF(DF)
giving:
# A tibble: 4 x 3
a b c
<fctr> <fctr> <dbl>
1 A a 1
2 A b 0
3 B a 2
4 B b 3
Here is the original code from the question modified to make it work. The changed lines are marked with ## at the end of each.
completeDf <- function(df){
vars <- names(df)
chVars <- vars[!(sapply(df, is.numeric))]
nmVars <- vars[!(vars %in% chVars)]
symsChVars <- rlang::syms(chVars) ##
nmList <- vector("list", length(nmVars))
nmList <- setNames(lapply(nmList, function(x) 0), nmVars) ##
# quoNmVars <- quos(nmList ##
df %>% ##
complete(!!!symsChVars, fill = nmList) ##
}
completeDf(DF)
2) wrapr An alternative to rlang/tidyeval is the wrapr package.
The code here is the same as in (1) except we use library(wrapr) instead of library(rlang) and the last line of completeDF is replaced with a let statement giving completeDF2.
library(dplyr)
library(tidyr)
library(wrapr)
completeDF2 <- function(data) {
is_num <- sapply(data, is.numeric)
num_names <- names(data)[ is_num ]
notnum_names <- names(data)[ !is_num ]
fill <- Map(function(x) 0, num_names)
let(c(NOTNUM = toString(notnum_names)),
data %>% complete(NOTNUM, fill = fill),
strict = FALSE,
subsMethod = "stringsubs")
}
completeDF2(DF)
Updates: Fixes and improvements. Add wrapr approach.

Resources