Related
I have a dataset with several binary variables (x1-x5, values: 1, 2, NA). My goal is to identify whether pairs of binary variables have zero or very low cell counts in the cross-tab table (after ignoring the missing values). So, I would like to calculate the cross-tab table for each pair of binary variables in my data set, extract the lowest value from each table, and report the lowest value from each cross-table into a matrix. By doing so, I would have something similar to a correlation matrix where, instead of correlation coefficients, I would be able to look at the lowest cell count for each pair of variables. Below I created a toy dataset for anyone who will decide to help.
library(tidyverse)
df <- data.frame(x1 = rbinom(n = 1000, size = 1, prob = 0.5),
x2 = rbinom(n = 1000, size = 1, prob = 0.3),
x3 = rbinom(n = 1000, size = 1, prob = 0.4),
x4 = rbinom(n = 1000, size = 1, prob = 0.2),
x5 = rbinom(n = 1000, size = 1, prob = 0.05)) |>
mutate(across(everything(), ~as.factor(.))) |>
mutate(across(everything(), ~recode(., "1" = "2", "0" = "1")))
df1 <- as.data.frame(lapply(df, function(cc) cc[ sample(c(TRUE, NA), prob = c(0.85, 0.15), size = length(cc), replace = TRUE) ]))
I think this is what you mean. It's inefficient (we should only compute one triangle) but short.
cfun <- function(i, j) {
min(table(df[[i]], df[[j]]))
}
outer(1:ncol(df), 1:ncol(df), Vectorize(cfun))
If you want to be more efficient:
n <- ncol(df)
m <- matrix(NA_integer_, n, n, dimnames = list(names(df), names(df)))
for (i in 1:(n-1)) {
for (j in (i+1):n) {
m[j,i] <- cfun(i,j)
}
}
Someone (probably #dcsuka) suggested another solution but then deleted it from the answer section. Thankfully, I had already saved it in my script. After tweaking the code a tiny bit, it returned the correct results. So I am copying it here because, as Ben said, diversity is good.
df2 <- df1 %>%
colnames() %>%
combn(2) %>%
t() %>%
as_tibble(.name_repair = ~c("var1", "var2"))
df3 <- df2 %>%
rowwise() %>%
mutate(crosstab = list(as_tibble(table(select(df1, var1, var2)))),
value = min(list(select(crosstab, n))[[1]])) %>%
select(-crosstab) %>%
pivot_wider(names_from = var1, values_from = value)
I have tried the following formula but it gives all nos even when I change the quantile value.
NOTE: I have 3 independent datasets that I want to apply the function.
outlier<-function(x1,x2){
q1<-quantile(x1 , .75, na.rm = TRUE)
if(x1>q1){x2<-"Yes"
}else{
x2<-"No"
}
}
I have tried x2<-ifelse(x1>q1,"Yes","No")
inside the function but it still doesn't work.
You can use an ifelse statement and create a new column using mutate.
library(dplyr)
set.seed(1)
df <- tibble(x1 = sample(c(1:10), size = 10, replace = T))
df %>%
mutate(x2 = ifelse(quantile(x1, 0.75, na.rm = T) < x1, "Yes", "No"))
If you want a function
library(dplyr)
set.seed(1)
df <- tibble(x1 = sample(c(1:10), size = 10, replace = T),
x2 = sample(c(1:10), size = 10, replace = T),
x3 = sample(c(1:10), size = 10, replace = T),
x4 = sample(c(1:10), size = 10, replace = T))
outlier<-function(dataframe, quant = 0.75, col = c("x1", "x2")){
dataframe %>%
mutate(across(all_of(col), ~ifelse(.x>quantile(.x,0.75), 'Yes', 'No'),
.names = '{col}_yes'))
}
outlier(dataframe = df,quant = 0.25)
In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of true and false values, in the above data frame 5 TRUEs and 5 FALSEs in each of the 3 categories). How to do this?
You can sample a vector of "TRUE" and "FALSE" values without replacement so you have a randomized and balanced column in your data-frame.
sample(rep(c("TRUE","FALSE"),each=5),10,replace=FALSE)
Based on Yacine Hajji answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c("TRUE","FALSE"), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where parametar p determines the proportion of TRUE.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to saple and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but still assumes (approximately) even distribution of TRUE and FALSE within a group. But it can accomodate variable group sizes, and even groups with odd numbers of records (but will favor FALSE for odd numbers of records)
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.
I have a huge dataset with several groups (factors with between 2 to 6 levels), and dichotomous variables (0, 1).
example data
DF <- data.frame(
group1 = sample(x = c("A","B","C","D"), size = 100, replace = T),
group2 = sample(x = c("red","blue","green"), size = 100, replace = T),
group3 = sample(x = c("tiny","small","big","huge"), size = 100, replace = T),
var1 = sample(x = 0:1, size = 100, replace = T),
var2 = sample(x = 0:1, size = 100, replace = T),
var3 = sample(x = 0:1, size = 100, replace = T),
var4 = sample(x = 0:1, size = 100, replace = T),
var5 = sample(x = 0:1, size = 100, replace = T))
I want to do a chi square for every group, across all the variables.
library(tidyverse)
library(rstatix)
chisq_test(DF$group1, DF$var1)
chisq_test(DF$group1, DF$var2)
chisq_test(DF$group1, DF$var3)
...
etc
I managed to make it work by using two nested for loops, but I'm sure there is a better solution
groups <- c("group1","group2","group3")
vars <- c("var1","var2","var3","var4","var5")
results <- data.frame()
for(i in groups){
for(j in vars){
test <- chisq_test(DF[,i], DF[,j])
test <- mutate(test, group=i, var=j)
results <- rbind(results, test)
}
}
results
I think I need some kind of apply function, but I can't figure it out
Here is one way to do it with apply. I am sure there is an even more elegant way to do it with dplyr. (Note that here I extract the p.value of the test, but you can extract something else or the whole test result if you prefer).
res <- apply(DF[,1:3], 2, function(x) {
apply(DF[,4:7], 2,
function(y) {chisq.test(x,y)$p.value})
})
Here's a quick and easy dplyr solution, that involves transforming the data into long format keyed by group and var, then running the chi-sq test on each combination of group and var.
DF %>%
pivot_longer(starts_with("group"), names_to = "group", values_to = "group_val") %>%
pivot_longer(starts_with("var"), names_to = "var", values_to = "var_val") %>%
group_by(group, var) %>%
summarise(chisq_test(group_val, var_val)) %>%
ungroup()
I have a question similar to this one, but my dataset is a bit bigger: 50 columns with 1 column as UID and other columns carrying either TRUE or NA, I want to change all the NA to FALSE, but I don't want to use explicit loop.
Can plyr do the trick? Thanks.
UPDATE #1
Thanks for quick reply, but what if my dataset is like below:
df <- data.frame(
id = c(rep(1:19),NA),
x1 = sample(c(NA,TRUE), 20, replace = TRUE),
x2 = sample(c(NA,TRUE), 20, replace = TRUE)
)
I only want X1 and X2 to be processed, how can this be done?
If you want to do the replacement for a subset of variables, you can still use the is.na(*) <- trick, as follows:
df[c("x1", "x2")][is.na(df[c("x1", "x2")])] <- FALSE
IMO using temporary variables makes the logic easier to follow:
vars.to.replace <- c("x1", "x2")
df2 <- df[vars.to.replace]
df2[is.na(df2)] <- FALSE
df[vars.to.replace] <- df2
tidyr::replace_na excellent function.
df %>%
replace_na(list(x1 = FALSE, x2 = FALSE))
This is such a great quick fix. the only trick is you make a list of the columns you want to change.
Try this code:
df <- data.frame(
id = c(rep(1:19), NA),
x1 = sample(c(NA, TRUE), 20, replace = TRUE),
x2 = sample(c(NA, TRUE), 20, replace = TRUE)
)
replace(df, is.na(df), FALSE)
UPDATED for an another solution.
df2 <- df <- data.frame(
id = c(rep(1:19), NA),
x1 = sample(c(NA, TRUE), 20, replace = TRUE),
x2 = sample(c(NA, TRUE), 20, replace = TRUE)
)
df2[names(df) == "id"] <- FALSE
df2[names(df) != "id"] <- TRUE
replace(df, is.na(df) & df2, FALSE)
You can use the NAToUnknown function in the gdata package
df[,c('x1', 'x2')] = gdata::NAToUnknown(df[,c('x1', 'x2')], unknown = 'FALSE')
With dplyr you could also do
df %>% mutate_each(funs(replace(., is.na(.), F)), x1, x2)
It is a bit less readable compared to just using replace() but more generic as it allows to select the columns to be transformed. This solution especially applies if you want to keep NAs in some columns but want to get rid of NAs in others.