Related
I have a dataframe where each column is made up of zero along with one other number. For example:
I want to manipulate the dataframe so that columns that contain the same other number become one column where the value stays as the other number if the other number was present in every row, otherwise it turns to zero.
So for instance, I would want the dataframe above to look like
..1 ..2 ..3
1 2 3
0 2 0
0 0 0
1 0 0
The first row of the first column is 1 because both original columns that contain 1 had a 1 in that row. The second row of the first column is 0 because, in that row of the original, one of those columns held a 1 and the other a 0.
Here is some reproducible data:
structure(list(...1 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...2 = c(1, 0,
0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
0, 0, 0, 0, 0), ...3 = c(2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...4 = c(3,
0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0, 0, 0, 3, 0, 0, 3, 0, 0,
0, 0, 0, 0, 0, 0), ...5 = c(3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), ...6 = c(3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA,
-28L), class = "data.frame")
Here is a possible solution in base R, where dat is the data frame you provide in the question. We find the unique value for each column, assuming there is only one nonzero value in each column. Then we loop through the groups of columns with each unique value, applying the function all() to each row of the subsetted dataframe to identify rows with all nonzero values. Multiply the resulting logical vector by the value itself to get the desired result. Then store this vector in a list and bind to a data frame.
# Nonzero value carried by each column. Each column is described as holding 0
# plus one other number; taking the maximum assumes that number is positive.
col_vals <- vapply(dat, max, numeric(1))
# Build one output column per distinct value. lapply() replaces growing a list
# inside a for loop.
columns <- lapply(unique(col_vals), function(val) {
  group <- dat[, col_vals == val, drop = FALSE]
  # Compare with 0 before calling all(): all() on raw numbers works but warns
  # about coercing double to logical on every row.
  val * apply(group != 0, 1, all)
})
as.data.frame(do.call(cbind, columns))
case1: 0, 0, 0, 0, 1, 1, 1, 1, 0, 0
case2: 1, 1, 0, 0, 1, 0, 0, 0, 1, 0
case3: 0, 1, 0, 0, 0, 1, 1, 0, 0, 1
I would like to get the following vectors from the table above.
answer: 2, mix, 0, 0, mix, mix, mix, 1, 2, 3
How do I solve the above problem in r?
I made my own function to solve the above problem.
I hope it helps people who have the same problems as me.
# Collapse a set of indicator columns into a single categorical vector.
#
# A cell counts as "set" when it is not NA.
#
# mydata:      data frame (or matrix with column names) holding the indicators.
# column_area: columns to collapse, given as positions or names.
#
# Returns a character vector with one entry per row of mydata:
#   - the name of the column that is set, when exactly one is set;
#   - "mix" when two or more are set;
#   - NA when none is set.
dummy_to_cate <- function(mydata, column_area) {
  stopifnot(length(column_area) >= 1L)

  # drop = FALSE keeps a two-dimensional object even for a single column, so
  # rowSums() and colnames() below always work.
  picked <- mydata[, column_area, drop = FALSE]
  present <- !is.na(picked)
  labels <- colnames(picked)
  n_present <- rowSums(present)

  result_vec <- rep(NA_character_, nrow(picked))
  # Label rows by the selected columns themselves. The earlier version looped
  # over the values of column_area and then indexed column_area with them, so
  # it only worked when column_area was 1:k.
  for (j in seq_along(labels)) {
    result_vec[n_present == 1 & present[, j]] <- labels[j]
  }
  result_vec[n_present >= 2] <- "mix"
  result_vec
}
Maybe you can try the code below
# For every position across c1, c2 and c3, report which vector is set to 1:
# 0 when none is, the vector's row index when exactly one is, and "mix" when
# several are.
sapply(asplit(rbind(c1, c2, c3), 2), function(flags) {
  hits <- which(flags == 1)
  n_hits <- length(hits)
  if (n_hits == 0) {
    0
  } else if (n_hits == 1) {
    # Strip the row name that which() carries so the result stays unnamed.
    unname(hits)
  } else {
    "mix"
  }
})
which gives
[1] "2" "mix" "0" "0" "mix" "mix" "mix" "1" "2" "3"
Data
c1 <- c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0)
c2 <- c(1, 1, 0, 0, 1, 0, 0, 0, 1, 0)
c3 <- c(0, 1, 0, 0, 0, 1, 1, 0, 0, 1)
Consider this data:
df <- structure(list(V1 = c(1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0), V2 = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0), V3 = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1), V4 = c(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 0, 0, 0), V5 = c(1, 1, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1), V6 = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 0, 0), V7 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0), V8 = c(1, 1, 1, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0), V9 = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1)), class = "data.frame", row.names = c(NA, -46L))
I would like to apply the following functions for all possible pairs in df.
Applying to the pair V1 and V2:
# Agreement indicator for the pair: 1 when V1 and V2 are both 1 or both 0.
df$V1V2 <- (df$V1 * df$V2) + (1 - df$V1) * (1 - df$V2)
# Share of agreeing rows; 46 is the number of rows in df.
# NOTE(review): this creates a variable named `sum` alongside the base function.
sum <- (sum(df$V1V2)/46)
# Scale each variable by its own standard deviation before regressing.
df$VD <- (df$V1/sd(df$V1))
df$VI <- (df$V2/sd(df$V2))
est <- lm(df$VD ~ df$VI)
summary(est)
# One-row holder for the results; the slope's p-value (second coefficient row)
# is stored under the pair's name.
ndf <- data.frame(NA)
ndf$V1V2 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
Applying to the pair V1 and V3:
# Same steps as for the V1/V2 pair, now with V3 as the partner variable.
# Agreement indicator: 1 when V1 and V3 are both 1 or both 0.
df$V1V3 <- (df$V1 * df$V3) + (1 - df$V1) * (1 - df$V3)
# Share of agreeing rows; 46 is the number of rows in df.
sum <- (sum(df$V1V3)/46)
# VD and VI are overwritten with the values for this pair.
df$VD <- (df$V1/sd(df$V1))
df$VI <- (df$V3/sd(df$V3))
est <- lm(df$VD ~ df$VI)
summary(est)
# Store the slope's p-value (second coefficient row) under the pair's name.
ndf$V1V3 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
I could apply this to all other pairs of V1 (i.e. V3, V4, V5, V6, V7, V8, and V9). Nonetheless, I am sure this is not the best approach.
UPDATE
This is how far I got:
# Lay the columns out as interleaved pairs: V1 followed by each of V2..V9.
dfV1 <- df[, c("V1", "V2", "V1", "V3", "V1", "V4", "V1", "V5", "V1", "V6", "V1", "V7", "V1", "V8", "V1", "V9")]
# Set the column names explicitly, keeping the repeated "V1"
# (presumably because selecting a column several times alters the repeated names).
colnames(dfV1) <- c("V1", "V2", "V1", "V3", "V1", "V4", "V1", "V5", "V1", "V6", "V1", "V7", "V1", "V8", "V1", "V9")
# Cut the wide frame into two-column pieces: columns i and i + 1 for every odd
# i, with pmin() capping the end at the last column.
sep <- lapply(seq(1, ncol(dfV1), by=2), function(i)
dfV1[i: pmin((i+1), ncol(dfV1))])
# Give each pair its own variable, named after the two columns it holds.
V1V2 <- sep[[1]]
V1V3 <- sep[[2]]
V1V4 <- sep[[3]]
V1V5 <- sep[[4]]
V1V6 <- sep[[5]]
V1V7 <- sep[[6]]
V1V8 <- sep[[7]]
V1V9 <- sep[[8]]
# tibble::lst() names each list element after the variable passed in.
list_V1 <- tibble::lst(V1V2, V1V3, V1V4, V1V5, V1V6, V1V7, V1V8, V1V9)
library(dplyr)
# Add the pairwise helper columns to a two-column data frame.
#
# x: data frame whose first two columns are the 0/1 variables of one pair.
#
# Returns x with three columns added:
#   First - agreement indicator, 1 when both variables are 1 or both are 0;
#   VD    - first column divided by its standard deviation;
#   VI    - second column divided by its standard deviation.
my_func <- function(x) {
  first_var <- x[[1]]
  second_var <- x[[2]]
  # The second term is (1 - a) * (1 - b). The earlier version had the
  # parenthesis misplaced and computed 1 - a * (1 - b) instead.
  x$First <- first_var * second_var + (1 - first_var) * (1 - second_var)
  x$VD <- first_var / sd(first_var)
  x$VI <- second_var / sd(second_var)
  x
}
# Run my_func on every pair; res keeps the names of list_V1.
res <- lapply(list_V1, my_func)
# Copy each element of res into the global environment under its name,
# replacing the V1V2 ... V1V9 variables with the augmented versions.
list2env(res, .GlobalEnv)
# Collect the First column of every pair side by side.
df.IC.V1 <- cbind.data.frame(V1V2$First, V1V3$First, V1V4$First, V1V5$First, V1V6$First, V1V7$First, V1V8$First, V1V9$First)
# Column totals divided by 46, the number of rows in df.
IC.all.V1 <- data.frame(colSums(df.IC.V1)/46)
I do not know how to apply this part to the list dfV1:
est <- lm(df$VD ~ df$VI)
summary(est)
ndf$V1V3 <- summary(est)[["coefficients"]][, "Pr(>|t|)"][2]
Avoid wide data and keep data long (tidy), which helps in virtually every aspect of data science: aggregation, modeling, and plotting. Therefore, consider reshape (or the tidyverse's pivot_longer) to reformat the data to long form, then write one generalized function and call by() to run your model once for each of the 8 partner variables (V2 to V9).
Finally, you may want to separate out results of this process in a new data frame to avoid repetition of values in original due to different lengths and especially since your model outputs two rows for intercept and variable with multiple columns.
# Stack every column except the first into long form: V1 is kept as its own
# column, the stacked values go into `value`, and the name of the column each
# value came from goes into `variable`.
# NOTE(review): new.row.names supplies plain integer row names; the upper
# bound 1E4 is hard-coded.
long_df <- reshape(df, idvar="V1", varying=names(df)[-1],
times=names(df)[-1],
v.names="value", timevar="variable",
new.row.names=1:1E4, direction="long")
# Fit the standardised regression of V1 on one partner variable.
#
# sub: one slice of the long data with columns V1, value and variable, where
#      the slice holds a single partner variable.
#
# Returns a data frame with one row per model coefficient: the partner
# variable's name followed by the columns of the coefficient table from
# summary(lm).
# Side effect: prints the full model summary.
pairwise_model <- function(sub) {
  # Agreement indicator: 1 when V1 and the partner are both 1 or both 0.
  # The second term pairs (1 - V1) with (1 - value); the earlier version used
  # (1 - value) twice.
  sub$vpair <- (sub$V1 * sub$value) + (1 - sub$V1) * (1 - sub$value)
  # Share of agreeing rows, using the slice's own row count instead of a
  # hard-coded 46. Computed to mirror the manual walkthrough; it is not part
  # of the returned data frame.
  sum_v <- sum(sub$vpair) / nrow(sub)

  # Scale both variables by their standard deviations before regressing.
  sub$VD <- sub$V1 / sd(sub$V1)
  sub$VI <- sub$value / sd(sub$value)

  est <- lm(VD ~ VI, data = sub)
  print(summary(est))

  ndf <- data.frame(variable = sub$variable[[1]],
                    summary(est)[["coefficients"]])
  return(ndf)
}
# Split long_df by partner variable and run pairwise_model on each slice.
df_list <- by(long_df, long_df$variable, pairwise_model)
# Stack the per-variable coefficient tables into one data frame.
results_df <- do.call(rbind, df_list)
results_df
# variable Estimate Std..Error t.value Pr...t..
# V2.(Intercept) V2 0.70897157 0.1697361 4.1769050 1.376794e-04
# V2.VI V2 0.20546559 0.1475392 1.3926169 1.707334e-01
# V3.(Intercept) V3 0.00000000 0.4324256 0.0000000 1.000000e+00
# V3.VI V3 0.29294628 0.1441419 2.0323471 4.818178e-02
# V4.(Intercept) V4 0.59719677 0.1461565 4.0860091 1.829775e-04
# V4.VI V4 0.47663445 0.1325296 3.5964390 8.118808e-04
# V5.(Intercept) V5 0.77259722 0.1604268 4.8158874 1.766754e-05
# V5.VI V5 0.13627939 0.1493492 0.9124883 3.664845e-01
# V6.(Intercept) V6 0.68720490 0.1557572 4.4120255 6.533736e-05
# V6.VI V6 0.31399751 0.1431310 2.1937768 3.357833e-02
# V7.(Intercept) V7 0.57392936 0.1516681 3.7841150 4.627413e-04
# V7.VI V7 0.46128020 0.1337587 3.4486009 1.253140e-03
# V8.(Intercept) V8 0.90047538 0.1869717 4.8161045 1.765498e-05
# V8.VI V8 -0.09345783 0.1500959 -0.6226544 5.367256e-01
# V9.(Intercept) V9 0.60871296 0.1645124 3.7001029 5.960399e-04
# V9.VI V9 0.35598290 0.1408800 2.5268512 1.517916e-02
Online Demo
I have a 3185x90 dataset of binary values and want to do a chi-squared test of independence, comparing all column variables against each other.
I've tried different variations of code from Google searches, using chisq.test() and some for loops, but none of them have worked so far.
How do I do this?
This is the frame I've tinkered with. My dataset is oak.
chi_trial <- data.frame(a = c(0,1), b = c(0,1))
for(row in 1:nrow(oak)){
print(row)
print(chisq.test(c(oak[row,1],d[row,2])))
}
I also tried this:
apply(d, 1, chisq.test)
which gives me the error: Error in FUN(newX[, i], ...) :
all entries of 'x' must be nonnegative and finite
dput(oak[1:2],)
structure(list(post_flu = structure(c(1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
label = "Receipt of Flu Vaccine - Encounter Survey", format.stata = "%10.0g")), row.names = c(NA,
-3185L), class = c("tbl_df", "tbl", "data.frame"), label = "Main Oakland Clinic Analysis Dataset")
I added a sample of my data with the final lines of the output. The portion of the dataset is small, but it all looks like this.
You could use something like the code below, which is similar to R's cor function. I don't have your data, so I'm simulating some. Note that I get one significant p-value, using the traditional cut-off of 0.05.
# Simulate a stand-in for the real data: nr rows by nc columns of 0/1 draws,
# with a fixed seed so the example is reproducible.
set.seed(3)
nr <- 3185
nc <- 3
oak <- as.data.frame(
  matrix(
    sample(c(0L, 1L), size = nr * nc, replace = TRUE),
    nrow = nr,
    ncol = nc
  )
)
oak
# Pairwise chi-squared tests of independence between all columns.
#
# data: data frame (or anything as.data.frame() accepts) of categorical
#       columns.
#
# Returns a square numeric matrix with one row and one column per column of
# data. Cell [i, j] with i < j holds the p-value of chisq.test() for columns
# i and j; the diagonal and lower triangle stay 0.
mult.chi <- function(data) {
  data <- as.data.frame(data)
  nc <- ncol(data)
  res <- matrix(0, nrow = nc, ncol = nc) # or NA
  # seq_len() yields an empty loop when there are fewer than two columns,
  # where 1:(nc - 1) would count down instead.
  for (i in seq_len(max(nc - 1L, 0L))) {
    for (j in (i + 1L):nc) {
      # Test the columns of the argument. The earlier version read the global
      # `oak` here, so it ignored whatever was passed in.
      res[i, j] <- suppressWarnings(
        chisq.test(data[[i]], data[[j]])$p.value
      )
    }
  }
  rownames(res) <- colnames(data)
  colnames(res) <- colnames(data)
  res
}
mult.chi(oak)
# V1 V2 V3
# V1 0 0.7847063 0.32012466
# V2 0 0.0000000 0.01410326
# V3 0 0.0000000 0.00000000
So consider applying a multiple testing adjustment as mentioned in the comments.
Here is a solution that uses combn to get all combinations of column numbers taken 2 at a time. Tested with the data in @Edward's answer.
# Pairwise chi-squared tests between all columns of a data frame.
#
# X: data frame of categorical columns.
#
# Returns a square numeric matrix with ncol(X) rows and columns. Cell [i, j]
# with i < j holds the p-value of chisq.test() on the contingency table of
# columns i and j; the diagonal and lower triangle stay 0.
chisq2cols <- function(X) {
  nc <- ncol(X)
  y <- matrix(0, nc, nc)
  # combn() needs at least two columns to form a pair.
  if (nc < 2L) {
    return(y)
  }
  cmb <- combn(nc, 2)
  p_values <- apply(cmb, 2, function(k) {
    tbl <- table(X[k])
    chisq.test(tbl)$p.value
  })
  # Place each p-value by its own (i, j) pair. Assigning through
  # upper.tri(y) fills in column-major order, which differs from combn()'s
  # pair order once there are four or more columns.
  y[t(cmb)] <- p_values
  y
}
chisq2cols(oak)
# [,1] [,2] [,3]
#[1,] 0 0.7847063 0.32012466
#[2,] 0 0.0000000 0.01410326
#[3,] 0 0.0000000 0.00000000
I'm using R to complete some GA driven searches.
Returned from my GA script is the resulting chromosome, returned as a binary numeric of length 40.
An example is: c(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0).
I also have a corresponding data frame with 40 columns.
Using the data in the numeric vector, how do I efficiently build a (or re-build the) data frame so that it contains only those columns represented by the 1's in my numeric vector?
Building a sample data.frame and assigning your sample vector to x:
# Sample data: a 10 x 40 data frame of random integers between 1 and 100.
# replace is spelled TRUE because the shorthand T is an ordinary variable
# that can be reassigned.
df <- as.data.frame(matrix(sample(1:100, 400, replace = TRUE), ncol = 40))
# Binary selection vector with one entry per column of df (1 = keep).
x <- c(0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0)
I can subset:
df[ ,x==1]
or:
df[, as.logical(x)]