Warning writing table output as data.frame - r

I have a data.frame that looks like this:
C1 C2 C3 C4
1 -1 -1 1
1 1 -1 1
1 1 -1 1
1 -1 1 -1
I would like to count the -1 and 1 occurrences by column, so I used:
tab <- apply(DF, 2, table)
Then I used the following line:
final <- as.data.frame(do.call("cbind", tab))
to write the result as a data.frame. Unfortunately, it gives an error because of the first element:
tab[[1]]
1
4
tab[[2]]
-1 1
2 2
..........
So I would like to add a 0 count for -1 to tab[[1]] so that I can write the results as a data.frame.
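A minimal sketch of the fix described above: tabulate each column as a factor with both levels fixed, so zero counts are kept and every element of the list has the same shape.
# keep both levels so the -1 count of 0 is not dropped
tab <- lapply(DF, function(x) table(factor(x, levels = c(-1, 1))))
final <- as.data.frame(do.call("cbind", tab))
final
#    C1 C2 C3 C4
# -1  0  2  3  1
# 1   4  2  1  3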

And a third way:
x <- read.table(text = "C1 C2 C3 C4
1 -1 -1 1
1 1 -1 1
1 1 -1 1
1 -1 1 -1 ", header = TRUE)
sapply(sapply(x, factor, levels = c(1, -1), simplify = FALSE), table)
   C1 C2 C3 C4
1   4  2  1  3
-1  0  2  3  1
Some benchmarking:
xx <- as.data.frame(matrix(sample(c(-1,1), 1e7, replace=TRUE), ncol=100))
Roland <- function(DF) {
  res <- table(stack(DF))
  res2 <- as.data.frame(res)
  reshape(res2, timevar = "ind", idvar = "values", direction = "wide")
}
Roman <- function(x) {
  sapply(sapply(x, factor, levels = c(1, -1), simplify = FALSE), table)
}
user20650 <- function(x) {
  rbind(colSums(x == 1), colSums(x == -1))
}
require(microbenchmark)
microbenchmark(m1 <- Roland(xx), m2 <- Roman(xx), m3 <- user20650(xx), times = 2)
Unit: milliseconds
                expr        min         lq     median         uq        max neval
    m1 <- Roland(xx) 17624.6297 17624.6297 18116.6595 18608.6893 18608.6893     2
     m2 <- Roman(xx) 13838.2030 13838.2030 14301.9159 14765.6288 14765.6288     2
 m3 <- user20650(xx)   786.3689   786.3689   788.7253   791.0818   791.0818     2
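The colSums() version wins by a wide margin because it makes one fully vectorized pass over the matrix per level. A sketch of the same idea generalized to an arbitrary set of levels (the levs vector and counts name here are illustrative, not from the original answers):
# one vectorized colSums() pass per level; rows are levels, columns are variables
levs <- c(-1, 1)
counts <- t(vapply(levs, function(v) colSums(xx == v), numeric(ncol(xx))))
rownames(counts) <- levs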

DF <- read.table(text="C1 C2 C3 C4
1 -1 -1 1
1 1 -1 1
1 1 -1 1
1 -1 1 -1 ",header=TRUE)
res <- table(stack(DF))
# ind
# values C1 C2 C3 C4
# -1 0 2 3 1
# 1 4 2 1 3
res2 <- as.data.frame(res)
# values ind Freq
# 1 -1 C1 0
# 2 1 C1 4
# 3 -1 C2 2
# 4 1 C2 2
# 5 -1 C3 3
# 6 1 C3 1
# 7 -1 C4 1
# 8 1 C4 3
reshape(res2, timevar = "ind", idvar = "values", direction = "wide")
# values Freq.C1 Freq.C2 Freq.C3 Freq.C4
# 1 -1 0 2 3 1
# 2 1 4 2 1 3
An alternative is res <- ftable(stack(DF)), which can be written to a file directly using write.ftable.
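For completeness, a sketch of that route (the file name is only a placeholder):
res <- ftable(stack(DF))
write.ftable(res, file = "counts.txt", quote = FALSE)  # plain-text flat table on disk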

Related

pairwise subtraction of columns in a dataframe in R

Is there a way to automate (e.g., with a loop) the subtractions (X2-X1), (X3-X1), (X3-X2) in my data below and add the results as three new columns?
m="
id X1 X2 X3
A 1 0 4
B 2 2 2
C 3 4 1"
data <- read.table(text = m, h = T)
This is very similar to this question; we basically just need to change the function that we are using in map2_dfc:
library(tidyverse)
combn(names(data)[-1], 2) %>%
  map2_dfc(.x = .[1, ], .y = .[2, ],
           .f = ~ transmute(data, !!paste0(.y, "-", .x) := !!sym(.y) - !!sym(.x))) %>%
  bind_cols(data, .)
#> id X1 X2 X3 X2-X1 X3-X1 X3-X2
#> 1 A 1 0 4 -1 3 4
#> 2 B 2 2 2 0 0 0
#> 3 C 3 4 1 1 -2 -3
With combn:
dif <- combn(data[-1], 2, \(x) x[, 2] - x[, 1])
colnames(dif) <- combn(names(data)[-1], 2, \(x) paste(x[2], x[1], sep = "-"))
cbind(data, dif)
# id X1 X2 X3 X2-X1 X3-X1 X3-X2
#1 A 1 0 4 -1 3 4
#2 B 2 2 2 0 0 0
#3 C 3 4 1 1 -2 -3
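Since the question also asks about a loop, here is a plain nested for loop (a sketch) that builds the same three columns:
cols <- names(data)[-1]
for (i in 1:(length(cols) - 1)) {
  for (j in (i + 1):length(cols)) {
    # one new column per unordered pair, named like "X2-X1"
    data[[paste0(cols[j], "-", cols[i])]] <- data[[cols[j]]] - data[[cols[i]]]
  }
}
data
#   id X1 X2 X3 X2-X1 X3-X1 X3-X2
# 1  A  1  0  4    -1     3     4
# 2  B  2  2  2     0     0     0
# 3  C  3  4  1     1    -2    -3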

Permute columns of a square 2-way contingency table (matrix) to maximize its diagonal

After clustering, the labels found are arbitrary. If the ground truth is available, one can compute a contingency table to see which found labels are most related to the original classes.
I want to automatically permute columns of a contingency table to maximize its diagonal. For example:
# Ground-truth labels
c1 = c(1,1,1,1,1,2,2,2,3,3,3,3,3,3,3)
# Labels found
c2 = c(3,3,3,3,1,1,1,1,2,2,2,3,2,2,1)
# Labels found but renamed correctly
c3 = c(1,1,1,1,2,2,2,2,3,3,3,1,3,3,2)
# Current output
tab1 <- table(c1,c2)
# c2
#c1 1 2 3
# 1 1 0 4
# 2 3 0 0
# 3 1 5 1
# Desired output
tab2 <- table(c1,c3)
# c3
#c1 1 2 3
# 1 4 1 0
# 2 0 3 0
# 3 1 1 5
In reality, c3 is not available. Is there an easy way to obtain c3 and tab2 from c2 and tab1?
c1 <- c(1,1,1,1,1,2,2,2,3,3,3,3,3,3,3)
c2 <- c(3,3,3,3,1,1,1,1,2,2,2,3,2,2,1)
## table works with factor variables internally
c1 <- as.factor(c1)
c2 <- as.factor(c2)
tab1 <- table(c1, c2)
# c2
# c1 1 2 3
# 1 1 0 4
# 2 3 0 0
# 3 1 5 1
Your question is essentially how to re-level c2 so that the maximum value of each row sits on the main diagonal. In terms of matrix operations, this is a column permutation.
## find column permutation index
## this can potentially be buggy if there are multiple maxima on a row
## because `sig` may then not be a permutation index vector
## A simple example is:
## tab1 <- matrix(5, 3, 3); max.col(tab1, "first")
sig <- max.col(tab1, "first")
#[1] 3 1 2
## re-level `c2` (create `c3`)
c3 <- factor(c2, levels = levels(c2)[sig])
## create new contingency table
table(c1, c3)
# c3
#c1 3 1 2
# 1 4 1 0
# 2 0 3 0
# 3 1 1 5
## if creation of `c3` is not necessary, just do
tab1[, sig]
# c2
#c1 3 1 2
# 1 4 1 0
# 2 0 3 0
# 3 1 1 5
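If ties within a row are a concern, one alternative (a sketch, assuming the clue package is installed) is to solve the underlying assignment problem, which always returns a genuine permutation:
library(clue)
# solve_LSAP() maximizes the sum of the matched entries, i.e. the diagonal after permutation
sig <- as.integer(solve_LSAP(tab1, maximum = TRUE))
tab1[, sig]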

Special occurrence counting in a data table

(Preamble)
I'm not sure this is the right place for this question... it is really a problem-solving/optimization issue about counting over a table. If it isn't, my apologies, and I'll accept the downvotes.
Here's the data frame
dat <- data.frame(id=letters[1:5],matrix(c(0,0,1,0,0, 0,1,0,1,1, 0,0,2,1,0, 1,0,2,1,1, 0,0,2,0,0, 0,1,2,1,0),5,6))
#
# id X1 X2 X3 X4 X5 X6
# 1 a 0 0 0 1 0 0
# 2 b 0 1 0 0 0 1
# 3 c 1 0 2 2 2 2
# 4 d 0 1 1 1 0 1
# 5 e 0 1 0 1 0 0
I would like to count, along every row, how many times the value reaches 1 and how many times it goes from 1 to 0, so the final result should be
# id N1 N0
# a 1 1
# b 2 1
# c 1 1
# d 2 1
# e 2 2
I actually found an algorithm, but it is more C/FORTRAN style (see below), and I can't believe there isn't an easier and more elegant way to get this in R. Thanks a lot for any help or hint.
nr <- nrow(dat)
nc <- ncol(dat)
rownames(dat) <- seq(1, nr, 1)
colnames(dat) <- seq(1, nc, 1)
dat$N1 <- NULL
dat$N0 <- NULL
for (i in 1:nr) {
  n1 <- 0
  n0 <- 0
  j <- 2
  while (!(j > nc)) {
    k <- j
    if (dat[i, k] == 1) {            # reached a 1: count it
      n1 <- n1 + 1
      k <- j + 1
      while (!(k > nc)) {            # scan forward for the next 0
        if (dat[i, k] == 0) {
          n0 <- n0 + 1
          break
        }
        k <- k + 1
      }
    }
    j <- k
    j <- j + 1
  }
  dat$N1[i] <- n1
  dat$N0[i] <- n0
}
Not sure if I totally got it, but you can try:
cbind(dat["id"], N1 = rowSums(dat[, 3:7] == 1 & dat[, 2:6] != 1) + (dat[, 2] == 1),
                 N0 = rowSums(dat[, 3:7] == 0 & dat[, 2:6] == 1))
#   id N1 N0
# 1  a  1  1
# 2  b  2  1
# 3  c  1  1
# 4  d  2  1
# 5  e  2  2
Here's another way, using rle wrapped in data.table syntax:
library(data.table)
setDT(dat)
melt(dat, id="id")[, with(rle(value), list(
  n1    = sum(values == 1),
  n1to0 = sum("10" == do.call(paste0, shift(values, 1:0, fill=0)))
)), by=id]
# id n1 n1to0
# 1: a 1 1
# 2: b 2 1
# 3: c 1 1
# 4: d 2 1
# 5: e 2 2
Notes.
shift with n=1:0 returns the lagged vector (lag of 1) and the vector itself (lag of 0).
melt creates a value column; and rle contains a values vector.
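A base-R sketch of the same rle() idea, applied row by row (this assumes dat is still the plain data.frame built in the question, i.e. before setDT(), and, like the answer above, it only counts direct 1 -> 0 changes):
counts <- t(apply(dat[-1], 1, function(x) {
  r <- rle(x)$values                           # run values along the row
  c(N1 = sum(r == 1),                          # times the row reaches 1
    N0 = sum(r[-length(r)] == 1 & r[-1] == 0)) # direct 1 -> 0 transitions
}))
cbind(dat["id"], counts)
#   id N1 N0
# 1  a  1  1
# 2  b  2  1
# 3  c  1  1
# 4  d  2  1
# 5  e  2  2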

Selecting a subset of rows where a % of the values meet the threshold

I have a dataframe with values in rows and samples in columns (two groups, A and B). Example df:
df <- rbind(rep(1, times = 10),
            c(rep(1, times = 9), 2),
            c(rep(1, times = 8), rep(2, times = 2)),
            c(rep(1, times = 7), rep(2, times = 3)),
            rep(1, times = 10),
            c(rep(1, times = 9), 2),
            c(rep(1, times = 8), rep(2, times = 2)),
            c(rep(2, times = 7), rep(1, times = 3)))
colnames(df) <- c("A1", "A2", "A3", "A4", "A5",
                  "B1", "B2", "B3", "B4", "B5")
row.names(df) <- 1:8
I have been selecting the subset of rows where all samples are below a certain threshold using the following:
selected <- apply(df, MARGIN = 1, function(x) all(x < 1.5))
df.sel <- df[selected,]
The result of this is
df[c(1,5),]
I require two further types of selection. The first is to select, for example, all rows where at least 90% of the samples are below the threshold value of 1.5. The result of this should be:
df[c(1,2,5,6),]
The second is to select by group, say rows where at least 50% of the values in at least one of the groups are greater than 1.5. This should give me the following df:
df[c(4,8),]
I am new to Stack Overflow and I have been asked in the past to include an example. I hope this one is good!
df[!rowSums(df >= 1.5),]
## A1 A2 A3 A4 A5 B1 B2 B3 B4 B5
## 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1
df[rowMeans(df < 1.5) >= 0.9,]
## A1 A2 A3 A4 A5 B1 B2 B3 B4 B5
## 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 2
## 5 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 2
idx <- apply(df, 1, function(x) {
  any(tapply(x, gsub("[0-9]", "", names(x)), function(y) mean(y > 1.5)) > 0.5)
})
df[idx,]
## A1 A2 A3 A4 A5 B1 B2 B3 B4 B5
## 4 1 1 1 1 1 1 1 2 2 2
## 8 2 2 2 2 2 2 2 1 1 1
In your specific case of nearly all-ones, you can do all this with rowMeans or colMeans. (There is also plyr::colwise for more complicated stuff).
To select the subset of rows where all samples are below a certain threshold:
df[rowMeans(df) < 1.5,]
To select all rows where at least 90% of samples are below the threshold value of 1.5
(this would be much easier if we could exploit the fact that the only other value is 2):
You can directly count the proportion of '1' entries with:
> apply(df, 1, function(x) sum(x==1)) /ncol(df)
1 2 3 4 5 6 7 8
1.0 0.9 0.8 0.7 1.0 0.9 0.8 0.3
Thus to get the row-indices you want:
> apply(df, 1, function(x) sum(x==1)) /ncol(df) >= 0.9
1 2 3 4 5 6 7 8
TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE
and the row-slice you want:
> df[ apply(df, 1, function(x) sum(x==1)) /ncol(df) >= 0.9 , ]
A1 A2 A3 A4 A5 B1 B2 B3 B4 B5
1 1 1 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1 1 2
5 1 1 1 1 1 1 1 1 1 1
6 1 1 1 1 1 1 1 1 1 2
The second is to select by group: say, rows where at least 50% of the values in at least one of the groups are greater than 1.5.
Unless I misunderstand what you meant by 'at least one of the groups',
your example's wrong. Row 4 doesn't qualify, only row 8.
Again, you could either cheat with rowSums, or else:
> apply(df, 1, function(x) sum(x>=1.5)) /ncol(df) >= 0.5
1 2 3 4 5 6 7 8
FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
That only gets you row 8 not 4, so have I misunderstood you?
(Jake Burhead clarifies you are doing hierarchical indexing by string name of column. See his solution, there's no point in me reproducing it.)
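For reference, a rowMeans-based sketch of the group-wise test, assuming the group is encoded in the column-name prefix (A or B) as in the example:
grpA <- grepl("^A", colnames(df))           # TRUE for the group-A columns
sel  <- rowMeans(df[, grpA] > 1.5) >= 0.5 | # >= 50% above 1.5 in group A ...
        rowMeans(df[, !grpA] > 1.5) >= 0.5  # ... or in group B
df[sel, ]
#   A1 A2 A3 A4 A5 B1 B2 B3 B4 B5
# 4  1  1  1  1  1  1  1  2  2  2
# 8  2  2  2  2  2  2  2  1  1  1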

Create "contingency" table with multi-rows

Let's consider this dataset, where the first field is a bill number and the second one is the name of a product:
df=data.frame(bill=c(1,1,1,1,2,2,2,2,3,3),product=c("A","B","C","B","A","C","E","D","C","D"))
I would like to count the number of bills containing each combination of two products, for example in this case a result like this (I don't want to keep combinations where the count is 0):
# prod1 prod2 count
# A B 1
# A C 2
# A D 1
# A E 1
# B C 1
# C D 2
# C E 1
# D E 1
I have a solution with loops, but it's really not pretty (and slow!):
products <- sort(unique(df$product))
bills_list <- list()
for (i in 1:length(products)) {
  bills_list[[i]] <- unique(df[which(df$product == products[i]), "bill"])
}
df2 <- data.frame(prod1 = character(0), prod2 = character(0), count = numeric(0))
for (i in 1:(length(products) - 1)) {
  for (j in (i + 1):length(products)) {
    Nij <- length(intersect(bills_list[[i]], bills_list[[j]]))
    if (Nij > 0) {
      temp <- data.frame(prod1 = products[i], prod2 = products[j], count = Nij)
      df2 <- rbind(df2, temp)
    }
  }
}
Is there a way to do this without loops?
Thank you for your time.
Here's a solution with plyr and data.table.
# needed packages
require(plyr)
require(data.table)
# find the combinations in each of the bills
combs <- ddply(df, .(bill), function(x) {
  t(combn(unique(as.character(x$product)), 2))
})
colnames(combs) <- c("bill", "prod1", "prod2")
# combine these
res <- data.table(combs, key=c("prod1", "prod2"))[, .N, by=list(prod1, prod2)]
library(reshape2)
df$product <- as.character(df$product)
products <- t(combn(unique(df$product), 2))
dat <- dcast(bill ~ product, data = df)
## bill A B C D E
## 1 1 1 2 1 0 0
## 2 2 1 0 1 1 1
## 3 3 0 0 1 1 0
out <- structure(
  data.frame(products, apply(products, 1, function(x) sum(rowSums(dat[x] > 0) == 2))),
  names = c("prod1", "prod2", "count")
)
out[out$count != 0,]
## prod1 prod2 count
## 1 A B 1
## 2 A C 2
## 3 A E 1
## 4 A D 1
## 5 B C 1
## 8 C E 1
## 9 C D 2
## 10 E D 1
Here's another approach:
library(qdap)
dat <- unlist(lapply(split(df$product, df$bill), function(x) {
  y <- outer(unique(x), unique(x), paste)
  unlist(y[upper.tri(y)])
}))
dat2 <- data.frame(table(dat), stringsAsFactors = FALSE)
colsplit2df(dat2, sep=" ", new.names=paste0("prod", 1:2))
## prod1 prod2 Freq
## 1 A B 1
## 2 A C 2
## 3 A D 1
## 4 A E 1
## 5 B C 1
## 6 C D 2
## 7 C E 1
## 8 E D 1
res <- table(df$bill, df$product)
##> res
##
## A B C D E
## 1 1 2 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
res2 <- ifelse(res > 0, 1, 0)
##> res2
##
## A B C D E
## 1 1 1 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
cor(res2)
##
## A B C D E
##A 1.0 0.5 NA -0.5 0.5
##B 0.5 1.0 NA -1.0 -0.5
##C NA NA 1 NA NA
##D -0.5 -1.0 NA 1.0 0.5
##E 0.5 -0.5 NA 0.5 1.0
##Warning message:
##In cor(res2) : the standard deviation is zero
I realize this does not answer the question you asked.
But it may get you closer to the answer that, presumably, you seek: namely, what is the impact of a customer ordering one product on the likelihood (positive or negative) that they will order one of the others.
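Another compact base-R option (a sketch) is to binarize the bill-by-product table and use crossprod(), which counts, for every pair of products, the number of bills containing both:
inc <- table(df$bill, df$product) > 0      # does bill i contain product j?
co  <- crossprod(inc)                      # product-by-product co-occurrence counts
pairs <- as.data.frame(as.table(co), stringsAsFactors = FALSE)
names(pairs) <- c("prod1", "prod2", "count")
subset(pairs, prod1 < prod2 & count > 0)   # keep each unordered pair once
#    prod1 prod2 count
# 6      A     B     1
# 11     A     C     2
# 12     B     C     1
# 16     A     D     1
# 18     C     D     2
# 21     A     E     1
# 23     C     E     1
# 24     D     E     1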
