Create "contingency" table with multi-rows - r

Let's consider this dataset, where the first field is a bill number and the second one is the name of a product :
df=data.frame(bill=c(1,1,1,1,2,2,2,2,3,3),product=c("A","B","C","B","A","C","E","D","C","D"))
I would like to count the number of bills containing each combination of two products, for example in this case a result like this (I don't want to keep combinations where count is 0) :
# prod1 prod2 count
# A B 1
# A C 2
# A D 1
# A E 1
# B C 1
# C D 2
# C E 1
# D E 1
I have a solution with loops but it's really not pretty (and slow !):
products=sort(unique(df$product))
bills_list=list()
for (i in 1:length(products)){
bills_list[[i]]=unique(df[which(df$product==products[i]),"bill"])
}
df2=data.frame(prod1=character(0),prod2=character(0),count=numeric(0))
for (i in 1:(length(products)-1)){
for (j in (i+1):length(products)){
Nij=length(intersect(bills_list[[i]],bills_list[[j]]))
if (Nij>0){
temp=data.frame(prod1=products[i],prod2=products[j],count=Nij)
df2=rbind(df2,temp)
}
}
}
Is there a way to do this without loops ?
Thank you for your time.

Here's a solution with plyr and data.table.
# needed packages
require(plyr)
require(data.table)
# find the combinations in each of the bills
combs <- ddply(df, .(bill), function(x){
t(combn(unique(as.character(x$product)),2))
})
colnames(combs) <- c("bill", "prod1", "prod2")
# combine these
res <- data.table(combs, key=c("prod1", "prod2"))[, .N, by=list(prod1, prod2)]

library(reshape2)
df$product <- as.character(df$product)
products <- t(combn(unique(df$product), 2))
dat <- dcast(bill ~ product, data = df)
## bill A B C D E
## 1 1 1 2 1 0 0
## 2 2 1 0 1 1 1
## 3 3 0 0 1 1 0
out <- structure(
data.frame(products, apply(products, 1, function(x) sum(rowSums(dat[x] > 0) == 2) )),
names = c("prod1", "prod2", "count")
)
out[out$count != 0,]
## prod1 prod2 count
## 1 A B 1
## 2 A C 2
## 3 A E 1
## 4 A D 1
## 5 B C 1
## 8 C E 1
## 9 C D 2
## 10 E D 1

Here's another approach:
library(qdap)
dat <- unlist(lapply(split(df$product, df$bill), function(x) {
y <- outer(unique(x), unique(x), paste)
unlist(y[upper.tri(y)])
}))
dat2 <- data.frame(table(dat), stringsAsFactors = FALSE)
colsplit2df(dat2, sep=" ", new.names=paste0("prod", 1:2))
## prod1 prod2 Freq
## 1 A B 1
## 2 A C 2
## 3 A D 1
## 4 A E 1
## 5 B C 1
## 6 C D 2
## 7 C E 1
## 8 E D 1

res <- table(df$bill, df$product)
##> res
##
## A B C D E
## 1 1 2 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
res2 <- ifelse(res > 0, 1, 0)
##> res2
##
## A B C D E
## 1 1 1 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
cor(res2)
##
## A B C D E
##A 1.0 0.5 NA -0.5 0.5
##B 0.5 1.0 NA -1.0 -0.5
##C NA NA 1 NA NA
##D -0.5 -1.0 NA 1.0 0.5
##E 0.5 -0.5 NA 0.5 1.0
##Warning message:
##In cor(res2) : the standard deviation is zero
I do realize that this does not answer the question that you asked.
But, it may get you closer to the answer that, presumably, you seek. Namely, what is the impact of a customer ordering one product on the likelihood (positive or negative) that will order one of the others.

Related

In R, change the values of some items in a matrix without causing a copy of the entire matrix?

I have a "small" square matrix that I want to add to a "big" matrix. The big matrix contains all the rows and columns of the small matrix plus extras. I want to add the values where the indices are in common and just keep the values from the big one where that index is not contained in the small one. Unfortunately, all the data is copied on the addition so it takes a long time and can temporarily spike memory when the matrices are large.
I have tried adding subsets using matrices and data.frames, as well as a data.table method using rbindlist. Both the data.frame and matrix methods seem to cause a memory copy (why?) and the rbindlist method is not ideal because it requires a melt and dcast and temporarily spiking the memory by spiking the number of rows.
Is there any way to just change the values of some items in a matrix without causing a copy of the entire matrix?
Here are my attempts:
MList <- list(M1,M2)
unionCols <- Reduce(union, lapply(MList, colnames))
MTotal <- matrix(as.double(rep(0,(length(unionCols))^2)), nrow = length(unionCols))
rownames(MTotal) <- colnames(MTotal) <- unionCols
DFTotal <- as.data.frame(MTotal)
DFList <- lapply(MList, as.data.frame)
for(i in 1:length(MList)){
tracemem(MTotal)
tracemem(DFTotal)
mCol <- match(colnames(MList[[i]]), colnames(MTotal))
MTotal[mCol,mCol] <- MTotal[mCol,mCol] + MList[[i]] # this causes a copy
DFTotal[mCol,mCol] <- DFTotal[mCol,mCol] + DFList[[i]] # this causes a copy
}
M1
M2
MTotal
# rbindlist method
.AggDMCMatsSingleM2 <- function(M1, M2){
.MyMelt <- function(M){
DT <- setnames(reshape2::melt(M, id.vars = colnames(M)), c('Var1','Var2'), c('row','col'))
}
M_total <- as.matrix(data.table::dcast(rbindlist(lapply(list(M1,M2), .MyMelt)),
formula = as.formula(row ~ col),
value.var = 'value',
fun.aggregate = sum,
fill = 0),
rownames = 'row')
return(M_total)
}
M1
M2
.AggDMCMatsSingleM2(M1,M2)
If I follow what you are asking we can directly add and write to the big matrix using the bracket notation row/col names of the small matrix:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c(LETTERS[2:4]),
c(letters[2:4])))
# b c d
#B 1 4 7
#C 2 5 8
#D 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
# a b c d e
#A 1 1 1 1 1
#B 1 2 5 8 1
#C 1 3 6 9 1
#D 1 4 7 10 1
#E 1 1 1 1 1
More complex test:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c("A", "D", "C"),
c(letters[c(2:4)])))
# b c d
#A 1 4 7
#D 2 5 8
#C 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
big_matrix
# a b c d e
#A 1 2 5 8 1
#B 1 1 1 1 1
#C 1 4 7 10 1
#D 1 3 6 9 1
#E 1 1 1 1 1

Special occurrency counting in data table

(preamble)
I don't know if this is the right place for that...I actually have a problem solving/optimization issue for the counting over a table. So if it's not. very sorry and deserve the minusrating.
Here's the data frame
dat <- data.frame(id=letters[1:5],matrix(c(0,0,1,0,0, 0,1,0,1,1, 0,0,2,1,0, 1,0,2,1,1, 0,0,2,0,0, 0,1,2,1,0),5,6))
#
# id X1 X2 X3 X4 X5 X6
# 1 a 0 0 0 1 0 0
# 2 b 0 1 0 0 0 1
# 3 c 1 0 2 2 2 2
# 4 d 0 1 1 1 0 1
# 5 e 0 1 0 1 0 0
I would like to count along every row, how many times we get to 1 and how many times from 1 we go to 0. so the final results should be
# id N1 N0
# a 1 1
# b 2 1
# c 1 1
# d 2 1
# e 2 2
I actually found an algorithm but it's more C/FORTRAN style (here below) and I can't believe there's not an esaier and more elegant way to get this in R. Thanks a lot for any help or hint.
nr <- nrow(dat)
nc <- ncol(dat)
rownames(dat) <- seq(1,nr,1)
colnames(dat) <- seq(1,nc,1)
dat$N1 <- NULL
dat$N2 <- NULL
for (i in 1:nr) {
n1 <- 0
n0 <- 0
j <- 2
while (!(j>nc)) {
k <- j
if (dat[i,k] == 1) {
n1 <- n1 + 1
k <- j + 1
while (!(k>nc)) {
if (dat[i,k] == 0) {
n0 <- n0 + 1
break
}
k <- k + 1
}
}
j <- k
j <- j + 1
}
dat$N1[i] <- n1
dat$N0[i] <- n0
}
Not sure if I totally got it, but you can try:
cbind(dat["id"],N0=rowSums(dat[,3:7]==1 & dat[,2:6]!=1)+(dat[,2]==1),
N1=rowSums(dat[,3:7]==0 & dat[,2:6]==1))
# id N0 N1
#1 a 1 1
#2 b 2 1
#3 c 1 1
#4 d 2 1
#5 e 2 2
Here's another way, using rle wrapped in data.table syntax:
library(data.table)
setDT(dat)
melt(dat, id="id")[, with(rle(value), list(
n1 = sum(values==1),
n1to0 = sum("10" == do.call(paste0, shift(values, 1:0, fill=0)))
)), by=id]
# id n1 n1to0
# 1: a 1 1
# 2: b 2 1
# 3: c 1 1
# 4: d 2 1
# 5: e 2 2
Notes.
shift with n=1:0 returns the lagged vector (lag of 1) and the vector itself (lag of 0).
melt creates a value column; and rle contains a values vector.

Convert list of individuals to occurence of pairs in R

I need specific format of data.frame for social structure analysis. How to convert data.frame containing list of individuals occuring together on multiple events:
my.df <- data.frame(individual = c("A","B","C","B","C","D"),
time = rep(c("event_01","event_02"), each = 3))
individual time
1 A event_01
2 B event_01
3 C event_01
4 B event_02
5 C event_02
6 D event_02
into a data.frame containing occurence for each pairs (including [A,A]; [B,B] etc. pairs:
ind_1 ind_2 times
A A 0
A B 1
A C 1
A D 0
B A 1
B B 0
B C 2
B D 1
C A 1
C B 2
C C 0
C D 1
D A 0
D B 1
D C 1
D D 0
In base R, you could do the following:
data.frame(as.table(`diag<-`(tcrossprod(table(my.df)), 0)))
# individual individual.1 Freq
# 1 A A 0
# 2 B A 1
# 3 C A 1
# 4 D A 0
# 5 A B 1
# 6 B B 0
# 7 C B 2
# 8 D B 1
# 9 A C 1
# 10 B C 2
# 11 C C 0
# 12 D C 1
# 13 A D 0
# 14 B D 1
# 15 C D 1
# 16 D D 0
tcrossprod gives you the following:
> tcrossprod(table(my.df))
individual
individual A B C D
A 1 1 1 0
B 1 2 2 1
C 1 2 2 1
D 0 1 1 1
That's essentially all the information you are looking for, but you want it in a slightly different form, without the diagonal values.
We can set the diagonals to zero with:
`diag<-`(theOutputFromAbove, 0)
Then, to get the long form, trick R into thinking that the resulting matrix is a table by using as.table, and make use of the data.frame method for tables.
You can do:
create the first 2 variables of the new data.frame:
df2 <- expand.grid(ind_2=levels(my.df$individual), ind_1=levels(my.df$individual))[, 2:1]
Put the value to 0 for the pairs of same individuals:
df2$times[df2[, 1]==df2[, 2]] <- 0
See the other unique combinations:
comb_diff <- combn(levels(my.df$individual), 2)
compute the times each unique combination is found together:
times_uni <- apply(comb_diff, 2, function(inds){
sum(table(my.df$time[my.df$individual %in% inds])==2)
})
Finally, fill the new data.frame:
df2$times[match(c(paste0(comb_diff[1,], comb_diff[2,]), paste0(comb_diff[2, ], comb_diff[1, ])), paste0(df2[, 1],df2[, 2]))] <- rep(times_uni, 2)
df2
# ind_1 ind_2 times
#1 A A 0
#2 A B 1
#3 A C 1
#4 A D 0
#5 B A 1
#6 B B 0
#7 B C 2
#8 B D 1
#9 C A 1
#10 C B 2
#11 C C 0
#12 C D 1
#13 D A 0
#14 D B 1
#15 D C 1
#16 D D 0
You can do it using data.table
dt_combs <- my.dt[,
list(ind_1 = combn(individual, 2)[1, ],
ind_2 = combn(individual, 2)[2, ]),
by = time]
dt_ncombs <- dt_combs[, .N, by = c("ind_1", "ind_2")]
dt_ncombs_inverted <- copy(dt_ncombs)
dt_ncombs_inverted[, temp := ind_1]
dt_ncombs_inverted[, ind_1 := ind_2]
dt_ncombs_inverted[, ind_2 := temp]
dt_ncombs_inverted[, temp := NULL]
dt_ncombs <- rbind(dt_ncombs, dt_ncombs_inverted)
dt_allcombs <- data.table(expand.grid(
ind_1 = my.dt[, unique(individual)],
ind_2 = my.dt[, unique(individual)]
))
dt_final <- merge(dt_allcombs,
dt_ncombs,
all.x = TRUE,
by = c("ind_1", "ind_2"))
dt_final[is.na(N), N := 0]
dt_final

Identify and label the largest number in each group

Hi I want to identify and label the largest number for each group, can someone tell me how to get this done in r (or maybe excel would be easier)?
The following is an example data, the original data contains only the left 2 columns and I want to generate the third one. In the 3rd column, I want to label the largest value in the group as 1, e.g., in group 1, the largest is .02874 so it's marked as 1, otherwise 0. Thank you!
x <- read.table(header=T, text="group value largest
1 0.02827 0
1 0.02703 0
1 0.02874 1
2 0.03255 0
2 0.10394 1
2 0.03417 0
3 0.13858 0
3 0.16084 0
3 0.99830 1
3 0.24563 0")
UPDATE: Thank you all for your help! They all are great solutions!
Finally, the base (no package required) approach:
is.largest <- function(x) as.integer(seq_along(x) == which.max(x))
x <- transform(x, largest = ave(value, group, FUN = is.largest))
Note that if I were you, I would remove the as.integer and just store a logical (TRUE/FALSE) vector.
library(data.table)
x <- data.table(x)
y <- x[,list(value = max(value), maxindicator = TRUE), by = c('group')]
z <- merge(x,y, by = c('group','value'), all = TRUE)
Output
> z
group value largest maxindicator
1: 1 0.02703 0 NA
2: 1 0.02827 0 NA
3: 1 0.02874 1 TRUE
4: 2 0.03255 0 NA
5: 2 0.03417 0 NA
6: 2 0.10394 1 TRUE
7: 3 0.13858 0 NA
8: 3 0.16084 0 NA
9: 3 0.24563 0 NA
10: 3 0.99830 1 TRUE
Here is a solution with plyr :
x$largest <- 0
x <- ddply(x, .(group), function(df) {
df$largest[which.max(df$value)] <- 1
df
})
And one with base R :
x$largest <- 0
l <- split(x, x$group)
l <- lapply(l, function(df) {
df$largest[which.max(df$value)] <- 1
df
})
x <- do.call(rbind, l)
Here's a less cool base approach:
FUN <- function(x) {y <- rep(0, length(x)); y[which.max(x)] <- 1; y}
x$largest <- unlist(tapply(x$value, x$group, FUN))
## group value largest
## 1 1 0.02827 0
## 2 1 0.02703 0
## 3 1 0.02874 1
## 4 2 0.03255 0
## 5 2 0.10394 1
## 6 2 0.03417 0
## 7 3 0.13858 0
## 8 3 0.16084 0
## 9 3 0.99830 1
## 10 3 0.24563 0
It was more difficult to do in base than I had anticipated.

How to sum counts across tables that may contain partially different categories in R?

How do I merge (add) contingency tables:
> (t1 <- table(c("a","b","b","c")))
a b c
1 2 1
> (t2 <- table(c("c","d","d","a")))
a c d
1 1 2
I want this:
a b c d
2 2 2 2
You can do it using split and sapply
> T <- c(t1, t2)
> sapply(split(T, names(T)), sum)
a b c d
2 2 2 2
Or directly using tapply as pointed out by #Arun
> tapply(T, names(T), sum)
a b c d
2 2 2 2
Here is what I was able to come up with:
> (t1 <- table(c("a","b","b","c")))
a b c
1 2 1
> (t2 <- table(c("c","d","d","a")))
a c d
1 1 2
> (n <- sort(union(names(t1),names(t2))))
[1] "a" "b" "c" "d"
> (t1 <- t1[n])
a b c <NA>
1 2 1 NA
> names(t1) <- n
> t1
a b c d
1 2 1 NA
> t1[is.na(t1)] <- 0
> t1
a b c d
1 2 1 0
> t2 <- t2[n]
> names(t2) <- n
> t2
a b c d
1 NA 1 2
> t2[is.na(t2)] <- 0
> t2
a b c d
1 0 1 2
> t1+t2
a b c d
2 2 2 2
I think there must be a better way...
This works:
library(plyr)
colSums(rbind.fill(data.frame(t(unclass(t1))), data.frame(t(unclass(t2)))),
na.rm = T)

Resources