Convert list of individuals to occurence of pairs in R - r

I need specific format of data.frame for social structure analysis. How to convert data.frame containing list of individuals occuring together on multiple events:
my.df <- data.frame(individual = c("A","B","C","B","C","D"),
time = rep(c("event_01","event_02"), each = 3))
individual time
1 A event_01
2 B event_01
3 C event_01
4 B event_02
5 C event_02
6 D event_02
into a data.frame containing occurence for each pairs (including [A,A]; [B,B] etc. pairs:
ind_1 ind_2 times
A A 0
A B 1
A C 1
A D 0
B A 1
B B 0
B C 2
B D 1
C A 1
C B 2
C C 0
C D 1
D A 0
D B 1
D C 1
D D 0

In base R, you could do the following:
data.frame(as.table(`diag<-`(tcrossprod(table(my.df)), 0)))
# individual individual.1 Freq
# 1 A A 0
# 2 B A 1
# 3 C A 1
# 4 D A 0
# 5 A B 1
# 6 B B 0
# 7 C B 2
# 8 D B 1
# 9 A C 1
# 10 B C 2
# 11 C C 0
# 12 D C 1
# 13 A D 0
# 14 B D 1
# 15 C D 1
# 16 D D 0
tcrossprod gives you the following:
> tcrossprod(table(my.df))
individual
individual A B C D
A 1 1 1 0
B 1 2 2 1
C 1 2 2 1
D 0 1 1 1
That's essentially all the information you are looking for, but you want it in a slightly different form, without the diagonal values.
We can set the diagonals to zero with:
`diag<-`(theOutputFromAbove, 0)
Then, to get the long form, trick R into thinking that the resulting matrix is a table by using as.table, and make use of the data.frame method for tables.

You can do:
create the first 2 variables of the new data.frame:
df2 <- expand.grid(ind_2=levels(my.df$individual), ind_1=levels(my.df$individual))[, 2:1]
Put the value to 0 for the pairs of same individuals:
df2$times[df2[, 1]==df2[, 2]] <- 0
See the other unique combinations:
comb_diff <- combn(levels(my.df$individual), 2)
compute the times each unique combination is found together:
times_uni <- apply(comb_diff, 2, function(inds){
sum(table(my.df$time[my.df$individual %in% inds])==2)
})
Finally, fill the new data.frame:
df2$times[match(c(paste0(comb_diff[1,], comb_diff[2,]), paste0(comb_diff[2, ], comb_diff[1, ])), paste0(df2[, 1],df2[, 2]))] <- rep(times_uni, 2)
df2
# ind_1 ind_2 times
#1 A A 0
#2 A B 1
#3 A C 1
#4 A D 0
#5 B A 1
#6 B B 0
#7 B C 2
#8 B D 1
#9 C A 1
#10 C B 2
#11 C C 0
#12 C D 1
#13 D A 0
#14 D B 1
#15 D C 1
#16 D D 0

You can do it using data.table
dt_combs <- my.dt[,
list(ind_1 = combn(individual, 2)[1, ],
ind_2 = combn(individual, 2)[2, ]),
by = time]
dt_ncombs <- dt_combs[, .N, by = c("ind_1", "ind_2")]
dt_ncombs_inverted <- copy(dt_ncombs)
dt_ncombs_inverted[, temp := ind_1]
dt_ncombs_inverted[, ind_1 := ind_2]
dt_ncombs_inverted[, ind_2 := temp]
dt_ncombs_inverted[, temp := NULL]
dt_ncombs <- rbind(dt_ncombs, dt_ncombs_inverted)
dt_allcombs <- data.table(expand.grid(
ind_1 = my.dt[, unique(individual)],
ind_2 = my.dt[, unique(individual)]
))
dt_final <- merge(dt_allcombs,
dt_ncombs,
all.x = TRUE,
by = c("ind_1", "ind_2"))
dt_final[is.na(N), N := 0]
dt_final

Related

In R, change the values of some items in a matrix without causing a copy of the entire matrix?

I have a "small" square matrix that I want to add to a "big" matrix. The big matrix contains all the rows and columns of the small matrix plus extras. I want to add the values where the indices are in common and just keep the values from the big one where that index is not contained in the small one. Unfortunately, all the data is copied on the addition so it takes a long time and can temporarily spike memory when the matrices are large.
I have tried adding subsets using matrices and data.frames, as well as a data.table method using rbindlist. Both the data.frame and matrix methods seem to cause a memory copy (why?) and the rbindlist method is not ideal because it requires a melt and dcast and temporarily spiking the memory by spiking the number of rows.
Is there any way to just change the values of some items in a matrix without causing a copy of the entire matrix?
Here are my attempts:
MList <- list(M1,M2)
unionCols <- Reduce(union, lapply(MList, colnames))
MTotal <- matrix(as.double(rep(0,(length(unionCols))^2)), nrow = length(unionCols))
rownames(MTotal) <- colnames(MTotal) <- unionCols
DFTotal <- as.data.frame(MTotal)
DFList <- lapply(MList, as.data.frame)
for(i in 1:length(MList)){
tracemem(MTotal)
tracemem(DFTotal)
mCol <- match(colnames(MList[[i]]), colnames(MTotal))
MTotal[mCol,mCol] <- MTotal[mCol,mCol] + MList[[i]] # this causes a copy
DFTotal[mCol,mCol] <- DFTotal[mCol,mCol] + DFList[[i]] # this causes a copy
}
M1
M2
MTotal
# rbindlist method
.AggDMCMatsSingleM2 <- function(M1, M2){
.MyMelt <- function(M){
DT <- setnames(reshape2::melt(M, id.vars = colnames(M)), c('Var1','Var2'), c('row','col'))
}
M_total <- as.matrix(data.table::dcast(rbindlist(lapply(list(M1,M2), .MyMelt)),
formula = as.formula(row ~ col),
value.var = 'value',
fun.aggregate = sum,
fill = 0),
rownames = 'row')
return(M_total)
}
M1
M2
.AggDMCMatsSingleM2(M1,M2)
If I follow what you are asking we can directly add and write to the big matrix using the bracket notation row/col names of the small matrix:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c(LETTERS[2:4]),
c(letters[2:4])))
# b c d
#B 1 4 7
#C 2 5 8
#D 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
# a b c d e
#A 1 1 1 1 1
#B 1 2 5 8 1
#C 1 3 6 9 1
#D 1 4 7 10 1
#E 1 1 1 1 1
More complex test:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c("A", "D", "C"),
c(letters[c(2:4)])))
# b c d
#A 1 4 7
#D 2 5 8
#C 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
big_matrix
# a b c d e
#A 1 2 5 8 1
#B 1 1 1 1 1
#C 1 4 7 10 1
#D 1 3 6 9 1
#E 1 1 1 1 1

Replacing zero and one in data frame in R with letters

I have a data frame (df) which looks like this:
df$a df$b
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
I would like to have this output:
T C C C C C
A G A G G A
C G C C G G
A T A A A A
Basically if it is zero it takes the letter in column df$a while if it is 1 it takes the letter in column df$b and the zeros and ones are replaced.
Anybody knows how to do this in R?
You can do this with base functionality - note that I read the data in columns 1 and 2 as characters, not factors.
df <- read.table(text = "
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0", header = FALSE, stringsAsFactors = FALSE)
df[, 3:ncol(df)] <- sapply(df[, 3:ncol(df)], function(x) ifelse(x == 1, df[, 2], df[, 1]))
df
#> V1 V2 V3 V4 V5 V6
#> 1 T C C C C C
#> 2 A G A G G A
#> 3 C G C C G G
#> 4 A T A A A A
Created on 2019-01-25 by the reprex package (v0.2.1)
df[, -(1:2)] <- ifelse(df[, -(1:2)] == 1, df[, 2], df[, 1])
Here is a base R solution.
df[-(1:2)] <- t(apply(df, 1, function(x) {
y <- as.numeric(x[-(1:2)])
x[1:2][y + 1]
}))
df
# a b c d e f
#1 T C C C C C
#2 A G A G G A
#3 C G C C G G
#4 A T A A A A
Data.
df <- read.table(text = "
a b c d e f
T C 1 1 1 1
A G 0 1 1 0
C G 0 0 1 1
A T 0 0 0 0
", header = TRUE)

Cross comparison of columns of the same data.frame

I have a data.frame that looks like this:
> DF1
A B C D E
a x c h p
c d q t w
s e r p a
w l t s i
p i y a f
I would like to compare each column of my data.frame with the remaining columns in order to count the number of common elements. For example, I would like to compare column A with all the remaining columns (B, C, D, E) and count the common entities in this way:
A versus the remaining:
A vs B: 0 (because they have 0 common elements)
A vs C: 1 (c in common)
A vs D: 2 (p and s in common)
A vs E: 3 (p,w,a, in common)
Then the same: B versus C,D,E columns and so on.
How can I implement this?
We can loop through the column names and compare with the other columns, by taking the intersect and get the length
sapply(names(DF1), function(x) {
x1 <- lengths(Map(intersect, DF1[setdiff(names(DF1), x)], DF1[x]))
c(x1, setNames(0, setdiff(names(DF1), names(x1))))[names(DF1)]})
# A B C D E
#A 0 0 1 3 3
#B 0 0 0 0 1
#C 1 0 0 1 0
#D 3 0 1 0 2
#E 3 1 0 2 0
Or this can be done more compactly by taking the cross product after getting the frequency of the long formatted (melt) dataset
library(reshape2)
tcrossprod(table(melt(as.matrix(DF1))[-1])) * !diag(5)
# Var2
#Var2 A B C D E
# A 0 0 1 3 3
# B 0 0 0 0 1
# C 1 0 0 1 0
# D 3 0 1 0 2
# E 3 1 0 2 0
NOTE: The crossprod part is also implemented with RcppEigen here which would make this faster
An alternative is to use combn twice, once to get the column combinations and next to find the lengths of the element intersections.
cbind.data.frame returns a data.frame and setNames is used to add column names.
setNames(cbind.data.frame(t(combn(names(df), 2)),
combn(names(df), 2, function(x) length(intersect(df[, x[1]], df[, x[2]])))),
c("col1", "col2", "count"))
col1 col2 count
1 A B 0
2 A C 1
3 A D 3
4 A E 3
5 B C 0
6 B D 0
7 B E 1
8 C D 1
9 C E 0
10 D E 2

Create "contingency" table with multi-rows

Let's consider this dataset, where the first field is a bill number and the second one is the name of a product :
df=data.frame(bill=c(1,1,1,1,2,2,2,2,3,3),product=c("A","B","C","B","A","C","E","D","C","D"))
I would like to count the number of bills containing each combination of two products, for example in this case a result like this (I don't want to keep combinations where count is 0) :
# prod1 prod2 count
# A B 1
# A C 2
# A D 1
# A E 1
# B C 1
# C D 2
# C E 1
# D E 1
I have a solution with loops but it's really not pretty (and slow !):
products=sort(unique(df$product))
bills_list=list()
for (i in 1:length(products)){
bills_list[[i]]=unique(df[which(df$product==products[i]),"bill"])
}
df2=data.frame(prod1=character(0),prod2=character(0),count=numeric(0))
for (i in 1:(length(products)-1)){
for (j in (i+1):length(products)){
Nij=length(intersect(bills_list[[i]],bills_list[[j]]))
if (Nij>0){
temp=data.frame(prod1=products[i],prod2=products[j],count=Nij)
df2=rbind(df2,temp)
}
}
}
Is there a way to do this without loops ?
Thank you for your time.
Here's a solution with plyr and data.table.
# needed packages
require(plyr)
require(data.table)
# find the combinations in each of the bills
combs <- ddply(df, .(bill), function(x){
t(combn(unique(as.character(x$product)),2))
})
colnames(combs) <- c("bill", "prod1", "prod2")
# combine these
res <- data.table(combs, key=c("prod1", "prod2"))[, .N, by=list(prod1, prod2)]
library(reshape2)
df$product <- as.character(df$product)
products <- t(combn(unique(df$product), 2))
dat <- dcast(bill ~ product, data = df)
## bill A B C D E
## 1 1 1 2 1 0 0
## 2 2 1 0 1 1 1
## 3 3 0 0 1 1 0
out <- structure(
data.frame(products, apply(products, 1, function(x) sum(rowSums(dat[x] > 0) == 2) )),
names = c("prod1", "prod2", "count")
)
out[out$count != 0,]
## prod1 prod2 count
## 1 A B 1
## 2 A C 2
## 3 A E 1
## 4 A D 1
## 5 B C 1
## 8 C E 1
## 9 C D 2
## 10 E D 1
Here's another approach:
library(qdap)
dat <- unlist(lapply(split(df$product, df$bill), function(x) {
y <- outer(unique(x), unique(x), paste)
unlist(y[upper.tri(y)])
}))
dat2 <- data.frame(table(dat), stringsAsFactors = FALSE)
colsplit2df(dat2, sep=" ", new.names=paste0("prod", 1:2))
## prod1 prod2 Freq
## 1 A B 1
## 2 A C 2
## 3 A D 1
## 4 A E 1
## 5 B C 1
## 6 C D 2
## 7 C E 1
## 8 E D 1
res <- table(df$bill, df$product)
##> res
##
## A B C D E
## 1 1 2 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
res2 <- ifelse(res > 0, 1, 0)
##> res2
##
## A B C D E
## 1 1 1 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
cor(res2)
##
## A B C D E
##A 1.0 0.5 NA -0.5 0.5
##B 0.5 1.0 NA -1.0 -0.5
##C NA NA 1 NA NA
##D -0.5 -1.0 NA 1.0 0.5
##E 0.5 -0.5 NA 0.5 1.0
##Warning message:
##In cor(res2) : the standard deviation is zero
I do realize that this does not answer the question that you asked.
But, it may get you closer to the answer that, presumably, you seek. Namely, what is the impact of a customer ordering one product on the likelihood (positive or negative) that will order one of the others.

How to sum counts across tables that may contain partially different categories in R?

How do I merge (add) contingency tables:
> (t1 <- table(c("a","b","b","c")))
a b c
1 2 1
> (t2 <- table(c("c","d","d","a")))
a c d
1 1 2
I want this:
a b c d
2 2 2 2
You can do it using split and sapply
> T <- c(t1, t2)
> sapply(split(T, names(T)), sum)
a b c d
2 2 2 2
Or directly using tapply as pointed out by #Arun
> tapply(T, names(T), sum)
a b c d
2 2 2 2
Here is what I was able to come up with:
> (t1 <- table(c("a","b","b","c")))
a b c
1 2 1
> (t2 <- table(c("c","d","d","a")))
a c d
1 1 2
> (n <- sort(union(names(t1),names(t2))))
[1] "a" "b" "c" "d"
> (t1 <- t1[n])
a b c <NA>
1 2 1 NA
> names(t1) <- n
> t1
a b c d
1 2 1 NA
> t1[is.na(t1)] <- 0
> t1
a b c d
1 2 1 0
> t2 <- t2[n]
> names(t2) <- n
> t2
a b c d
1 NA 1 2
> t2[is.na(t2)] <- 0
> t2
a b c d
1 0 1 2
> t1+t2
a b c d
2 2 2 2
I think there must be a better way...
This works:
library(plyr)
colSums(rbind.fill(data.frame(t(unclass(t1))), data.frame(t(unclass(t2)))),
na.rm = T)

Resources