Special occurrency counting in data table - r

(preamble)
I don't know if this is the right place for that...I actually have a problem solving/optimization issue for the counting over a table. So if it's not. very sorry and deserve the minusrating.
Here's the data frame
dat <- data.frame(id=letters[1:5],matrix(c(0,0,1,0,0, 0,1,0,1,1, 0,0,2,1,0, 1,0,2,1,1, 0,0,2,0,0, 0,1,2,1,0),5,6))
#
# id X1 X2 X3 X4 X5 X6
# 1 a 0 0 0 1 0 0
# 2 b 0 1 0 0 0 1
# 3 c 1 0 2 2 2 2
# 4 d 0 1 1 1 0 1
# 5 e 0 1 0 1 0 0
I would like to count along every row, how many times we get to 1 and how many times from 1 we go to 0. so the final results should be
# id N1 N0
# a 1 1
# b 2 1
# c 1 1
# d 2 1
# e 2 2
I actually found an algorithm but it's more C/FORTRAN style (here below) and I can't believe there's not an esaier and more elegant way to get this in R. Thanks a lot for any help or hint.
nr <- nrow(dat)
nc <- ncol(dat)
rownames(dat) <- seq(1,nr,1)
colnames(dat) <- seq(1,nc,1)
dat$N1 <- NULL
dat$N2 <- NULL
for (i in 1:nr) {
n1 <- 0
n0 <- 0
j <- 2
while (!(j>nc)) {
k <- j
if (dat[i,k] == 1) {
n1 <- n1 + 1
k <- j + 1
while (!(k>nc)) {
if (dat[i,k] == 0) {
n0 <- n0 + 1
break
}
k <- k + 1
}
}
j <- k
j <- j + 1
}
dat$N1[i] <- n1
dat$N0[i] <- n0
}

Not sure if I totally got it, but you can try:
cbind(dat["id"],N0=rowSums(dat[,3:7]==1 & dat[,2:6]!=1)+(dat[,2]==1),
N1=rowSums(dat[,3:7]==0 & dat[,2:6]==1))
# id N0 N1
#1 a 1 1
#2 b 2 1
#3 c 1 1
#4 d 2 1
#5 e 2 2

Here's another way, using rle wrapped in data.table syntax:
library(data.table)
setDT(dat)
melt(dat, id="id")[, with(rle(value), list(
n1 = sum(values==1),
n1to0 = sum("10" == do.call(paste0, shift(values, 1:0, fill=0)))
)), by=id]
# id n1 n1to0
# 1: a 1 1
# 2: b 2 1
# 3: c 1 1
# 4: d 2 1
# 5: e 2 2
Notes.
shift with n=1:0 returns the lagged vector (lag of 1) and the vector itself (lag of 0).
melt creates a value column; and rle contains a values vector.

Related

In R, change the values of some items in a matrix without causing a copy of the entire matrix?

I have a "small" square matrix that I want to add to a "big" matrix. The big matrix contains all the rows and columns of the small matrix plus extras. I want to add the values where the indices are in common and just keep the values from the big one where that index is not contained in the small one. Unfortunately, all the data is copied on the addition so it takes a long time and can temporarily spike memory when the matrices are large.
I have tried adding subsets using matrices and data.frames, as well as a data.table method using rbindlist. Both the data.frame and matrix methods seem to cause a memory copy (why?) and the rbindlist method is not ideal because it requires a melt and dcast and temporarily spiking the memory by spiking the number of rows.
Is there any way to just change the values of some items in a matrix without causing a copy of the entire matrix?
Here are my attempts:
MList <- list(M1,M2)
unionCols <- Reduce(union, lapply(MList, colnames))
MTotal <- matrix(as.double(rep(0,(length(unionCols))^2)), nrow = length(unionCols))
rownames(MTotal) <- colnames(MTotal) <- unionCols
DFTotal <- as.data.frame(MTotal)
DFList <- lapply(MList, as.data.frame)
for(i in 1:length(MList)){
tracemem(MTotal)
tracemem(DFTotal)
mCol <- match(colnames(MList[[i]]), colnames(MTotal))
MTotal[mCol,mCol] <- MTotal[mCol,mCol] + MList[[i]] # this causes a copy
DFTotal[mCol,mCol] <- DFTotal[mCol,mCol] + DFList[[i]] # this causes a copy
}
M1
M2
MTotal
# rbindlist method
.AggDMCMatsSingleM2 <- function(M1, M2){
.MyMelt <- function(M){
DT <- setnames(reshape2::melt(M, id.vars = colnames(M)), c('Var1','Var2'), c('row','col'))
}
M_total <- as.matrix(data.table::dcast(rbindlist(lapply(list(M1,M2), .MyMelt)),
formula = as.formula(row ~ col),
value.var = 'value',
fun.aggregate = sum,
fill = 0),
rownames = 'row')
return(M_total)
}
M1
M2
.AggDMCMatsSingleM2(M1,M2)
If I follow what you are asking we can directly add and write to the big matrix using the bracket notation row/col names of the small matrix:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c(LETTERS[2:4]),
c(letters[2:4])))
# b c d
#B 1 4 7
#C 2 5 8
#D 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
# a b c d e
#A 1 1 1 1 1
#B 1 2 5 8 1
#C 1 3 6 9 1
#D 1 4 7 10 1
#E 1 1 1 1 1
More complex test:
big_matrix<-matrix(data=rep(1, 25), nrow=5,
dimnames = list(c(LETTERS[1:5]),
c(letters[1:5])))
# a b c d e
#A 1 1 1 1 1
#B 1 1 1 1 1
#C 1 1 1 1 1
#D 1 1 1 1 1
#E 1 1 1 1 1
small_matrix<-matrix(data=c(1:9), nrow=3,
dimnames = list(c("A", "D", "C"),
c(letters[c(2:4)])))
# b c d
#A 1 4 7
#D 2 5 8
#C 3 6 9
big_matrix[rownames(small_matrix), colnames(small_matrix)] <-
big_matrix[rownames(small_matrix), colnames(small_matrix)] + small_matrix
big_matrix
# a b c d e
#A 1 2 5 8 1
#B 1 1 1 1 1
#C 1 4 7 10 1
#D 1 3 6 9 1
#E 1 1 1 1 1

Sum partially overlapping square matrix / array

I have two square matrix / array like that
## Matrix 1
t1 <- c(2,1,1,1,1,0,1,0,1)
column.names <- c("A","B","C")
row.names <- c("A","B","C")
m1 <- array(t1,dim = c(3,3),dimnames = list(row.names,column.names))
m1
A B C
A 2 1 1
B 1 1 0
C 1 0 1
## Matrix 2
t2 <- c(1,0,0,0,1,1,0,1,1)
column.names <- c("A","B","D")
row.names <- c("A","B","D")
m2 <- array(t2,dim = c(3,3),dimnames = list(row.names,column.names))
m2
A B D
A 1 0 0
B 0 1 1
D 0 1 1
I need to sum up them (each existing column/row pairs) and to keep all possible combinations, like that :
A B C D
A 3 1 1 0
B 1 2 0 1
C 1 0 1 0
D 0 1 0 1
I have to compute this process a lot of times, so I am looking for a fast and lightweight solution.
Any help would be awsome, I am stuck ;)
A base R option using xtabs + expand.grid
as.data.frame.matrix(
xtabs(
p ~ .,
do.call(
rbind,
lapply(
list(m1, m2),
function(x) cbind(expand.grid(dimnames(x)), p = c(x))
)
)
)
)
gives
A B C D
A 3 1 1 0
B 1 2 0 1
C 1 0 1 0
D 0 1 0 1
Another option using igraph
library(igraph)
get.adjacency(
graph_from_data_frame(
do.call(
rbind,
lapply(
list(m1, m2),
function(x) {
get.data.frame(
graph_from_adjacency_matrix(
x,
"undirected"
)
)
}
)
), FALSE
),
sparse = FALSE
)
which gives
A B C D
A 3 1 1 0
B 1 2 0 1
C 1 0 1 0
D 0 1 0 1
Make m1 and m2 of same dimensions by including all the rownames and colnames available in both of them. Replace non-existent value with 0. You can then add both of them together.
cols <- unique(c(colnames(m1), colnames(m2)))
rows <- unique(c(rownames(m1), rownames(m2)))
dummy_m1 <- matrix(0, nrow = length(cols), ncol = length(rows),
dimnames = list(cols, rows))
dummy_m2 <- dummy_m1
dummy_m1[rownames(m1), colnames(m1)] <- m1
dummy_m2[rownames(m2), colnames(m2)] <- m2
dummy_m1 + dummy_m2
# A B C D
#A 3 1 1 0
#B 1 2 0 1
#C 1 0 1 0
#D 0 1 0 1

R: Applying a function to every entry in dataframe

I want to make every element in the dataframe (except fot the ID column) become a 0 if it is any number other than 1.
I have:
ID A B C D E
abc 5 3 1 4 1
def 4 1 3 2 5
I want:
ID A B C D E
abc 0 0 1 0 1
def 0 1 0 0 0
I am having trouble figuring out how to specify for this to be done to do to every entry in every column and row.
Here is my code:
apply(dat.lec, 2 , function(y)
if(!is.na(y)){
if(y==1){y <- 1}
else{y <-0}
}
else {y<- NA}
)
Thank you for your help!
No need for implicit or explicit looping.
# Sample data
set.seed(2016);
df <- as.data.frame(matrix(sample(10, replace = TRUE), nrow = 2));
df <- cbind.data.frame(id = sample(letters, 2), df);
df;
# id V1 V2 V3 V4 V5
#1 k 2 9 5 7 1
#2 g 2 2 2 9 1
# Replace all entries != 1 with 0's
df[, -1][df[, -1] != 1] <- 0;
df;
# id V1 V2 V3 V4 V5
#1 k 0 0 0 0 1
#2 g 0 0 0 0 1

Generate pairwise movement data from sequence

I have a sequence which looks like this
SEQENCE
1 A
2 B
3 B
4 C
5 A
Now from this sequence, I want to get the matrix like this where i the row and jth column element denotes how many times movement occurred from ith row node to jth column node
A B C
A 0 1 0
B 0 1 1
C 1 0 0
How Can I get this in R
1) Use table like this:
s <- DF[, 1]
table(tail(s, -1), head(s, -1))
giving:
A B C
A 0 0 1
B 1 1 0
C 0 1 0
2) or like this. Since embed does not work with factors we convert the factor to character,
s <- as.character(DF[, 1])
do.call(table, data.frame(embed(s, 2)))
giving:
X2
X1 A B C
A 0 0 1
B 1 1 0
C 0 1 0
3) xtabs also works:
s <- as.character(DF[, 1])
xtabs(data = data.frame(embed(s, 2)))
giving:
X2
X1 A B C
A 0 0 1
B 1 1 0
C 0 1 0
Note: The input DF in reproducible form is:
Lines <- " SEQENCE
1 A
2 B
3 B
4 C
5 A"
DF <- read.table(text = Lines, header = TRUE)

Create "contingency" table with multi-rows

Let's consider this dataset, where the first field is a bill number and the second one is the name of a product :
df=data.frame(bill=c(1,1,1,1,2,2,2,2,3,3),product=c("A","B","C","B","A","C","E","D","C","D"))
I would like to count the number of bills containing each combination of two products, for example in this case a result like this (I don't want to keep combinations where count is 0) :
# prod1 prod2 count
# A B 1
# A C 2
# A D 1
# A E 1
# B C 1
# C D 2
# C E 1
# D E 1
I have a solution with loops but it's really not pretty (and slow !):
products=sort(unique(df$product))
bills_list=list()
for (i in 1:length(products)){
bills_list[[i]]=unique(df[which(df$product==products[i]),"bill"])
}
df2=data.frame(prod1=character(0),prod2=character(0),count=numeric(0))
for (i in 1:(length(products)-1)){
for (j in (i+1):length(products)){
Nij=length(intersect(bills_list[[i]],bills_list[[j]]))
if (Nij>0){
temp=data.frame(prod1=products[i],prod2=products[j],count=Nij)
df2=rbind(df2,temp)
}
}
}
Is there a way to do this without loops ?
Thank you for your time.
Here's a solution with plyr and data.table.
# needed packages
require(plyr)
require(data.table)
# find the combinations in each of the bills
combs <- ddply(df, .(bill), function(x){
t(combn(unique(as.character(x$product)),2))
})
colnames(combs) <- c("bill", "prod1", "prod2")
# combine these
res <- data.table(combs, key=c("prod1", "prod2"))[, .N, by=list(prod1, prod2)]
library(reshape2)
df$product <- as.character(df$product)
products <- t(combn(unique(df$product), 2))
dat <- dcast(bill ~ product, data = df)
## bill A B C D E
## 1 1 1 2 1 0 0
## 2 2 1 0 1 1 1
## 3 3 0 0 1 1 0
out <- structure(
data.frame(products, apply(products, 1, function(x) sum(rowSums(dat[x] > 0) == 2) )),
names = c("prod1", "prod2", "count")
)
out[out$count != 0,]
## prod1 prod2 count
## 1 A B 1
## 2 A C 2
## 3 A E 1
## 4 A D 1
## 5 B C 1
## 8 C E 1
## 9 C D 2
## 10 E D 1
Here's another approach:
library(qdap)
dat <- unlist(lapply(split(df$product, df$bill), function(x) {
y <- outer(unique(x), unique(x), paste)
unlist(y[upper.tri(y)])
}))
dat2 <- data.frame(table(dat), stringsAsFactors = FALSE)
colsplit2df(dat2, sep=" ", new.names=paste0("prod", 1:2))
## prod1 prod2 Freq
## 1 A B 1
## 2 A C 2
## 3 A D 1
## 4 A E 1
## 5 B C 1
## 6 C D 2
## 7 C E 1
## 8 E D 1
res <- table(df$bill, df$product)
##> res
##
## A B C D E
## 1 1 2 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
res2 <- ifelse(res > 0, 1, 0)
##> res2
##
## A B C D E
## 1 1 1 1 0 0
## 2 1 0 1 1 1
## 3 0 0 1 1 0
cor(res2)
##
## A B C D E
##A 1.0 0.5 NA -0.5 0.5
##B 0.5 1.0 NA -1.0 -0.5
##C NA NA 1 NA NA
##D -0.5 -1.0 NA 1.0 0.5
##E 0.5 -0.5 NA 0.5 1.0
##Warning message:
##In cor(res2) : the standard deviation is zero
I do realize that this does not answer the question that you asked.
But, it may get you closer to the answer that, presumably, you seek. Namely, what is the impact of a customer ordering one product on the likelihood (positive or negative) that will order one of the others.

Resources