Count how many times a vector/row matches data frame - r

I have a large data frame with "positive" (1) or "negative" (0) data points.
data example
my_data <- data.frame(cell = 1:4, marker_a = c(1, 0, 0, 0),
marker_b = c(0,1,1,1), marker_c = c(0,1,1,0), marker_d = c(0,1,0,1))
cell marker_a marker_b marker_c marker_d
1 1 1 0 0 0
2 2 0 1 1 1
3 3 0 1 1 0
4 4 0 1 0 1
...
I have a different data.frame with all the possible combinations of positive and negative markers any my_data$cell can have
combinations_df <- expand.grid(
marker_a = c(0, 1),
marker_b = c(0, 1),
marker_c = c(0, 1),
marker_d = c(0, 1)
)
marker_a marker_b marker_c marker_d
1 0 0 0 0
2 1 0 0 0
3 0 1 0 0
4 1 1 0 0
5 0 0 1 0
6 1 0 1 0
7 0 1 1 0
8 1 1 1 0
9 0 0 0 1
10 1 0 0 1
11 0 1 0 1
12 1 1 0 1
13 0 0 1 1
14 1 0 1 1
15 0 1 1 1
16 1 1 1 1
How can I get a data.frame where each row/combination is matched vs every row of my_data and return the final count for each combination
Example of expected output:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
1 14969 15223 15300 14779 14844 16049 15374 15648 15045 15517 15116 15405 14990 15347 14432 15569

I'm guessing the data.table way is fairly efficient:
library(data.table)
setDT(my_data)
my_data[ combinations_df, on = names(combinations_df), .N, by = .EACHI ]
marker_a marker_b marker_c marker_d N
1: 0 0 0 0 0
2: 1 0 0 0 1
3: 0 1 0 0 0
4: 1 1 0 0 0
5: 0 0 1 0 0
6: 1 0 1 0 0
7: 0 1 1 0 1
8: 1 1 1 0 0
9: 0 0 0 1 0
10: 1 0 0 1 0
11: 0 1 0 1 1
12: 1 1 0 1 0
13: 0 0 1 1 0
14: 1 0 1 1 0
15: 0 1 1 1 1
16: 1 1 1 1 0
If you only care about combinations that show up in the data, "chain" a filtering command:
my_data[ combinations_df, on = names(combinations_df), .N, by = .EACHI ][ N > 0 ]
marker_a marker_b marker_c marker_d N
1: 1 0 0 0 1
2: 0 1 1 0 1
3: 0 1 0 1 1
4: 0 1 1 1 1
Alternately, in this case you don't even need combinations_df...
my_data[, .N, by = marker_a:marker_d ]
marker_a marker_b marker_c marker_d N
1: 1 0 0 0 1
2: 0 1 1 1 1
3: 0 1 1 0 1
4: 0 1 0 1 1

You are writing your combinations in "binary", so no need of any join, but just little math. Try this:
setNames(tabulate(as.matrix(my_data[,2:5])%*%2^(0:3)+1,16),1:16)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0

Perhaps you may need
setNames(sapply(do.call(paste0, combinations_df ),
function(x) sum(do.call(paste0, my_data[-1])==x)), 1:nrow(combinations_df ))

Related

Count occurences of teams in matrix in R

Have a 1000*16 matrix from a simulation with team names as characters. I want to count number of occurrences per team in all 16 columns.
I know I could do apply(test, 2, table) but that makes the data hard to work with afterward since all teams is not included in every column.
If you have a vector that is all the unique team names you could do something like this. I'm counting occurrences here via column to ensure that not every team (in this case letter) is not included.
set.seed(15)
letter_mat <- matrix(
sample(
LETTERS,
size = 1000*16,
replace = TRUE
),
ncol = 16,
nrow = 1000
)
output <- t(
apply(
letter_mat,
1,
function(x) table(factor(x, levels = LETTERS))
)
)
head(output)
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
[1,] 1 2 0 1 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1
[2,] 0 1 0 2 2 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 2 2 1
[3,] 1 1 0 0 1 0 1 2 1 0 0 0 0 0 1 0 1 0 1 1 0 0 3 0 1 1
[4,] 0 1 0 0 0 1 0 0 0 2 0 1 0 0 1 1 1 1 2 0 2 3 0 0 0 0
[5,] 2 1 0 0 0 0 0 2 0 2 1 1 1 0 0 2 0 2 1 0 0 1 0 0 0 0
[6,] 0 0 0 0 0 1 3 1 0 0 0 0 1 1 3 0 1 0 0 1 0 0 0 1 0 3

create a loop to get samples in grouped data which meet a condition

I have a dataframe where data are grouped by ID. I need to know how many cells are the 10% of each group in order to select this number in a sample, but this sample should select the cells which EP is 1.
I've tried to do a nested For loop: one For to know the quantity of cells which are the 10% for each group and the bigger one to sample this number meeting the condition EP==1
x <- data.frame("ID"=rep(1:2, each=10),"EP" = rep(0:1, times=10))
x
ID EP
1 1 0
2 1 1
3 1 0
4 1 1
5 1 0
6 1 1
7 1 0
8 1 1
9 1 0
10 1 1
11 2 0
12 2 1
13 2 0
14 2 1
15 2 0
16 2 1
17 2 0
18 2 1
19 2 0
20 2 1
for(j in 1:1000){
for (i in 1:nrow(x)){
d <- x[x$ID==i,]
npix <- 10*nrow(d)/100
}
r <- sample(d[d$EP==1,],npix)
print(r)
}
data frame with 0 columns and 0 rows
data frame with 0 columns and 0 rows
data frame with 0 columns and 0 rows
.
.
.
until 1000
I would want to get this dataframe, where each sample is in a new column in x, and the cell sampled has "1":
ID EP s1 s2....s1000
1 1 0 0 0 ....
2 1 1 0 1
3 1 0 0 0
4 1 1 0 0
5 1 0 0 0
6 1 1 0 0
7 1 0 0 0
8 1 1 0 0
9 1 0 0 0
10 1 1 1 0
11 2 0 0 0
12 2 1 0 0
13 2 0 0 0
14 2 1 0 1
15 2 0 0 0
16 2 1 0 0
17 2 0 0 0
18 2 1 1 0
19 2 0 0 0
20 2 1 0 0
see that each 1 in S1 and s2 are the sampled cells and correspond to 10% of cells in each group (1, 2) which meet the condition EP==1
you can try
set.seed(1231)
x <- data.frame("ID"=rep(1:2, each=10),"EP" = rep(0:1, times=10))
library(tidyverse)
x %>%
group_by(ID) %>%
mutate(index= ifelse(EP==1, 1:n(),0)) %>%
mutate(s1 = ifelse(index %in% sample(index[index!=0], n()*0.1), 1, 0)) %>%
mutate(s2 = ifelse(index %in% sample(index[index!=0], n()*0.1), 1, 0))
# A tibble: 20 x 5
# Groups: ID [2]
ID EP index s1 s2
<int> <int> <dbl> <dbl> <dbl>
1 1 0 0 0 0
2 1 1 2 0 0
3 1 0 0 0 0
4 1 1 4 0 0
5 1 0 0 0 0
6 1 1 6 1 1
7 1 0 0 0 0
8 1 1 8 0 0
9 1 0 0 0 0
10 1 1 10 0 0
11 2 0 0 0 0
12 2 1 2 0 0
13 2 0 0 0 0
14 2 1 4 0 1
15 2 0 0 0 0
16 2 1 6 0 0
17 2 0 0 0 0
18 2 1 8 0 0
19 2 0 0 0 0
20 2 1 10 1 0
We can write a function which gives us 1's which are 10% for each ID and place it where EP = 1.
library(dplyr)
rep_func <- function() {
x %>%
group_by(ID) %>%
mutate(s1 = 0,
s1 = replace(s1, sample(which(EP == 1), floor(0.1 * n())), 1)) %>%
pull(s1)
}
then use replicate to repeat it for n times
n <- 5
x[paste0("s", seq_len(n))] <- replicate(n, rep_func())
x
# ID EP s1 s2 s3 s4 s5
#1 1 0 0 0 0 0 0
#2 1 1 0 0 0 0 0
#3 1 0 0 0 0 0 0
#4 1 1 0 0 0 0 0
#5 1 0 0 0 0 0 0
#6 1 1 1 0 0 1 0
#7 1 0 0 0 0 0 0
#8 1 1 0 1 0 0 0
#9 1 0 0 0 0 0 0
#10 1 1 0 0 1 0 1
#11 2 0 0 0 0 0 0
#12 2 1 0 0 1 0 0
#13 2 0 0 0 0 0 0
#14 2 1 1 1 0 0 0
#15 2 0 0 0 0 0 0
#16 2 1 0 0 0 0 1
#17 2 0 0 0 0 0 0
#18 2 1 0 0 0 1 0
#19 2 0 0 0 0 0 0
#20 2 1 0 0 0 0 0

Sequence of two numbers with decreasing occurrence of one of them

I would like to create a sequence from two numbers, such that the occurrence of one of the numbers decreases (from n_1 to 1) while for the other number the occurrences are fixed at n_2.
I've been looking around for and tried using seq and rep to do it but I can't seem to figure it out.
Here is an example for c(0,1) and n_1=5, n_2=3:
0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,1,1,0,1,1,1
And here for c(0,1) and n_1=2, n_2=1:
0,0,1,0,1
Maybe something like this?
rep(rep(c(0, 1), n_1), times = rbind(n_1:1, n_2))
## [1] 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1
Here it is as a function (without any sanity checks):
myfun <- function(vec, n1, n2) rep(rep(vec, n1), times = rbind(n1:1, n2))
myfun(c(0, 1), 2, 1)
## [1] 0 0 1 0 1
inverse.rle
Another alternative is to use inverse.rle:
y <- list(lengths = rbind(n_1:1, n_2),
values = rep(c(0, 1), n_1))
inverse.rle(y)
## [1] 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1
An alternative (albeit slower) method using a similar concept:
unlist(mapply(rep,c(0,1),times=rbind(n_1:1,n_2)))
###[1] 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1
Here is another approach using upper-triangle of a matrix:
f_rep <- function(num1, n_1, num2, n_2){
m <- matrix(rep(c(num1, num2), times=c(n_1+1, n_2)), n_1+n_2+1, n_1+n_2+1, byrow = T)
t(m)[lower.tri(m,diag=FALSE)][1:sum((n_1:1)+n_2)]
}
f_rep(0, 5, 1, 3)
#[1] 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1
f_rep(2, 4, 3, 3)
#[1] 2 2 2 2 3 3 3 2 2 2 3 3 3 2 2 3 3 3 2 3 3 3
myf = function(x, n){
rep(rep(x,n[1]), unlist(lapply(0:(n[1]-1), function(i) n - c(i,0))))
}
myf(c(0,1), c(5,3))
#[1] 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 1 1 1 0 1 1 1

adding data frame of counts to template data frame in R

I have data.frames of counts such as:
a <- data.frame(id=1:10,
"1"=c(rep(1,3),rep(0,7)),
"3"=c(rep(0,4),rep(1,6)))
names(a)[2:3] <- c("1","3")
a
> a
id 1 3
1 1 1 0
2 2 1 0
3 3 1 0
4 4 0 0
5 5 0 1
6 6 0 1
7 7 0 1
8 8 0 1
9 9 0 1
10 10 0 1
and a template data.frame such as
m <- data.frame(id=1:10,
"1"= rep(0,10),
"2"= rep(0,10),
"3"= rep(0,10),
"4"= rep(0,10))
names(m)[-1] <- 1:4
m
> m
id 1 2 3 4
1 1 0 0 0 0
2 2 0 0 0 0
3 3 0 0 0 0
4 4 0 0 0 0
5 5 0 0 0 0
6 6 0 0 0 0
7 7 0 0 0 0
8 8 0 0 0 0
9 9 0 0 0 0
10 10 0 0 0 0
and I want to add the values of a into the template m
in the appropraite columns, leaving the rest as 0.
This is working but I would like to know
if there is a more elegant way, perhaps using plyr or data.table:
provi <- rbind.fill(a,m)
provi[is.na(provi)] <- 0
mnew <- aggregate(provi[,-1],by=list(provi$id),FUN=sum)
names(mnew)[1] <- "id"
mnew <- mnew[c(1,order(names(mnew)[-1])+1)]
mnew
> mnew
id 1 2 3 4
1 1 1 0 0 0
2 2 1 0 0 0
3 3 1 0 0 0
4 4 0 0 0 0
5 5 0 0 1 0
6 6 0 0 1 0
7 7 0 0 1 0
8 8 0 0 1 0
9 9 0 0 1 0
10 10 0 0 1 0
I guess the concise option would be:
m[names(a)] <- a
Or we match the column names ('i1'), use that to create the column index with max.col, cbind with the row index ('i2'), and a similar step can be done to create 'i3'. We change the values in 'm' corresponding to 'i2' with the 'a' values based on 'i3'.
i1 <- match(names(a)[-1], names(m)[-1])
i2 <- cbind(m$id, i1[max.col(a[-1], 'first')]+1L)
i3 <- cbind(a$id, max.col(a[-1], 'first')+1L)
m[i2] <- a[i3]
m
# id 1 2 3 4
#1 1 1 0 0 0
#2 2 1 0 0 0
#3 3 1 0 0 0
#4 4 0 0 0 0
#5 5 0 0 1 0
#6 6 0 0 1 0
#7 7 0 0 1 0
#8 8 0 0 1 0
#9 9 0 0 1 0
#10 10 0 0 1 0
A data.table option would be melt/dcast
library(data.table)
dcast(melt(setDT(a), id.var='id')[,
variable:= factor(variable, levels=1:4)],
id~variable, value.var='value', drop=FALSE, fill=0)
# id 1 2 3 4
# 1: 1 1 0 0 0
# 2: 2 1 0 0 0
# 3: 3 1 0 0 0
# 4: 4 0 0 0 0
# 5: 5 0 0 1 0
# 6: 6 0 0 1 0
# 7: 7 0 0 1 0
# 8: 8 0 0 1 0
# 9: 9 0 0 1 0
#10: 10 0 0 1 0
A similar dplyr/tidyr option would be
library(dplyr)
library(tidyr)
gather(a, Var, Val, -id) %>%
mutate(Var=factor(Var, levels=1:4)) %>%
spread(Var, Val, drop=FALSE, fill=0)
You could use merge, too:
res <- suppressWarnings(merge(a, m, by="id", suffixes = c("", "")))
(res[, which(!duplicated(names(res)))][, names(m)])
# id 1 2 3 4
# 1 1 1 0 0 0
# 2 2 1 0 0 0
# 3 3 1 0 0 0
# 4 4 0 0 0 0
# 5 5 0 0 1 0
# 6 6 0 0 1 0
# 7 7 0 0 1 0
# 8 8 0 0 1 0
# 9 9 0 0 1 0
# 10 10 0 0 1 0

merge one data frame by row with another data frame as a template

I want to merge each row of the data.frame my.samples to another data.frame my.template to obtain the desired.result.
The template my.template could be created with expand.grid. So, even though this is a minimal example the output data set desired.result is still large.
I have posted below several attempts that did not work and one attempt that does work. However, the code that works seems overly complex.
Thank you for any advice. I prefer base R. There are numerous other posts about merging data frames. I looked at quite a few, but did not see this scenario addressed. Sorry if I overlooked it.
my.samples <- read.table(text = '
obs X1 X2 X3 z
1 2 1 0 1
2 0 0 0 1
3 0 1 2 1
', header = TRUE)
my.template <- read.table(text = '
X1 X2 X3
0 0 0
0 0 1
0 0 2
0 1 0
0 1 1
0 1 2
0 2 0
0 2 1
0 2 2
1 0 0
1 0 1
1 0 2
1 1 0
1 1 1
1 1 2
1 2 0
1 2 1
1 2 2
2 0 0
2 0 1
2 0 2
2 1 0
2 1 1
2 1 2
2 2 0
2 2 1
2 2 2
', header = TRUE)
desired.result <- read.table(text = '
obs X1 X2 X3 z
1 0 0 0 0
1 0 0 1 0
1 0 0 2 0
1 0 1 0 0
1 0 1 1 0
1 0 1 2 0
1 0 2 0 0
1 0 2 1 0
1 0 2 2 0
1 1 0 0 0
1 1 0 1 0
1 1 0 2 0
1 1 1 0 0
1 1 1 1 0
1 1 1 2 0
1 1 2 0 0
1 1 2 1 0
1 1 2 2 0
1 2 0 0 0
1 2 0 1 0
1 2 0 2 0
1 2 1 0 1
1 2 1 1 0
1 2 1 2 0
1 2 2 0 0
1 2 2 1 0
1 2 2 2 0
2 0 0 0 1
2 0 0 1 0
2 0 0 2 0
2 0 1 0 0
2 0 1 1 0
2 0 1 2 0
2 0 2 0 0
2 0 2 1 0
2 0 2 2 0
2 1 0 0 0
2 1 0 1 0
2 1 0 2 0
2 1 1 0 0
2 1 1 1 0
2 1 1 2 0
2 1 2 0 0
2 1 2 1 0
2 1 2 2 0
2 2 0 0 0
2 2 0 1 0
2 2 0 2 0
2 2 1 0 0
2 2 1 1 0
2 2 1 2 0
2 2 2 0 0
2 2 2 1 0
2 2 2 2 0
3 0 0 0 0
3 0 0 1 0
3 0 0 2 0
3 0 1 0 0
3 0 1 1 0
3 0 1 2 1
3 0 2 0 0
3 0 2 1 0
3 0 2 2 0
3 1 0 0 0
3 1 0 1 0
3 1 0 2 0
3 1 1 0 0
3 1 1 1 0
3 1 1 2 0
3 1 2 0 0
3 1 2 1 0
3 1 2 2 0
3 2 0 0 0
3 2 0 1 0
3 2 0 2 0
3 2 1 0 0
3 2 1 1 0
3 2 1 2 0
3 2 2 0 0
3 2 2 1 0
3 2 2 2 0
', header = TRUE)
# this works for one obs at a time
merge(my.samples[1,], my.template, by=c('X1', 'X2', 'X3'), all=TRUE)
# this does not work
apply(my.samples, 1, function(x) merge(x, my.template, by=c('X1', 'X2', 'X3'), all=TRUE))
# this does not work
my.output <- matrix(0, nrow=(3^3 * max(my.samples$obs)), ncol=5)
for(i in 1:max(desired.result$obs)) {
x <- merge(my.samples[i,], my.template, by=c('X1', 'X2', 'X3'), all=TRUE)
my.output[((i-1) * 3^3 +1) : ((i-1) * 3^3 + 3^3), 1:5] <- x
}
# this works
for(i in 1:max(desired.result$obs)) {
x <- merge(my.samples[i,], my.template, by=c('X1', 'X2', 'X3'), all=TRUE)
x$obs <- i
x$z[is.na(x$z)] <- 0
if(i == 1) {my.output = x}
if(i > 1) {my.output = rbind(my.output, x)}
}
my.output
all.equal(my.output[1:3], desired.result[,2:4])
I believe this should work
#expand template
full<-do.call(rbind, lapply(unique(my.samples$obs),
function(x) cbind(obs=x, my.template)))
#merge
result<-merge(full, my.samples, all.x=T)
#change NA's to 0
result$z[is.na(result$z)]<-0
#> all(result==desired.result)
#[1] TRUE
I like the answer posted by #MrFlick but when I added another column to my.samples I discovered that I had to modify the code. Below is what I came up with.
my.samples <- read.table(text = '
obs X1 X2 X3 z aa
1 2 1 0 1 20
2 0 0 0 1 -10
3 0 1 2 1 10
', header = TRUE)
my.template <- read.table(text = '
X1 X2 X3
0 0 0
0 0 1
0 0 2
0 1 0
0 1 1
0 1 2
0 2 0
0 2 1
0 2 2
1 0 0
1 0 1
1 0 2
1 1 0
1 1 1
1 1 2
1 2 0
1 2 1
1 2 2
2 0 0
2 0 1
2 0 2
2 1 0
2 1 1
2 1 2
2 2 0
2 2 1
2 2 2
', header = TRUE)
obs.aa <- my.samples[, c(1, ncol(my.samples))]
my.template2 <- merge(my.template, obs.aa)
my.template3 <- merge(my.template2, my.samples, by=c('obs', 'aa', paste0('X', 1:(ncol(my.samples)-3))), all = TRUE)
my.template3$z[is.na(my.template3$z)] <- 0
my.template3

Resources