Sum pairs of columns by group - r

I wish to sum pairs of columns by group. In the example below I wish to sum pairs (v1 and v2), (v3 and v4), and (v5 and v6), each by r1, r2 and r3.
I can do this using the sapply statement below and I get the correct answer. However, the required code is complex. Could someone show me how to do the same operation perhaps in package data.table or with rollapply and/or other options? I have not yet explored those options.
Sorry if this is a duplicate.
my.data <- read.table(text= "
r1 r2 r3 t1 t2 t3 v1 v2 v3 v4 v5 v6
1 0 0 10 20 30 1 0 0 0 0 0
1 0 0 10 20 30 1 1 0 0 0 0
1 0 0 10 20 30 1 0 1 0 0 0
1 0 0 10 20 30 1 0 1 1 0 0
1 0 0 10 20 30 0 0 0 0 0 0
0 1 0 10 20 30 0 1 1 1 1 1
0 1 0 10 20 30 0 0 1 1 1 1
0 1 0 10 20 30 0 0 0 1 1 1
0 1 0 10 20 30 0 0 0 0 1 1
0 1 0 10 20 30 0 0 0 0 0 1
0 0 1 10 20 30 1 1 1 1 1 1
0 0 1 10 20 30 1 0 1 1 1 1
0 0 1 10 20 30 1 0 0 1 1 1
0 0 1 10 20 30 1 0 0 0 1 1
0 0 1 10 20 30 1 0 0 0 0 1
", header=TRUE, na.strings=NA)
my.data$my.group <- which(my.data[,1:3]==1, arr.ind=TRUE)[,2]
my.data
my.sums <- t(sapply(split(my.data[,7:(ncol(my.data)-1)], my.data$my.group), function(i) sapply(seq(2, ncol(i), 2), function(j) sum(i[,c((j-1),j)], na.rm=TRUE))))
my.sums
# [,1] [,2] [,3]
# 1 5 3 0
# 2 1 5 9
# 3 6 5 9

Here's a pretty general expression that you can probably simplify if you want it to match your specific data dimensions/column names/etc:
library(data.table)
dt = data.table(my.data)
dt[, lapply(1:(ncol(.SD)/2), function(x) sum(.SD[[2*x-1]], .SD[[2*x]])),
by = eval(grep('^r', names(dt), value = TRUE)),
.SDcols = grep('^v', names(dt), value = TRUE)]
# r1 r2 r3 V1 V2 V3
#1: 1 0 0 5 3 0
#2: 0 1 0 1 5 9
#3: 0 0 1 6 5 9

Also, using aggregate and mapply:
DF <- my.data
#function to sum 2 columns
# Sum a pair of columns of the global data frame `DF` within each
# (r1, r2, r3) group.
#
# col1, col2: selectors for the two columns of `DF` to add up (the example
#             passes column positions).
# Returns one total per group: the group-wise sums of both columns combined.
fun <- function(col1, col2) {
  grouping <- list(DF$r1, DF$r2, DF$r3)
  group_sums <- aggregate(DF[c(col1, col2)], by = grouping, FUN = sum)
  # The three grouping columns come first in the aggregate() result, so the
  # two summed value columns are at positions 4 and 5.
  rowSums(group_sums[c(4, 5)])
}
# All pairs of columns to be summed, one pair per row of the matrix.
# The value columns are located by name (v1, v2, ...) instead of the
# hard-coded range 7:ncol(DF), so that extra trailing columns such as the
# `my.group` column added to my.data in the question are not paired up by
# mistake (an odd number of indices would be recycled with a warning).
v_cols <- grep("^v[0-9]+$", names(DF))
args_mat <- matrix(v_cols, ncol = 2, byrow = TRUE)
# Apply `fun` to every pair: one result column per pair, one row per group.
mapply(fun, args_mat[, 1], args_mat[, 2])
# [,1] [,2] [,3]
#[1,] 5 3 0
#[2,] 1 5 9
#[3,] 6 5 9

Related

Count number of pairs across elements in a list in R?

Similar questions have been asked about counting pairs, however none seem to be specifically useful for what I'm trying to do.
What I want is to count the number of pairs across multiple list elements and turn it into a matrix. For example, if I have a list like so:
myList <- list(
a = c(2,4,6),
b = c(1,2,3,4),
c = c(1,2,5,7),
d = c(1,2,4,5,8)
)
We can see that the pair 1:2 appears 3 times (once each in a, b, and c). The pair 1:3 appears only once in b. The pair 1:4 appears 2 times (once each in b and d)... etc.
I would like to count the number of times a pair appears and then turn it into a symmetrical matrix. For example, my desired output would look something like the matrix I created manually (where each element of the matrix is the total count for that pair of values):
> myMatrix
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
[1,] 0 3 1 2 2 0 1 1
[2,] 3 0 1 3 2 1 1 1
[3,] 1 1 0 1 0 0 0 0
[4,] 2 3 1 0 0 0 0 1
[5,] 2 2 0 0 0 0 1 1
[6,] 0 1 0 0 0 0 0 0
[7,] 1 1 0 0 1 0 0 0
[8,] 1 1 0 1 1 0 0 0
Any suggestions are greatly appreciated
Inspired by @akrun's answer, I think you can use a cross product to get this very quickly and simply:
out <- tcrossprod(table(stack(myList)))
diag(out) <- 0
# values
#values 1 2 3 4 5 6 7 8
# 1 0 3 1 2 2 0 1 1
# 2 3 0 1 3 2 1 1 1
# 3 1 1 0 1 0 0 0 0
# 4 2 3 1 0 1 1 0 1
# 5 2 2 0 1 0 0 1 1
# 6 0 1 0 1 0 0 0 0
# 7 1 1 0 0 1 0 0 0
# 8 1 1 0 1 1 0 0 0
Original answer:
Use combn to get the combinations, as well as reversing each combination.
Then convert to a data.frame and table the results.
tab <- lapply(myList, \(x) combn(x, m=2, FUN=\(cm) rbind(cm, rev(cm)), simplify=FALSE))
tab <- data.frame(do.call(rbind, unlist(tab, rec=FALSE)))
table(tab)
# X2
#X1 1 2 3 4 5 6 7 8
# 1 0 3 1 2 2 0 1 1
# 2 3 0 1 3 2 1 1 1
# 3 1 1 0 1 0 0 0 0
# 4 2 3 1 0 1 1 0 1
# 5 2 2 0 1 0 0 1 1
# 6 0 1 0 1 0 0 0 0
# 7 1 1 0 0 1 0 0 0
# 8 1 1 0 1 1 0 0 0
We could loop over the list, get the pairwise combinations with combn, stack them into a two-column dataset, convert the 'values' column to a factor with levels specified as 1 to 8, get the frequency count (table), do a cross product (crossprod), convert the output to logical, then Reduce the list elements by adding them elementwise, and finally set the diagonal elements to 0. (If needed, set the names attribute of the dimnames to NULL.)
out <- Reduce(`+`, lapply(myList, function(x)
crossprod(table(transform(stack(setNames(
combn(x,
2, simplify = FALSE), combn(x, 2, paste, collapse="_"))),
values = factor(values, levels = 1:8))[2:1]))> 0))
diag(out) <- 0
names(dimnames(out)) <- NULL
-output
> out
1 2 3 4 5 6 7 8
1 0 3 1 2 2 0 1 1
2 3 0 1 3 2 1 1 1
3 1 1 0 1 0 0 0 0
4 2 3 1 0 1 1 0 1
5 2 2 0 1 0 0 1 1
6 0 1 0 1 0 0 0 0
7 1 1 0 0 1 0 0 0
8 1 1 0 1 1 0 0 0
I thought of a solution based on @TarJae's answer. It is not an elegant one, but it was a fun challenge!
Libraries
library(tidyverse)
Code
map_df(myList,function(x) as_tibble(t(combn(x,2)))) %>%
count(V1,V2) %>%
{. -> temp_df} %>%
bind_rows(
temp_df %>%
rename(V2 = V1, V1 = V2)
) %>%
full_join(
expand_grid(V1 = 1:8,V2 = 1:8)
) %>%
replace_na(replace = list(n = 0)) %>%
arrange(V2,V1) %>%
pivot_wider(names_from = V1,values_from = n) %>%
as.matrix()
Output
V2 1 2 3 4 5 6 7 8
[1,] 1 0 3 1 2 2 0 1 1
[2,] 2 3 0 1 3 2 1 1 1
[3,] 3 1 1 0 1 0 0 0 0
[4,] 4 2 3 1 0 1 1 0 1
[5,] 5 2 2 0 1 0 0 1 1
[6,] 6 0 1 0 1 0 0 0 0
[7,] 7 1 1 0 0 1 0 0 0
[8,] 8 1 1 0 1 1 0 0 0
First, turn the possible pairwise combinations of each vector in the list into a tibble; then bind these into one tibble and count the combinations.
library(tidyverse)
a <- as_tibble(t(combn(myList[[1]],2)))
b <- as_tibble(t(combn(myList[[2]],2)))
c <- as_tibble(t(combn(myList[[3]],2)))
d <- as_tibble(t(combn(myList[[4]],2)))
bind_rows(a,b,c,d) %>%
count(V1, V2)
V1 V2 n
<dbl> <dbl> <int>
1 1 2 3
2 1 3 1
3 1 4 2
4 1 5 2
5 1 7 1
6 1 8 1
7 2 3 1
8 2 4 3
9 2 5 2
10 2 6 1
11 2 7 1
12 2 8 1
13 3 4 1
14 4 5 1
15 4 6 1
16 4 8 1
17 5 7 1
18 5 8 1

Create new column when values repeat 3 or more times

Problem
I'm trying to create a new column (b) based on values from a previous column (a). Column a is binary, consisting of either 0's or 1's. If there are three or more 1's in a row in column a, then keep them in column b. I'm close to the desired output, but when there are two 1's in a row, the ifelse grabs the second value because it's meeting the first condition.
Desired Output–Column b
df <- data.frame(a = c(1,1,1,0,0,1,0,1,1,0,1,1,1,0,1,1,0,1,1,1,1),
b = c(1,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,1,1))
df
a b
1 1 1
2 1 1
3 1 1
4 0 0
5 0 0
6 1 0
7 0 0
8 1 0 #
9 1 0 #
10 0 0
11 1 1
12 1 1
13 1 1
14 0 0
15 1 0 #
16 1 0 #
17 0 0
18 1 1
19 1 1
20 1 1
21 1 1
Failed Attempt...s
require(dplyr)
df_fail <- df %>% mutate(b=ifelse((lag(df$a) + df$a) > 1 |(df$a + lead(df$a) + lead(df$a,2)) >= 3, df$a,NA))
df_fail
a b
1 1 1
2 1 1
3 1 1
4 0 0
5 0 0
6 1 0
7 0 0
8 1 0
9 1 1 # should be 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 0
15 1 0
16 1 1 # should be 0
17 0 0
18 1 1
19 1 1
20 1 1
21 1 1
We can use rle from base R to change the elements that have less than 3 repeating 1s to 0
inverse.rle(within.list(rle(df$a), values[values == 1 & lengths <3] <- 0))
#[1] 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1
Or use rleid from data.table
library(data.table)
library(dplyr)
df %>%
group_by(grp = rleid(a)) %>%
mutate(b1 = if(n() <3 & all(a == 1)) 0 else a) %>%
ungroup %>%
select(-grp)

Flag observations before and after a specific value in another column

Say I have a df:
df <- data.frame(flag = c(rep(0, 20)),
include = c(rep(1, 20)))
df[c(4,8,16), ]$flag <- 1
df
flag include
1 0 1
2 0 1
3 0 1
4 1 1
5 0 1
6 0 1
7 0 1
8 1 1
9 0 1
10 0 1
11 0 1
12 0 1
13 0 1
14 0 1
15 0 1
16 1 1
17 0 1
18 0 1
19 0 1
20 0 1
What I wish to do is change the include flag to 0 if the row is within +/- two rows of a row where flag == 1. The result would look like:
flag include
1 0 1
2 0 0
3 0 0
4 1 1
5 0 0
6 0 0
7 0 0
8 1 1
9 0 0
10 0 0
11 0 1
12 0 1
13 0 1
14 0 0
15 0 0
16 1 1
17 0 0
18 0 0
19 0 1
20 0 1
I've thought of some 'innovative' (read: inefficient and over complicated) ways to do it but was thinking there must be a simple way I'm overlooking.
Would be nice if the answer was such that I could generalize this to +/- n rows, since I have a lot more data and would be looking to potentially search within +/- 10 rows...
Another option with data.table:
library(data.table)
n = 2
# find the row number where flag is one
flag_one = which(df$flag == 1)
# find the index where include needs to be updated
idx = setdiff(outer(flag_one, -n:n, "+"), flag_one)
# update include in place
setDT(df)[idx[idx >= 1 & idx <= nrow(df)], include := 0][]
# or as #Frank commented the last step with base R would be
# df$include[idx[idx >= 1 & idx <= nrow(df)]] = 0
# flag include
# 1: 0 1
# 2: 0 0
# 3: 0 0
# 4: 1 1
# 5: 0 0
# 6: 0 0
# 7: 0 0
# 8: 1 1
# 9: 0 0
#10: 0 0
#11: 0 1
#12: 0 1
#13: 0 1
#14: 0 0
#15: 0 0
#16: 1 1
#17: 0 0
#18: 0 0
#19: 0 1
#20: 0 1
Put in a function:
# Set `include` to 0 for every row lying within +/- n rows of a row whose
# `flag` equals 1; the flagged rows themselves are left untouched.
#
# df: data frame with columns `flag` and `include`.
# n:  number of rows on each side of a flagged row to update.
# Returns the modified copy of `df`.
update_n <- function(df, n) {
  flagged_rows <- which(df$flag == 1)
  # Every flagged row shifted by each offset in -n..n, minus the flagged rows.
  neighbours <- setdiff(outer(flagged_rows, (-n):n, "+"), flagged_rows)
  # Discard positions that fall before the first or after the last row.
  in_range <- neighbours >= 1 & neighbours <= nrow(df)
  df$include[neighbours[in_range]] <- 0
  df
}
There must be another simpler way but the first way which I could think of is using sapply and which
df$include[sapply(which(df$flag == 1) , function(x) c(x-2, x-1, x+1, x+2))] <- 0
df
# flag include
#1 0 1
#2 0 0
#3 0 0
#4 1 1
#5 0 0
#6 0 0
#7 0 0
#8 1 1
#9 0 0
#10 0 0
#11 0 1
#12 0 1
#13 0 1
#14 0 0
#15 0 0
#16 1 1
#17 0 0
#18 0 0
#19 0 1
#20 0 1
We first find out all the indices where flag is 1 and then create the required sequence of numbers around each of it and turn that index of include to 0.
For variable n we can do
n = 2
df$include[sapply(which(df$flag == 1),function(x) setdiff(seq(x-n, x+n),x))] <- 0
replace(x = df$include,
list = sapply(1:NROW(df), function(i)
any(df$flag[c(max(1, i-2):max(1, i-1),
min(i+1, NROW(df)):min(i+2, NROW(df)))] == 1)), values = 0)
# [1] 1 0 0 1 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1
For n rows,
replace(x = df$include,
list = sapply(1:NROW(df), function(i)
any(df$flag[c(max(1, i-n):max(1, i-1),
min(i+1, NROW(df)):min(i+n, NROW(df)))] == 1)), values = 0)
Another way is to use zoo::rollapply. To determine if a row is within +/- two rows of a row where flag == 1, we check if the maximum flag in a window is 1.
We need rollapply rather than rollmax because we need to specify partial = T.
# For each position, report whether the window of up to n values on either
# side (2 * n + 1 wide) has a maximum of 1.
#
# flag: vector of flags (0/1 in the example).
# n:    number of positions on each side to include in the window.
# Returns the result of comparing each rolling maximum with 1.
is_within_flag_window <- function(flag, n) {
  # partial = TRUE lets rollapply also evaluate the shortened windows at both
  # ends. TRUE is spelled out because `T` is an ordinary variable that can be
  # reassigned.
  zoo::rollapply(flag, width = (2 * n) + 1, partial = TRUE, FUN = max) == 1
}
df %>%
mutate(include = ifelse(flag == 1, 1,
ifelse(is_within_flag_window(flag, 2), 0,
1)))
Use which and outer.
df$include[outer(which(df$flag==1), -2:2, `+`)] <- 0
If flag=1 within one or two positions of each other then restore the ones overwritten at position 0. Note this step is critical in case the "flag" overlaps in a particular range.
df$include[which(df$flag==1)] <- 1
flag include
1 0 1
2 0 0
3 0 0
4 1 1
5 0 0
6 0 0
7 0 0
8 1 1
9 0 0
10 0 0
11 0 1
12 0 1
13 0 1
14 0 0
15 0 0
16 1 1
17 0 0
18 0 0
19 0 1
20 0 1
If flag = 1 within one or two rows of the beginning or end of the dataset, R will throw errors. Use this:
## assign i for convenience/readability
i <- pmax(1, pmin(nrow(df), outer(which(df$flag==1), -2:2, `+`)))
df$include[i] <- 0
Restore 1s as before

Creating a factor/categorical variable from 4 dummies

I have a data frame with four columns, let's call them V1-V4 and ten observations. Exactly one of V1-V4 is 1 for each row, and the others of V1-V4 are 0. I want to create a new column called NEWCOL that takes on the value of 3 if V3 is 1, 4 if V4 is 1, and is 0 otherwise.
I have to do this for MANY sets of variables V1-V4 so I would like the solution to be as short as possible so that it will be easy to replicate.
This does it for 4 columns to add a fifth using matrix multiplication:
> cbind( mydf, newcol=data.matrix(mydf) %*% c(0,0,3,4) )
V1 V2 V3 V4 newcol
1 1 0 0 0 0
2 1 0 0 0 0
3 0 1 0 0 0
4 0 1 0 0 0
5 0 0 1 0 3
6 0 0 1 0 3
7 0 0 0 1 4
8 0 0 0 1 4
9 0 0 0 1 4
10 0 0 0 1 4
It's generalizable to getting multiple columns... we just need the rules. You need to make a matrix with the same number of rows as there are columns in the original data, and with one column of factors for each new variable you want to build. This shows how to build one new column from the sum of 3 times the third column plus 4 times the fourth, and another new column from one times the first and 2 times the second.
> cbind( mydf, newcol=data.matrix(mydf) %*% matrix(c(0,0,3,4, # first set of factors
1,2,0,0), # second set
ncol=2) )
V1 V2 V3 V4 newcol.1 newcol.2
1 1 0 0 0 0 1
2 1 0 0 0 0 1
3 0 1 0 0 0 2
4 0 1 0 0 0 2
5 0 0 1 0 3 0
6 0 0 1 0 3 0
7 0 0 0 1 4 0
8 0 0 0 1 4 0
9 0 0 0 1 4 0
10 0 0 0 1 4 0
An example data set:
mydf <- data.frame(V1 = c(1, 1, rep(0, 8)),
V2 = c(0, 0, 1, 1, rep(0, 6)),
V3 = c(rep(0, 4), 1, 1, rep(0, 4)),
V4 = c(rep(0, 6), rep(1, 4)))
# V1 V2 V3 V4
# 1 1 0 0 0
# 2 1 0 0 0
# 3 0 1 0 0
# 4 0 1 0 0
# 5 0 0 1 0
# 6 0 0 1 0
# 7 0 0 0 1
# 8 0 0 0 1
# 9 0 0 0 1
# 10 0 0 0 1
Here's an easy approach to generate the new column:
mydf <- transform(mydf, NEWCOL = V3 * 3 + V4 * 4)
# V1 V2 V3 V4 NEWCOL
# 1 1 0 0 0 0
# 2 1 0 0 0 0
# 3 0 1 0 0 0
# 4 0 1 0 0 0
# 5 0 0 1 0 3
# 6 0 0 1 0 3
# 7 0 0 0 1 4
# 8 0 0 0 1 4
# 9 0 0 0 1 4
# 10 0 0 0 1 4

Splitting one column into multiple columns

I have a huge dataset in which there is one column including several values for each subject (row). Here is a simplified sample dataframe:
data <- data.frame(subject = c(1:8), sex = c(1, 2, 2, 1, 2, 1, 1, 2),
age = c(35, 29, 31, 46, 64, 57, 49, 58),
v1 = c("2", "0", "3,5", "2 1", "A,4", "B,1,C", "A and B,3", "5, 6 A or C"))
> data
subject sex age v1
1 1 1 35 2
2 2 2 29 0
3 3 2 31 3,5 # separated by a comma
4 4 1 46 2 1 # separated by a blank space
5 5 2 64 A,4
6 6 1 57 B,1,C
7 7 1 49 A and B,3
8 8 2 58 5, 6 A or C
I first want to remove the letters (A, B, A and B, …) in the fourth column (v1), and then split the fourth column into multiple columns just like this:
subject sex age x1 x2 x3 x4 x5 x6
1 1 1 35 0 1 0 0 0 0
2 2 2 29 0 0 0 0 0 0
3 3 2 31 0 0 1 0 1 0
4 4 1 46 1 1 0 0 0 0
5 5 2 64 0 0 0 1 0 0
6 6 1 57 1 0 0 0 0 0
7 7 1 49 0 0 1 0 0 0
8 8 2 58 0 0 0 0 1 1
where the 1st subject takes 1 at x2 because it takes 2 at v1 in the original dataset, the 3rd subject takes 1 at both x3 and x5 because it takes 3 and 5 at v1 in the original dataset, and so on.
I would appreciate any help on this question. Thanks a lot.
You can cbind this result to data[-4] and get what you need:
0+t(sapply(as.character(data$v1), function(line)
sapply(1:6, function(x) x %in% unlist(strsplit(line, split="\\s|\\,"))) ))
#----------------
[,1] [,2] [,3] [,4] [,5] [,6]
2 0 1 0 0 0 0
0 0 0 0 0 0 0
3,5 0 0 1 0 1 0
2 1 1 1 0 0 0 0
A,4 0 0 0 1 0 0
B,1,C 1 0 0 0 0 0
A and B,3 0 0 1 0 0 0
5, 6 A or C 0 0 0 0 1 1
One solution:
# Extract the numbers from each v1 entry: every run of non-digit characters
# (letters, commas, spaces) acts as a separator. The data frame in this
# example is called `data`. lapply() keeps the result a list even when every
# row yields the same number of tokens (sapply() would collapse that case
# into a matrix and break the next step).
r <- lapply(strsplit(as.character(data$v1), "[^0-9]+"), as.numeric)
# Build one row of six 0/1 indicators per subject: position k is 1 when the
# number k occurs in that subject's v1. Empty tokens become NA and are dropped;
# a value of 0 selects nothing, so such rows stay all zero.
m <- as.data.frame(t(vapply(r, function(x) {
  y <- rep(0, 6)
  y[x[!is.na(x)]] <- 1
  y
}, numeric(6))))
# Replace v1 with the indicator columns.
data <- cbind(data[, c("subject", "sex", "age")], m)
# subject sex age V1 V2 V3 V4 V5 V6
# 1 1 1 35 0 1 0 0 0 0
# 2 2 2 29 0 0 0 0 0 0
# 3 3 2 31 0 0 1 0 1 0
# 4 4 1 46 1 1 0 0 0 0
# 5 5 2 64 0 0 0 1 0 0
# 6 6 1 57 1 0 0 0 0 0
# 7 7 1 49 0 0 1 0 0 0
# 8 8 2 58 0 0 0 0 1 1
Following DWin's awesome solution, m could be modified as:
m <- as.data.frame(t(sapply(r, function(x) {
0 + 1:6 %in% x[!is.na(x)]
})))

Resources