Group and subtract all rows in a dataframe - r

Consider the following example:
t1 t2 t3 t4 t5 t6
A 4 6 7 8 5
A 3 6 8 1 4
A 2 3 5 3 1
A 3 4 3 1 3
B 3 6 8 3 4
B 3 9 3 7 3
B 5 2 3 2 1
I hope to get a dataframe which contains the differences between every possible pair of rows within a group (A and B separately).
I have been trying to subset the dataframe based on a loop and manually calculate the differences between rows. This is leading to a lot of computation and it is hard to manage the referencing after all the differences have been calculated. I am basically ending up with several lists.
The resulting dataframe needs to contain the row differences for all possible row combinations(or permutations?) within a group.
For example, for B, without considering the first column i.e. characters, the following is the expected result:
t2 t3 t4 t5 t6
0 -3 5 -4 1
-2 4 5 1 3
-2 7 0 5 2
The sign is not very important. Only the magnitude

Here's one way to get a table with a row showing the difference between every pair of rows in each group. For example, row 2 of this output shows row 1 of the input minus row 3 of the input.
# For every t1 group, build all ordered pairs of distinct within-group rows and
# subtract the second row of each pair from the first.
library(data.table)
setDT(df) # convert to data.table
# CJ() cross-joins the within-group positions 1..N with themselves; the
# [row1 != row2] filter drops the pairs of a row with itself.
df[, { pairs <- CJ(row1 = 1:.N, row2 = 1:.N)[row1 != row2]
# Adding .I[1] - 1 shifts the within-group positions so they read as row
# numbers of df (in the printed result group B's pairs use rows 5-7).
# NOTE(review): this offset presumably relies on each group's rows being
# contiguous in df -- confirm before using on unsorted data.
# .SD[...] picks the group's value columns (t1 is excluded because it is the
# grouping column), so the subtraction is done column by column.
data.table(pairs + .I[1] - 1, .SD[pairs[[1]]] - .SD[pairs[[2]]])
}, by = t1]
# t1 row1 row2 t2 t3 t4 t5 t6
# 1: A 1 2 1 0 -1 7 1
# 2: A 1 3 2 3 2 5 4
# 3: A 1 4 1 2 4 7 2
# 4: A 2 1 -1 0 1 -7 -1
# 5: A 2 3 1 3 3 -2 3
# 6: A 2 4 0 2 5 0 1
# 7: A 3 1 -2 -3 -2 -5 -4
# 8: A 3 2 -1 -3 -3 2 -3
# 9: A 3 4 -1 -1 2 2 -2
# 10: A 4 1 -1 -2 -4 -7 -2
# 11: A 4 2 0 -2 -5 0 -1
# 12: A 4 3 1 1 -2 -2 2
# 13: B 5 6 0 -3 5 -4 1
# 14: B 5 7 -2 4 5 1 3
# 15: B 6 5 0 3 -5 4 -1
# 16: B 6 7 -2 7 0 5 2
# 17: B 7 5 2 -4 -5 -1 -3
# 18: B 7 6 2 -7 0 -5 -2
This is a little redundant since it also shows row 3 - row 1 (which is just the negative). If you don't want this duplication, change row1 != row2 to row1 < row2.
df[, { pairs <- CJ(row1 = 1:.N, row2 = 1:.N)[row1 < row2]
data.table(pairs + .I[1] - 1, .SD[pairs[[1]]] - .SD[pairs[[2]]])
}, by = t1]
# t1 row1 row2 t2 t3 t4 t5 t6
# 1: A 1 2 1 0 -1 7 1
# 2: A 1 3 2 3 2 5 4
# 3: A 1 4 1 2 4 7 2
# 4: A 2 3 1 3 3 -2 3
# 5: A 2 4 0 2 5 0 1
# 6: A 3 4 -1 -1 2 2 -2
# 7: B 5 6 0 -3 5 -4 1
# 8: B 5 7 -2 4 5 1 3
# 9: B 6 7 -2 7 0 5 2
Explanation:
CJ(a, b) generates a data.table with a row for all possible pairs of values (a[i], b[j]). Example:
CJ(1:3, 1:3)
# V1 V2
# 1: 1 1
# 2: 1 2
# 3: 1 3
# 4: 2 1
# 5: 2 2
# 6: 2 3
# 7: 3 1
# 8: 3 2
# 9: 3 3
Since it is a data.table, you can subset it inside [] using bare column names (no $ prefix), for example:
CJ(a = 1:3, b = 1:3)[a < b]
# a b
# 1: 1 2
# 2: 1 3
# 3: 2 3
Within the j part of dt[i, j, k], the variable .SD is the entire data.table subset to the given group (groups determined by the grouping vars in k). So this answer takes the first element of each pair, selects the rows of the group corresponding to those elements .SD[pairs[[1]]], and subtracts from that the rows of the group corresponding to the other element of each pair .SD[pairs[[2]]]. A data.table is created with pairs and the output of this subtraction. This is done for each group, and data.table automatically rbinds all the group outputs together.

Using tidyverse, you can use a Cartesian join within each t1:
dat %>%
gather(key, value, -grp) %>%
left_join(gather(dat, key, value, - grp), by = "grp") %>%
mutate(diff = value.x - value.y)
grp key.x value.x key.y value.y diff
1 A t2 4 t2 4 0
2 A t2 4 t2 3 1
3 A t2 4 t2 2 2
4 A t2 4 t2 3 1
5 A t2 4 t3 6 -2
6 A t2 4 t3 6 -2
Note that since my brain doesn't always comply I relabeled t1 as grp.

Here is an option. We split the dataframe by group, then we find every combo of rows per group, then we map out the differences and rejoin.
library(tidyverse)
testDat %>%
group_by(t1) %>%
mutate(row = row_number()) %>%
split(.$t1) %>%
map(
~nest(., data = -c(t1, row)) %>%
list(.,.) %>%
reduce(full_join, by = "t1") %>%
rename(row1 = row.x, row2 = row.y, vec1 = data.x, vec2 = data.y) %>%
filter(row1 != row2) %>%
mutate(diff = map2(vec1, vec2, ~unlist(.x)-unlist(.y)))%>%
select(-vec1, -vec2) %>%
unnest_wider(col = diff)
) %>%
bind_rows()
#> # A tibble: 18 x 8
#> # Groups: t1 [2]
#> t1 row1 row2 t2 t3 t4 t5 t6
#> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 2 1 0 -1 7 1
#> 2 A 1 3 2 3 2 5 4
#> 3 A 1 4 1 2 4 7 2
#> 4 A 2 1 -1 0 1 -7 -1
#> 5 A 2 3 1 3 3 -2 3
#> 6 A 2 4 0 2 5 0 1
#> 7 A 3 1 -2 -3 -2 -5 -4
#> 8 A 3 2 -1 -3 -3 2 -3
#> 9 A 3 4 -1 -1 2 2 -2
#> 10 A 4 1 -1 -2 -4 -7 -2
#> 11 A 4 2 0 -2 -5 0 -1
#> 12 A 4 3 1 1 -2 -2 2
#> 13 B 1 2 0 -3 5 -4 1
#> 14 B 1 3 -2 4 5 1 3
#> 15 B 2 1 0 3 -5 4 -1
#> 16 B 2 3 -2 7 0 5 2
#> 17 B 3 1 2 -4 -5 -1 -3
#> 18 B 3 2 2 -7 0 -5 -2

Another option is making use of outer
# Row-wise differences dat[i, ] - dat[j, ] for every ordered pair of distinct
# rows (i, j) of `dat`, stacked into a single data frame.
#
# dat: a data frame whose columns all support `-`.
# Returns the stacked differences; pairs are ordered with i varying fastest
# within each j. Returns NULL when `dat` has fewer than two rows.
f1 <- function(dat) {
  idx <- seq_len(nrow(dat))
  # expand.grid() varies its first factor fastest, i.e. i runs within each j.
  pairs <- expand.grid(i = idx, j = idx)
  pairs <- pairs[pairs$i != pairs$j, ]
  diffs <- Map(function(i, j) dat[i, ] - dat[j, ], pairs$i, pairs$j)
  do.call(rbind, diffs)
}
do.call(rbind, lapply(split(df1[-1], df1$t1), f1))
Or using the function with tidyverse
library(dplyr)
library(purrr)
library(tidyr) # unnest() is exported by tidyr, not by dplyr or purrr
# Apply f1() to the value columns of every t1 group, keep the group label next
# to the result, and expand the nested data frames into regular rows.
df1 %>%
  group_split(t1) %>%
  map_dfr(~ .x %>%
    # .[-1] drops the first column (t1) so f1() only sees the value columns.
    summarise(t1 = first(t1), out = list(f1(.[-1])))) %>%
  unnest(out)
data
df1 <- structure(list(t1 = c("A", "A", "A", "A", "B", "B", "B"), t2 = c(4L,
3L, 2L, 3L, 3L, 3L, 5L), t3 = c(6L, 6L, 3L, 4L, 6L, 9L, 2L),
t4 = c(7L, 8L, 5L, 3L, 8L, 3L, 3L), t5 = c(8L, 1L, 3L, 1L,
3L, 7L, 2L), t6 = c(5L, 4L, 1L, 3L, 4L, 3L, 1L)),
class = "data.frame", row.names = c(NA,
-7L))

Here is a base R solution, where combn() is used:
# For every t1 group, compute the difference between each unordered pair of
# rows (earlier row minus later row) over all columns except the first (t1).
# The result is a named list with one data frame per group.
dfout <- lapply(split(df, df$t1), function(x) {
  # A group with fewer than two rows has no pairs; combn() would raise
  # "n < m" here, so return an empty data frame with the value columns.
  if (nrow(x) < 2L) {
    return(x[0L, -1L, drop = FALSE])
  }
  pair_diffs <- combn(
    seq_len(nrow(x)), 2L,
    # drop = FALSE keeps one-column selections as data frames so rbind()
    # still stacks rows when only a single value column exists.
    function(v) x[v[1L], -1L, drop = FALSE] - x[v[2L], -1L, drop = FALSE],
    simplify = FALSE
  )
  do.call(rbind, pair_diffs)
})
such that
> dfout
$A
t2 t3 t4 t5 t6
1 1 0 -1 7 1
2 2 3 2 5 4
3 1 2 4 7 2
21 1 3 3 -2 3
22 0 2 5 0 1
31 -1 -1 2 2 -2
$B
t2 t3 t4 t5 t6
5 0 -3 5 -4 1
51 -2 4 5 1 3
6 -2 7 0 5 2
DATA
df <- structure(list(t1 = c("A", "A", "A", "A", "B", "B", "B"), t2 = c(4L,
3L, 2L, 3L, 3L, 3L, 5L), t3 = c(6L, 6L, 3L, 4L, 6L, 9L, 2L),
t4 = c(7L, 8L, 5L, 3L, 8L, 3L, 3L), t5 = c(8L, 1L, 3L, 1L,
3L, 7L, 2L), t6 = c(5L, 4L, 1L, 3L, 4L, 3L, 1L)), class = "data.frame", row.names = c(NA,
-7L))

Related

By group relative order

I have a data set that looks like this
ID
Week
1
3
1
5
1
5
1
8
1
11
1
16
2
2
2
2
2
3
2
3
2
9
Now, what I would like to do is to add another column to the DataFrame so that, for every ID, I mark the week's relative position. More precisely, I would like to mark each ID's earliest week (smallest number) as 1, then the next week for that ID as 2, and so forth; if there are two observations of the same week, they get the same number.
So, in the above example I should get:
ID
Week
Order
1
3
1
1
5
2
1
5
2
1
8
3
1
11
4
1
16
5
2
2
1
2
2
1
2
3
2
2
3
2
2
9
3
How could I achieve this?
Thank you very much!
A base R option using ave + match
transform(
df,
Order = ave(Week,
ID,
FUN = function(x) match(x, sort(unique(x)))
)
)
or ave + order (thank #IRTFM for comments)
transform(
  df,
  Order = ave(Week,
    ID,
    # order() returns the permutation that sorts x, not a rank: tied weeks
    # (e.g. 5, 5) would receive different numbers and unsorted input would be
    # mislabelled. The integer codes of factor(x) number the sorted unique
    # weeks 1, 2, 3, ..., which is the dense rank shown in the output below.
    FUN = function(x) as.integer(factor(x))
  )
)
gives
ID Week Order
1 1 3 1
2 1 5 2
3 1 5 2
4 1 8 3
5 1 11 4
6 1 16 5
7 2 2 1
8 2 2 1
9 2 3 2
10 2 3 2
11 2 9 3
A data.table option with frank
> setDT(df)[, Order := frank(Week, ties.method = "dense"), ID][]
ID Week Order
1: 1 3 1
2: 1 5 2
3: 1 5 2
4: 1 8 3
5: 1 11 4
6: 1 16 5
7: 2 2 1
8: 2 2 1
9: 2 3 2
10: 2 3 2
11: 2 9 3
Data
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L), Week = c(3L, 5L, 5L, 8L, 11L, 16L, 2L, 2L, 3L, 3L, 9L)), class = "data.frame", row.names =
c(NA,
-11L))
You can use dense_rank in dplyr :
library(dplyr)
df %>% group_by(ID) %>% mutate(Order = dense_rank(Week)) %>% ungroup
# ID Week Order
# <int> <int> <int>
# 1 1 3 1
# 2 1 5 2
# 3 1 5 2
# 4 1 8 3
# 5 1 11 4
# 6 1 16 5
# 7 2 2 1
# 8 2 2 1
# 9 2 3 2
#10 2 3 2
#11 2 9 3

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
# Rotate the vector `x` to the left by `n` positions (a negative `n` rotates
# to the right). Shifts larger than length(x) wrap around.
#
# x: a vector.
# n: a single number of positions to rotate by.
# Returns a vector with the same elements as `x` in rotated order.
shifter <- function(x, n) {
  len <- length(x)
  # Nothing to rotate; also avoids taking n modulo zero below.
  if (len == 0L) {
    return(x)
  }
  # Reduce the shift into 0..len-1 so that n > len (or n < -len) still rotates
  # instead of head()/tail() silently returning the whole vector.
  n <- n %% len
  if (n == 0) {
    x
  } else {
    c(tail(x, -n), head(x, n))
  }
}
# Split df into three interleaved slices (the grouping vector 0,1,2 repeats
# over the rows), overwrite column 2 of the k-th slice with its column 1
# rotated by k - 1 positions, stack the slices and renumber the rows.
dfs <- split(df, rep(0:2, 3))
slices <- lapply(
  seq(length(dfs)),
  function(k) {
    slice <- dfs[[k]]
    slice[, 2] <- shifter(slice[, 1], k - 1)
    slice
  }
)
res <- Reduce(rbind, slices)
rownames(res) <- seq(nrow(df))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
# Generate all n * n pairs (v1, v2) of indices 0..n-1, arranged in n
# consecutive sets of n rows such that no set repeats a value in either
# column.
#
# n: number of distinct index values.
# Returns a data frame with columns v1 and v2 and n * n rows.
genAllCombn <- function(n) {
  base <- 0:(n - 1)
  v1 <- rep(base, times = n)
  # Every row of the k-th set (k = 0..n-1) gets the same offset k.
  offset <- rep(base, each = n)
  v2 <- (v1 + offset) %% n
  data.frame(v1, v2)
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

applying sorting function in r for every four rows returns dataframe sorted but without extended selection, other columns are not sorted accordingly

I need every four rows to be sorted by the 4th column, separately from the next four rows, made a function :
for (i in seq(1,nrow(data_frame), by=4)) {
data_frame[i:(i+3),4] <- sort(data_frame[i:(i+3),4], decreasing=TRUE) }
problem is only the 4th column gets sorted but the corresponding rows are maintained.
from
x y z userID
-1 1 2 5 1
-2 1 1 2 2
-3 0 0 5 5
-6 1 2 5 3
-4 1 1 2 6
-5 0 0 5 4
-4 1 1 2 1
-5 0 0 5 5
to -
x y z userID
-1 1 2 5 5
-2 1 1 2 3
-3 0 0 5 2
-6 1 2 5 1
-4 1 1 2 6
-5 0 0 5 5
-4 1 1 2 4
-5 0 0 5 1
With tidyverse, we can use %/% to create a grouping column and then use that to sort the 'userID' within each group of four rows
library(tidyverse)
df1 %>%
group_by(grp = (row_number()-1) %/% 4 + 1) %>%
#or use
#group_by(grp = cumsum(rep(c(TRUE, FALSE, FALSE, FALSE), length.out = n()))) %>%
mutate(userID = sort(userID, decreasing = TRUE))
# A tibble: 8 x 5
# Groups: grp [2]
# x y z userID grp
# <int> <int> <int> <int> <dbl>
#1 1 2 5 5 1
#2 1 1 2 3 1
#3 0 0 5 2 1
#4 1 2 5 1 1
#5 1 1 2 6 2
#6 0 0 5 5 2
#7 1 1 2 4 2
#8 0 0 5 1 2
Or using base R with ave
with(df1, ave(userID, (seq_along(userID)-1) %/% 4 + 1,
FUN = function(x) sort(x, decreasing = TRUE)))
#[1] 5 3 2 1 6 5 4 1
data
df1 <- structure(list(x = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L), y = c(2L,
1L, 0L, 2L, 1L, 0L, 1L, 0L), z = c(5L, 2L, 5L, 5L, 2L, 5L, 2L,
5L), userID = c(1L, 2L, 5L, 3L, 6L, 4L, 1L, 5L)), row.names = c(NA,
-8L), class = "data.frame")
In base R, we can split every 4 rows, order the fourth column and return the updated dataframe back.
df[] <- do.call(rbind, lapply(split(df, gl(nrow(df)/4, 4)),
function(p) p[order(p[[4]], decreasing = TRUE), ]))
df
# x y z userID
#1 0 0 5 5
#2 1 2 5 3
#3 1 1 2 2
#4 1 2 5 1
#5 1 1 2 6
#6 0 0 5 5
#7 0 0 5 4
#8 1 1 2 1
tidyverse approach using the same logic would be
library(tidyverse)
df %>%
group_split(gl(n()/4, 4), keep = FALSE) %>%
map_dfr(. %>% arrange(desc(userID)))

Longest consecutive count of the same value per group

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!
Using dplyr:
library(dplyr)
dat %>%
group_by(GROUP_ID) %>%
mutate(CONSECUTIVE = {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])})
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
dat[, CONSECUTIVE := {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])}
, by = GROUP_ID][]
We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
# For every GROUP_ID, store the length of the longest run of consecutive 1s
# found in VALUE; the value is repeated on every row of the group.
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
  runs <- rle(x == 1)
  # Lengths of the runs that consist of 1s.
  one_runs <- runs$lengths[runs$values]
  # A group without any 1 has no such run; report 0 rather than letting
  # max() return -Inf with a warning.
  if (length(one_runs) == 0L) 0L else max(one_runs)
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1
Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

R — Assign value to vector based on first episode

So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")
Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor, it must be converted to numeric while keeping the same values, so as.numeric(as.character(clockst)) needs to be used.
library(tidyverse)
dt %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.
Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4
You can do this with library(data.table) as follows
# Use the question's table `dt` rather than a variable called `T`, which would
# mask the built-in shorthand for TRUE.
dt <- data.table(dt)
# Within every id, copy the first clockst value onto all rows of that id.
# NOTE(review): this takes the first row of each id, so it presumably relies
# on the epnum == 1 row coming first within each id -- confirm the ordering.
dt[, ep_start := clockst[1L], by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4
dt$ep_start = dt$clockst[rep(which(dt$epnum == 1), rle(cumsum(dt$epnum == 1))$lengths)]
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4
Using match
clock = dt[dt$epnum == 1, ]
dt$ep_start = clock$clockst[match(dt$id, clock$id)]

Resources