Subset groups in a data.table using conditions on two columns - r

I have a data.table with a high number of groups. I would like to subset whole groups (not just rows) based on the conditions on multiple columns. Consider the following data.table:
DT <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L),
group = c("A", "A", "A", "A", "B", "B", "B", "B", "C", "C", "C", "C"),
y = c(14, 19, 16, 10, 6, 8, 14, 19, 10, 9, 6, 8),
x = c(3, 3, 2, 3, 3, 3, 3, 2, 2, 3, 3, 3)),
row.names = c(NA, -12L),
class = c("data.table", "data.frame"))
>DT
id group y x
1: 1 A 14 3
2: 2 A 19 3
3: 3 A 16 2
4: 4 A 10 3
5: 5 B 6 3
6: 6 B 8 3
7: 7 B 14 3
8: 8 B 19 2
9: 9 C 10 2
10: 10 C 9 3
11: 11 C 6 3
12: 12 C 8 3
I would like to keep groups that have y=6 and x=3 in the same row. So that I would have only class B and C (preferably using data.table package in R):
id group y x
1: 5 B 6 3
2: 6 B 8 3
3: 7 B 14 3
4: 8 B 19 2
5: 9 C 10 2
6: 10 C 9 3
7: 11 C 6 3
8: 12 C 8 3
All my attempts gave me only those rows containing y=6 and x=3, which I do not want:
id group y x
1: 5 B 6 3
2: 11 C 6 3

With data.table:
DT[,.SD[any(x == 3 & y == 6)], by=group]
group id y x
<char> <int> <num> <num>
1: B 5 6 3
2: B 6 8 3
3: B 7 14 3
4: B 8 19 2
5: C 9 10 2
6: C 10 9 3
7: C 11 6 3
8: C 12 8 3
Another possibly faster option:
DT[, if (any(x == 3 & y == 6)) .SD, by=group]

Try dplyr package
#select groups containing y and x
groups = DT %>% filter(y == 6, x == 3) %>% select(group) %>% unique() %>% unlist() %>% as.vector()
# filter for selected groups
DT %>% filter(group %in% groups)

A data.table option
> DT[group %in% DT[.(3, 6), group, on = .(x, y)]]
id group y x
1: 5 B 6 3
2: 6 B 8 3
3: 7 B 14 3
4: 8 B 19 2
5: 9 C 10 2
6: 10 C 9 3
7: 11 C 6 3
8: 12 C 8 3

Related

Replace value when value above and below are the same

I have the following dataframe df (dput below):
> df
group value
1 A 2
2 A 2
3 A 3
4 A 2
5 A 1
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 4
13 B 3
14 B 3
15 B 4
16 B 4
I would like to replace value when the value above and below are the same per group. For example row 3 has a value above of 2 and below of 2 which means the 3 should be 2. The desired output should look like this:
group value
1 A 2
2 A 2
3 A 2
4 A 2
5 A 2
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 3
13 B 3
14 B 3
15 B 4
16 B 4
So I was wondering if anyone knows how to replace values when the value above and below are the same like in the example above?
dput data:
df<-structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), value = c(2, 2, 3, 2,
1, 2, 2, 2, 3, 3, 3, 4, 3, 3, 4, 4)), class = "data.frame", row.names = c(NA,
-16L))
With ifelse, lead and lag:
library(dplyr)
df %>%
mutate(value = ifelse(lead(value, default = TRUE) == lag(value, default = TRUE),
lag(value), value))
group value
1 A 2
2 A 2
3 A 2
4 A 2
5 A 2
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 3
13 B 3
14 B 3
15 B 4
16 B 4

summing up values in R dataframe without aggregation

How can I add up row values in a dataframe based on conditions without having to aggregate the whole table?
I have this df:
town party votes
1 a A 1
2 a B 2
3 a C 3
4 b A 4
5 b B 5
6 b C 6
7 c A 7
8 c B 8
9 c C 9
I would like to add the votes of one party to those of another by town, without touching the values of the third one.
Basically to run df$votes[df$party == A] = df$votes[df$party == A] + df$votes[df$party == B] for each category of df$town
I'm interpreting your pseudo-code as wanting to only update party "A" to the sum of both "A" and "B"'s votes.
base R
do.call(rbind, by(df, df$town,
function(Z) {
ind <- Z$party %in% c("A", "B")
Z$votes[Z$party == "A"] <- sum(Z$votes[ind])
Z
}
))
# town party votes
# a.1 a A 3
# a.2 a B 2
# a.3 a C 3
# b.4 b A 9
# b.5 b B 5
# b.6 b C 6
# c.7 c A 15
# c.8 c B 8
# c.9 c C 9
dplyr
library(dplyr)
df %>%
group_by(town) %>%
mutate(
votes = if_else(party == "A", sum(votes[party %in% c("A", "B")]), votes)
) %>%
ungroup()
# # A tibble: 9 x 3
# town party votes
# <chr> <chr> <int>
# 1 a A 3
# 2 a B 2
# 3 a C 3
# 4 b A 9
# 5 b B 5
# 6 b C 6
# 7 c A 15
# 8 c B 8
# 9 c C 9
data.table
library(data.table)
DT <- as.data.table(df) # normally setDT(df) is canonical
DT[, votes := fifelse(party == "A", sum(votes[party %in% c("A", "B")]), votes),
by = .(town)]
# town party votes
# <char> <char> <int>
# 1: a A 3
# 2: a B 2
# 3: a C 3
# 4: b A 9
# 5: b B 5
# 6: b C 6
# 7: c A 15
# 8: c B 8
# 9: c C 9
You can try mutate with dplyr if you want to keep the structure of the dataframe
library(dplyr)
df %>%
group_by(town) %>%
mutate(sum=ifelse(party!="C", sum(votes[party!="C"]), votes)) %>%
ungroup()
# A tibble: 9 × 4
town party votes sum
<chr> <chr> <int> <int>
1 a A 1 3
2 a B 2 3
3 a C 3 3
4 b A 4 9
5 b B 5 9
6 b C 6 6
7 c A 7 15
8 c B 8 15
9 c C 9 9
Another way using summarise
df %>%
filter(party!="C") %>%
group_by(town) %>%
summarise(sum=sum(votes))
# A tibble: 3 × 2
town sum
<chr> <int>
1 a 3
2 b 9
3 c 15
tidyverse
df <- data.frame(
stringsAsFactors = FALSE,
town = c("a", "a", "a", "b", "b", "b", "c", "c", "c"),
party = c("A", "B", "C", "A", "B", "C", "A", "B", "C"),
votes = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L)
)
library(tidyverse)
df %>%
group_by(town, grp_party = party %in% c("A", "B")) %>%
mutate(new_party = paste0(party, collapse = ""), new_votes = sum(votes)) %>%
ungroup() %>%
select(-grp_party)
#> # A tibble: 9 x 5
#> town party votes new_party new_votes
#> <chr> <chr> <int> <chr> <int>
#> 1 a A 1 AB 3
#> 2 a B 2 AB 3
#> 3 a C 3 C 3
#> 4 b A 4 AB 9
#> 5 b B 5 AB 9
#> 6 b C 6 C 6
#> 7 c A 7 AB 15
#> 8 c B 8 AB 15
#> 9 c C 9 C 9
Created on 2022-02-08 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[, votes:= lapply(.SD, sum), by = list(town, party %in% c("A", "B"))][]
#> town party votes
#> 1: a A 3
#> 2: a B 3
#> 3: a C 3
#> 4: b A 9
#> 5: b B 9
#> 6: b C 6
#> 7: c A 15
#> 8: c B 15
#> 9: c C 9
Created on 2022-02-08 by the reprex package (v2.0.1)

Find overlaps in time intervals by group and return subsetted data.frame

Say I have this dataframe, which has two IDs (1/2) with their start and end times in three different zones (A/B/C):
df <- structure(list(id = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), zone = c("A",
"B", "A", "C", "B", "A", "B", "A", "B", "C"), start = c(0, 6,
7, 8, 10, 0, 3, 5, 6, 7), end = c(6, 7, 8, 10, 11, 3, 5, 6, 7,
11)), row.names = c(NA, -10L), class = "data.frame")
df
id zone start end
1 1 A 0 6
2 1 B 6 7
3 1 A 7 8
4 1 C 8 10
5 1 B 10 11
6 2 A 0 3
7 2 B 3 5
8 2 A 5 6
9 2 B 6 7
10 2 C 7 11
If we look at each zone, we can visually inspect the times when IDs are in the same zone and when they are not:
split(df,df$zone)
$A
id zone start end
1 1 A 0 6
3 1 A 7 8
6 2 A 0 3
8 2 A 5 6
$B
id zone start end
2 1 B 6 7
5 1 B 10 11
7 2 B 3 5
9 2 B 6 7
$C
id zone start end
4 1 C 8 10
10 2 C 7 11
e.g. Both 1 and 2 are together in zone A from 0-3, and from 5-6, but not at other times.
Desired Output
I want to extract three dataframes.
A dataframe showing the times and zones they are together:
zone start end id
1 A 0 3 1-2
2 A 5 6 1-2
3 B 6 7 1-2
4 C 8 10 1-2
2 & 3: Dataframes for times when they are not together:
#id=1
zone start end
1 A 3 5
2 A 7 8
3 B 10 11
#id=2
zone start end
1 B 3 5
2 C 7 8
3 C 10 11
I have been trying to work with foverlaps from data.table and the intervals package, but can't seem to work out the correct method.
e.g. Subsetting each zone/id, I can sort of get an output that includes overlaps, but it doesn't seem to be quite the right direction:
A <- split(df,df$zone)$A
Asp <- split(A,A$id)
x <- setDT(Asp[[1]])
y <- setDT(Asp[[2]])
setkey(y, start, end)
foverlaps(x, y, type="any")
id zone start end i.id i.zone i.start i.end
1: 2 A 0 3 1 A 0 6
2: 2 A 5 6 1 A 0 6
3: NA <NA> NA NA 1 A 7 8
Any help greatly appreciated.
EDIT: Extra example dataset that seemed to bring up some issues with current suggested solutions:
df2 <- structure(list(start = c(0, 5, 6, 8, 10, 13, 15, 20, 22, 26,
29, 37, 40, 42, 0, 3, 6, 9, 15, 20, 25, 33, 35, 40), id = c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2), zone = c("A", "B", "A", "D", "C", "B", "C", "B", "A",
"B", "A", "D", "C", "D", "A", "B", "C", "D", "A", "B", "C", "B",
"A", "D"), end = c(5, 6, 8, 10, 13, 15, 20, 22, 26, 29, 37, 40,
42, 45, 3, 6, 9, 15, 20, 25, 33, 35, 40, 45)), class = c("data.table", "data.frame"), row.names = c(NA, -24L))
df2
start id zone end
1: 0 1 A 5
2: 5 1 B 6
3: 6 1 A 8
4: 8 1 D 10
5: 10 1 C 13
6: 13 1 B 15
7: 15 1 C 20
8: 20 1 B 22
9: 22 1 A 26
10: 26 1 B 29
11: 29 1 A 37
12: 37 1 D 40
13: 40 1 C 42
14: 42 1 D 45
15: 0 2 A 3
16: 3 2 B 6
17: 6 2 C 9
18: 9 2 D 15
19: 15 2 A 20
20: 20 2 B 25
21: 25 2 C 33
22: 33 2 B 35
23: 35 2 A 40
24: 40 2 D 45
start id zone end
This seems to work, filtering the foverlaps output:
DT = data.table(df)
setkey(DT, start, end)
oDT0 = foverlaps(DT[id==1], DT[id==2])
oDT0[, `:=`(
ostart = pmax(start, i.start),
oend = pmin(end, i.end)
)]
oDT = oDT0[ostart < oend]
# together
oDT[zone == i.zone, .(ids = '1-2', zone, ostart, oend)]
# ids zone ostart oend
# 1: 1-2 A 0 3
# 2: 1-2 A 5 6
# 3: 1-2 B 6 7
# 4: 1-2 C 8 10
# apart
oDT[zone != i.zone, .(id, zone, i.id, i.zone, ostart, oend)]
# id zone i.id i.zone ostart oend
# 1: 2 B 1 A 3 5
# 2: 2 C 1 A 7 8
# 3: 2 C 1 B 10 11
Repeating for new input... not sure if it's correct since no expected output was provided:
> DT = data.table(df2)
> ...
> oDT[zone == i.zone, .(ids = '1-2', zone, ostart, oend)]
ids zone ostart oend
1: 1-2 A 0 3
2: 1-2 B 5 6
3: 1-2 D 9 10
4: 1-2 B 20 22
5: 1-2 A 35 37
6: 1-2 D 42 45
> oDT[zone != i.zone, .(id, zone, i.id, i.zone, ostart, oend)]
id zone i.id i.zone ostart oend
1: 2 B 1 A 3 5
2: 2 C 1 A 6 8
3: 2 C 1 D 8 9
4: 2 D 1 C 10 13
5: 2 D 1 B 13 15
6: 2 A 1 C 15 20
7: 2 B 1 A 22 25
8: 2 C 1 A 25 26
9: 2 C 1 B 26 29
10: 2 C 1 A 29 33
11: 2 B 1 A 33 35
12: 2 A 1 D 37 40
13: 2 D 1 C 40 42
I suspect there is a way to pass arguments to foverlaps to avoid needing to define and filter by ostart and oend. As of the latest CRAN version of the package, the doc indicates that minoverlap is not yet implemented, so maybe it is necessary for now.
I think you are almost there. You can try the code below by defining a function f
f <- function(A) {
Asp <- split(A, by = "id")
u <- na.omit(foverlaps(Asp[[1]], setkey(Asp[[2]], start, end)))
r <- c()
for (k in 1:nrow(u)) {
if (u[k, end - start < i.end - i.start]) {
p <- u[k, .(start, end)]
} else {
p <- u[k, .(start = i.start, end = i.end)]
}
r[[k]] <- p
}
cbind(
zone = u[, zone],
rbindlist(r),
id = paste0(unique(A[, id]), collapse = "-")
)
}
and then run
rbindlist(Map(f, split(setDT(df), by = "zone")))
which gives
> rbindlist(Map(f, split(setDT(df), by = "zone")))
zone start end id
1: A 0 3 1-2
2: A 5 6 1-2
3: B 6 7 1-2
4: C 8 10 1-2
For the first data.frame, you can also use a non-equi join:
ovlap <- df[df, on=.(zone, id<id, start<end, end>start), nomatch=0L,
.(zone, id2=i.id, i.start, i.end, id1=x.id, x.start, x.end)][,
.(start=max(x.start, i.start), end=min(x.end, i.end)),
.(zone, id1, id2, i.start)][,
i.start := NULL][]
# zone id1 id2 start end
#1: A 1 2 0 3
#2: A 1 2 5 6
#3: B 1 2 6 7
#4: C 1 2 8 10
For the other output data.frames, you can perform a non-equi join first with the previous result and then for each interval find the sub-intervals where the other partner is not around:
rangeDiff <- function(DT) {
DT[,
if (is.na(x.start[1L])) {
.(start=i.start, end=i.end)
} else {
.(start=c(i.start, x.end+1L),
end=c(x.start-1L, i.end))
},
.(zone, id, i.start, i.end)][
start<=end][,
c("i.start","i.end") := NULL][]
} #rangeDiff
rangeDiff(ovlap[df[id==1L], on=.(zone, id1=id, start<end, end>start),
.(zone, id, i.start, i.end, x.start, x.end)])
# zone id V1 V2
#1: A 1 4 4
#2: A 1 7 8
#3: B 1 10 11
rangeDiff(ovlap[df[id==2L], on=.(zone, id2=id, start<end, end>start),
.(zone, id, i.start, i.end, x.start, x.end)])
# zone id V1 V2
#1: B 2 3 5
#2: C 2 7 7
#3: C 2 11 11
There is some inconsistency in OP where the bounds of intervals are inclusive or exclusive. I have used inclusive when both ids are around in the same zone (i.e. in the first output data.frame).
Edit: show output for df2
ovlap
# zone id1 id2 start end
#1: A 1 2 0 3
#2: A 1 2 35 37
#3: B 1 2 5 6
#4: B 1 2 20 22
#5: D 1 2 9 10
#6: D 1 2 42 45
other required data.frames:
rangeDiff(ovlap[df[id==1L], on=.(zone, id1=id, start<end, end>start),
.(zone, id, i.start, i.end, x.start, x.end)])
# zone id start end
# 1: A 1 4 5
# 2: A 1 6 8
# 3: A 1 22 26
# 4: A 1 29 34
# 5: B 1 13 15
# 6: B 1 26 29
# 7: C 1 10 13
# 8: C 1 15 20
# 9: C 1 40 42
# 10: D 1 8 8
# 11: D 1 37 40
rangeDiff(ovlap[df[id==2L], on=.(zone, id2=id, start<end, end>start),
.(zone, id, i.start, i.end, x.start, x.end)])
# zone id start end
# 1: A 2 15 20
# 2: A 2 38 40
# 3: B 2 3 4
# 4: B 2 23 25
# 5: B 2 33 35
# 6: C 2 6 9
# 7: C 2 25 33
# 8: D 2 11 15
# 9: D 2 40 41
df2 sorted by zone for easier checking:
start id zone end
1: 0 1 A 5
2: 6 1 A 8
3: 22 1 A 26
4: 29 1 A 37
5: 0 2 A 3
6: 15 2 A 20
7: 35 2 A 40
8: 5 1 B 6
9: 13 1 B 15
10: 20 1 B 22
11: 26 1 B 29
12: 3 2 B 6
13: 20 2 B 25
14: 33 2 B 35
15: 10 1 C 13
16: 15 1 C 20
17: 40 1 C 42
18: 6 2 C 9
19: 25 2 C 33
20: 8 1 D 10
21: 37 1 D 40
22: 42 1 D 45
23: 9 2 D 15
24: 40 2 D 45
Updated Solution
I have made some modifications to the previous solution so that it works with the newly presented data set df2:
I tried to create all combinations of id == 1 and id == 2 in every zone to try and find their intersects
Then I created a custom function to take a subset of our data set plus a pair of ids to extract their start to end values so that we have two vectors and we can find their intersects easily
` In the end I applied this function to every subset of our data set
library(dplyr)
library(tidyr)
library(purrr)
fn <- function(data, x, y) {
base::intersect(data %>%
filter(row_number() == x) %>%
select(start, end) %>%
{map2(.$start, .$end, ~ .x:.y)} %>%
unlist(),
data %>%
filter(row_number() == y) %>%
select(start, end) %>%
{map2(.$start, .$end, ~ .x:.y)} %>%
unlist())
}
Then we apply it on our data set:
split(df2, df2$zone) %>%
map(~ .x %>%
mutate(grp = row_number()) %>%
{expand.grid(.$grp[.$id == 1], .$grp[.$id == 2])} %>%
rowwise() %>%
mutate(insec = list(fn(.x, Var1, Var2))) %>%
filter(length(insec) != 0) %>%
unnest(cols = c(insec)) %>%
group_by(Var1, Var2) %>%
filter(row_number() == 1 | row_number() == n()) %>%
filter(n() > 1) %>%
mutate(id = row_number()) %>%
pivot_wider(names_from = id, values_from = insec) %>%
ungroup()) %>%
keep(~ nrow(.x) != 0) %>%
imap_dfr(~ .x %>%
mutate(zone
= .y) %>%
select(!starts_with("Var"))) %>%
relocate(zone) %>%
rename(start = `1`, end = `2`)
# A tibble: 6 x 3
zone start end
<chr> <int> <int>
1 A 0 3
2 A 35 37
3 B 5 6
4 B 20 22
5 D 9 10
6 D 42 45

Conditional statement within group

I have a dataframe in which I want to make a new column with values based on condition within groups. So for the dataframe below, I want to make a new column n_actions which gives
Cond1. for the whole group GROUP the number 2 if a 6 appears in column STEP
Cond 2. for the whole group GROUP the number 3 if a 9 appears in column STEP
Cond 3. if not a 6 or 9 appears within column STEP for the GROUP, then 1
#dataframe start
dataframe <- data.frame(group = c("A", "A", "A", "B", "B", "B", "B", "B", "B", "C", "C", "C", "D", "D", "D", "D", "D", "D", "D", "D", "D"),
step = c(1, 2, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9))
# dataframe desired
dataframe$n_actions <- c(rep(1, 3), rep(2, 6,), rep(1, 3), rep(3, 9))
Try out:
library(dplyr)
dataframe %>%
group_by(group) %>%
mutate(n_actions = ifelse(9 %in% step, 3,
ifelse(6 %in% step, 2, 1)))
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fctr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
# ... with 11 more rows
Another way with dplyr's case_when:
library(dplyr)
dataframe %>%
group_by(group) %>%
mutate(
n_actions1 = case_when(
9 %in% step ~ 3,
6 %in% step ~ 2,
TRUE ~ 1
)
)
Output:
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fct> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
11 C 2 1
12 C 3 1
13 D 1 3
14 D 2 3
15 D 3 3
16 D 4 3
17 D 5 3
18 D 6 3
19 D 7 3
20 D 8 3
21 D 9 3
You could divide the maximum value per group by %/% 3, it seems.
dataframe <- transform(dataframe,
n_actions2 = ave(step, group, FUN = function(x) max(x) %/% 3))
dataframe
# group step n_actions n_actions2
#1 A 1 1 1
#2 A 2 1 1
#3 A 3 1 1
#4 B 1 2 2
#5 B 2 2 2
#6 B 3 2 2
#7 B 4 2 2
#8 B 5 2 2
#9 B 6 2 2
#10 C 1 1 1
#11 C 2 1 1
#12 C 3 1 1
#13 D 1 3 3
#14 D 2 3 3
#15 D 3 3 3
#16 D 4 3 3
#17 D 5 3 3
#18 D 6 3 3
#19 D 7 3 3
#20 D 8 3 3
#21 D 9 3 3

Backfill data frame in R based on maximum number

I have a data frame with a number of id's and max values.
df <- data.frame(id = c("a", "b", "c"), max.number = c(0, 6, 4))
id max.number
1 a 0
2 b 6
3 c 4
I want to create a new data frame that backfills down to zero for each id. It would look like:
id number
1 a 0
2 b 0
3 b 1
4 b 2
5 b 3
6 b 4
7 b 5
8 b 6
9 b 0
10 c 1
11 c 2
12 c 3
13 c 4
I've tried to do this through a nested for loop but can't get it to work correctly. I would love some guidance in doing this with apply if possible.
This would be very easy with a data.table table:
# install.packages("data.table")
library(data.table)
df <- data.frame(id = c("a", "b", "c"), max.number = c(0, 6, 4))
dt <- as.data.table(df)
# or directly:
# dt <- data.table(id = c("a", "b", "c"), max.number = c(0, 6, 4))
dt[, .(number=seq(0, max.number)), by=id]
Output:
id number
1: a 0
2: b 0
3: b 1
4: b 2
5: b 3
6: b 4
7: b 5
8: b 6
9: c 0
10: c 1
11: c 2
12: c 3
13: c 4

Resources