data.table fill missing values from other rows by group - r

# have
> aDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA))
> aDT
colA colB
1: 1 4
2: 1 NA
3: 1 NA
4: 1 1
5: 2 4
6: 2 3
7: 2 NA
8: 2 NA
9: 3 4
10: 3 NA
11: 3 2
12: 3 NA
# want
> bDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,1,1,1,4,3,3,3,4,2,2,2))
> bDT
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Would like to fill missing values according to the algorithm below:
within each group ('colA'),
use the value from one row below; if it's still NA, keep going until the last row within that group
if all NAs in rows below, look at rows above (go up 1 row at a time)
if all NAs, then NA
Since the dataset is quite large, algorithmic efficiency is part of consideration. Not sure if there's any package for this type of operation already. How to do it?

With data.table and zoo:
library(data.table)
library(zoo)
# Pass 1: within each colA group, fill every NA with the next non-NA value
# below it (fromLast = TRUE runs the carry from the last row upwards).
# `:=` updates dt by reference, so no re-assignment is needed.
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA]
# Pass 2: any NA still left sits below the last observed value of its group;
# fill it from the nearest non-NA value above. na.locf0() keeps the result the
# same length as the input, so a group that is entirely NA stays NA, whereas
# na.locf() defaults to na.rm = TRUE and would drop those leading NAs.
dt[, colB := na.locf0(colB), by = colA][]
Or in a single chain:
# Fill from below first, then from above; na.locf0() keeps each group's length
# unchanged, so an all-NA group simply stays NA.
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA][
  , colB := na.locf0(colB), by = colA][]
Both return:
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Data:
text <- "colA colB
1 4
1 NA
1 NA
1 1
2 4
2 3
2 NA
2 NA
3 4
3 NA
3 2
3 NA"
dt <- fread(input = text, stringsAsFactors = FALSE)

Here is one way using tidyverse and zoo::na.locf:
library(tidyverse)
library(zoo)
df %>%
  group_by(colA) %>%
  arrange(colA) %>%
  # First fill from the rows below; na.rm = FALSE keeps leading/trailing NAs
  # so the result has the same length as the group.
  mutate(colB = na.locf(colB, na.rm = FALSE, fromLast = TRUE)) %>%
  # Then fill whatever is still NA from the rows above.
  mutate(colB = na.locf(colB, na.rm = FALSE))
## A tibble: 12 x 2
## Groups: colA [3]
# colA colB
# <dbl> <dbl>
# 1 1.00 4.00
# 2 1.00 1.00
# 3 1.00 1.00
# 4 1.00 1.00
# 5 2.00 4.00
# 6 2.00 3.00
# 7 2.00 3.00
# 8 2.00 3.00
# 9 3.00 4.00
#10 3.00 2.00
#11 3.00 2.00
#12 3.00 2.00
Or the data.table way:
library(data.table)
# Inner call fills from below, outer call fills what is left from above.
# The unnamed j expression is returned as column V1.
dt[, .(na.locf(na.locf(colB, na.rm = FALSE, fromLast = TRUE), na.rm = FALSE)),
   by = .(colA)]
# colA V1
# 1: 1 4
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 2 4
# 6: 2 3
# 7: 2 3
# 8: 2 3
# 9: 3 4
#10: 3 2
#11: 3 2
#12: 3 2
The key in both cases is to apply na.locf twice: First to replace NAs from the bottom, then replace the remaining NAs from the top.
Sample data
# As data.frame
df <- data.frame(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));
# As data.table
dt <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA));

library(tidyverse)
# Within each colA group, fill NAs upwards first, then downwards.
aDT %>%
  group_by(colA) %>%
  fill(colB, .direction = "up") %>%
  fill(colB, .direction = "down")
# A tibble: 12 x 2
# Groups: colA [3]
colA colB
<dbl> <dbl>
1 1 4
2 1 1
3 1 1
4 1 1
5 2 4
6 2 3
7 2 3
8 2 3
9 3 4
10 3 2
11 3 2
12 3 2

Related

R data.table group by continuous values

I need some help with grouping data by continuous values.
If I have this data.table
# b is the integer sequence 1..10; c is b + 1 (double, because 1 is numeric).
dt <- data.table::data.table(
  a = c(1, 1, 1, 2, 2, 2, 2, 1, 1, 2),
  b = 1:10,
  c = 1:10 + 1
)
a b c
1: 1 1 2
2: 1 2 3
3: 1 3 4
4: 2 4 5
5: 2 5 6
6: 2 6 7
7: 2 7 8
8: 1 8 9
9: 1 9 10
10: 2 10 11
I need a group for every run of consecutive equal values in column a. For each group I need the first (also the minimum possible) value of column b and the last (also the maximum possible) value of column c.
Like this:
a b c
1: 1 1 4
2: 2 4 8
3: 1 8 10
4: 2 10 11
Thank you very much for your help. I cannot solve it on my own.
Probably we can try
> dt[, .(a = a[1], b = b[1], c = c[.N]), rleid(a)][, -1]
a b c
1: 1 1 4
2: 2 4 8
3: 1 8 10
4: 2 10 11
An option with dplyr
library(dplyr)
dt %>%
group_by(grp = cumsum(c(TRUE, diff(a) != 0))) %>%
summarise(across(a:b, first), c = last(c)) %>%
select(-grp)
-output
# A tibble: 4 × 3
a b c
<dbl> <int> <dbl>
1 1 1 4
2 2 4 8
3 1 8 10
4 2 10 11

R Repeat values based on value in dataframe

I am trying to create a long data frame whose values are created from a lookup dataframe
df_lookup = data.frame(id = c(1,2,3), one = c(10,9,7), two = c(0,1,2), three = c(0,0,1))
df_lookup
#> id one two three
#> 1 1 10 0 0
#> 2 2 9 1 0
#> 3 3 7 2 1
The output I am looking for is a data frame with 30 rows, where first ten rows are all 1s, for the next 10 rows the values are nine 1s and one 2, for the last 10 rows the values are seven 1s, two 2s and one 3.
df_output
id bin
1: 1 1
2: 1 1
3: 1 1
4: 1 1
5: 1 1
6: 1 1
7: 1 1
8: 1 1
9: 1 1
10: 1 1
11: 2 1
12: 2 1
13: 2 1
14: 2 1
15: 2 1
16: 2 1
17: 2 1
18: 2 1
19: 2 1
20: 2 2
21: 3 1
22: 3 1
23: 3 1
24: 3 1
25: 3 1
26: 3 1
27: 3 1
28: 3 2
29: 3 2
30: 3 3
Based on some online search for similar questions like here
I was able to come up with the following code
df_lookup = data.frame(id = c(1,2,3), one = c(10,9,7), two = c(0,1,2), three = c(0,0,1))
col_names = c("one","two","three")
setDT(df_lookup)
df_output = data.frame()
# One pass per count column: for every id, repeat the column's position j
# as many times as the count stored in that column.
for (j in seq_along(col_names)) {
  temp_df = df_lookup[, .(rep(j, get(col_names[j]))), .(id)]
  # df_output is grown one piece at a time with rbind() on every iteration.
  df_output = rbind(df_output, temp_df)
}
names(df_output) = c("id","bin")
# Sort so that each id's bins appear in ascending order.
df_output = df_output[order(df_output$id, df_output$bin),]
While this solves the purpose, it can take some time to run when there are many 'id' or many 'df_lookup' tables which I need to loop through.
So wanted to check if there's any optimal/faster way to achieve 'df_output'
A data.table solution using melt() and rep()
library(data.table)
df_lookup = data.frame(id = c(1,2,3),
one = c(10,9,7),
two = c(0,1,2),
three = c(0,0,1))
dt <- data.table::as.data.table(df_lookup)
# into long format
dt_melt <- melt(dt, id.vars = "id")
dt_melt
#> id variable value
#> 1: 1 one 10
#> 2: 2 one 9
#> 3: 3 one 7
#> 4: 1 two 0
#> 5: 2 two 1
#> 6: 3 two 2
#> 7: 1 three 0
#> 8: 2 three 0
#> 9: 3 three 1
dt_exploded <- dt_melt[, rep(variable, value), by = id]
dt_exploded[, bin := data.table::fcase(V1 == "one", 1,
V1 == "two", 2,
V1 == "three", 3)][]
#> id V1 bin
#> 1: 1 one 1
#> 2: 1 one 1
#> 3: 1 one 1
#> 4: 1 one 1
#> 5: 1 one 1
#> 6: 1 one 1
#> 7: 1 one 1
#> 8: 1 one 1
#> 9: 1 one 1
#> 10: 1 one 1
#> 11: 2 one 1
#> 12: 2 one 1
#> 13: 2 one 1
#> 14: 2 one 1
#> 15: 2 one 1
#> 16: 2 one 1
#> 17: 2 one 1
#> 18: 2 one 1
#> 19: 2 one 1
#> 20: 2 two 2
#> 21: 3 one 1
#> 22: 3 one 1
#> 23: 3 one 1
#> 24: 3 one 1
#> 25: 3 one 1
#> 26: 3 one 1
#> 27: 3 one 1
#> 28: 3 two 2
#> 29: 3 two 2
#> 30: 3 three 3
#> id V1 bin
You could do the mapping of one to 1 at any time though
library(tidyverse)
df_lookup = data.frame(id = c(1,2,3), one = c(10,9,7), two = c(0,1,2), three = c(0,0,1))
df_lookup %>%
pivot_longer(-id) %>%
uncount(value) %>%
mutate(
bin = case_when(name == "one" ~ 1L,
name == "two" ~ 2L,
name == "three" ~ 3L),
.keep = "unused"
)
#> # A tibble: 30 x 2
#> id bin
#> <dbl> <int>
#> 1 1 1
#> 2 1 1
#> 3 1 1
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 1
#> # ... with 20 more rows
#> # i Use `print(n = ...)` to see more rows
Created on 2022-08-19 with reprex v2.0.2

Group variable by "n" consecutive integers in data.table

library(data.table)
DT <- data.table(var = 1:100)
I want to create a second variable, group that groups the values in var by n consecutive integers. So if n is equal to 1, it would return the same column as var. If n=2, it would return me:
var group
1: 1 1
2: 2 1
3: 3 2
4: 4 2
5: 5 3
6: 6 3
If n=3, it would return me:
var group
1: 1 1
2: 2 1
3: 3 1
4: 4 2
5: 5 2
6: 6 2
and so on. I would like to do this as flexibly as possible.
Note that there could be repeated values:
var group
1: 1 1
2: 1 1
3: 2 1
4: 3 2
5: 3 2
6: 4 2
Here, group corresponds to n=2. Thank you!
I think we can use findInterval for this:
DT <- data.table(var = c(1L, 1:10))
n <- 2
DT[, group := findInterval(var, seq(min(var), max(var) + n, by = n))]
# var group
# <int> <int>
# 1: 1 1
# 2: 1 1
# 3: 2 1
# 4: 3 2
# 5: 4 2
# 6: 5 3
# 7: 6 3
# 8: 7 4
# 9: 8 4
# 10: 9 5
# 11: 10 5
n <- 3
DT[, group := findInterval(var, seq(min(var), max(var) + n, by = n))]
# var group
# <int> <int>
# 1: 1 1
# 2: 1 1
# 3: 2 1
# 4: 3 1
# 5: 4 2
# 6: 5 2
# 7: 6 2
# 8: 7 3
# 9: 8 3
# 10: 9 3
# 11: 10 4
(The +n in the call to seq is so that we always have a little more than we need; if we did just seq(min(.),max(.),by=n), it would be possible the highest values of var would be outside of the sequence. One could also do c(seq(min(.), max(.), by=n), Inf) for the same effect.)

Count length of sequential consecutive values per group in R

I have a dataset with consecutive values and I would like to know the count of how many times each length occurs.
More specifically, I want to find out how many id's have a sequence running from 1:2, from 1:3, from 1:4 etc.
Only sequences starting from 1 are of interest.
In this example, id1 would have a "full" sequence running from 1:3 (as the number 4 is missing), id2 has a sequence running from 1:5, id3 has a sequence running from 1:6, id4 is not counted since it does not start with a value of 1 and id 5 has a sequence running from 1:3.
So we end up with two sequences until 3, one until 5 and one until 6.
Is there a clever way to calculate this, without resorting to inefficient loops?
Example data:
data <- data.table( id = c(1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5),
value = c(1,2,3,5,1,2,3,4,5,10,11,1,2,3,4,5,6,2,3,4,5,6,7,8,1,2,3,7))
> data
id value
1: 1 1
2: 1 2
3: 1 3
4: 1 5
5: 2 1
6: 2 2
7: 2 3
8: 2 4
9: 2 5
10: 2 10
11: 2 11
12: 3 1
13: 3 2
14: 3 3
15: 3 4
16: 3 5
17: 3 6
18: 4 2
19: 4 3
20: 4 4
21: 4 5
22: 4 6
23: 4 7
24: 4 8
25: 5 1
26: 5 2
27: 5 3
28: 5 7
id value
# len0 numbers the runs of consecutive values within each id: a new run starts
# at the first row and wherever the step from the previous value is not 1.
# (Run-length-encoding the logical `diff(value) == 1L` instead would split a
# later run such as 10, 11 into separate pieces.)
out <- data[, len0 := cumsum(c(TRUE, diff(value) != 1L)), by = .(id) ][
  , .(value1 = first(value), len = .N), by = .(id, len0) ]
out
# id len0 value1 len
# <num> <int> <num> <int>
# 1: 1 1 1 3
# 2: 1 2 5 1
# 3: 2 1 1 5
# 4: 2 2 10 2
# 5: 3 1 1 6
# 6: 4 1 2 7
# 7: 5 1 1 3
# 8: 5 2 7 1
Walk-through:
within each id, the len0 is created to identify the increase-by-1 steps
within id,len0, summarize with the first value (in case you only want those starting at 1, see below) and the length of the run
If you just want to know those whose sequences begin at one, filter on value1:
out[ value1 == 1L, ]
# id len0 value1 len
# <num> <int> <num> <int>
# 1: 1 1 1 3
# 2: 2 1 1 5
# 3: 3 1 1 6
# 4: 5 1 1 3
(I think you only need id and len at this point.)
Here is another option:
data[rowid(id)==value, max(value), id]
output:
id V1
1: 1 3
2: 2 5
3: 3 6
4: 5 3
library(data.table)
dt <- data.table( id = c(1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5),
value = c(1,2,3,5,1,2,3,4,5,10,11,1,2,3,4,5,6,2,3,4,5,6,7,8,1,2,3,7))
dt[, n := seq_len(.N) - value, by = id]
res <- dt[n == 0, .SD[value == max(value)], by = id][, n := NULL]
head(res)
#> id value
#> 1: 1 3
#> 2: 2 5
#> 3: 3 6
#> 4: 5 3
Created on 2021-02-04 by the reprex package (v1.0.0)
One option utilizing dplyr might be:
data %>%
group_by(id) %>%
mutate(rleid = with(rle(c(0, diff(value)) <= 1), rep(seq_along(values), lengths))) %>%
filter(rleid == 1 & min(value) == 1) %>%
summarise(value = paste(value, collapse = "")) %>%
group_by(value) %>%
summarise(n = n(),
ids = toString(id))
value n ids
<chr> <int> <chr>
1 123 2 1, 5
2 12345 1 2
3 123456 1 3

R find intervals in data.table

I want to add a new column with intervals or breakpoints by group. As an example:
This is my data.table:
x <- data.table(a = c(1:8,1:8), b = c(rep("A",8),rep("B",8)))
I already have the breakpoints (row indices):
pos <- data.table(b = c("A","A","B","B"), bp = c(3,5,2,4))
Here i can find the interval for group "A" with:
findInterval(1:nrow(x[b=="A"]), pos[b=="A"]$bp)
How can I do this for each group, in this case "A" and "B"?
An option is to split the datasets by 'b' column, use Map to loop over the corresponding lists, and apply findInterval
Map(function(u, v) findInterval(seq_len(nrow(u)), v$bp),
split(x, x$b), split(pos, pos$b))
#$A
#[1] 0 0 1 1 2 2 2 2
#$B
#[1] 0 1 1 2 2 2 2 2
or another option is to group by 'b' from 'x', then use findInterval by subsetting the 'bp' from 'pos' by filtering with a logical condition created based on .BY
# .BY is a list holding the current group's value of each `by` column, so pull
# out the scalar .BY$b before comparing it with pos$b.
x[, findInterval(seq_len(.N), pos$bp[pos$b == .BY$b]), b]
# b V1
# 1: A 0
# 2: A 0
# 3: A 1
# 4: A 1
# 5: A 2
# 6: A 2
# 7: A 2
# 8: A 2
# 9: B 0
#10: B 1
#11: B 1
#12: B 2
#13: B 2
#14: B 2
#15: B 2
#16: B 2
Another option using rolling join in data.table:
pos[, ri := rowid(b)]
x[, intvl := fcoalesce(pos[x, on=.(b, bp=a), roll=Inf, ri], 0L)]
output:
a b intvl
1: 1 A 0
2: 2 A 0
3: 3 A 1
4: 4 A 1
5: 5 A 2
6: 6 A 2
7: 7 A 2
8: 8 A 2
9: 1 B 0
10: 2 B 1
11: 3 B 1
12: 4 B 2
13: 5 B 2
14: 6 B 2
15: 7 B 2
16: 8 B 2
We can nest the pos data into list by b and join with x and use findInterval to get corresponding groups.
library(dplyr)
pos %>%
tidyr::nest(data = bp) %>%
right_join(x, by = 'b') %>%
group_by(b) %>%
mutate(interval = findInterval(a, data[[1]][[1]])) %>%
select(-data)
# b a interval
# <chr> <int> <int>
# 1 A 1 0
# 2 A 2 0
# 3 A 3 1
# 4 A 4 1
# 5 A 5 2
# 6 A 6 2
# 7 A 7 2
# 8 A 8 2
# 9 B 1 0
#10 B 2 1
#11 B 3 1
#12 B 4 2
#13 B 5 2
#14 B 6 2
#15 B 7 2
#16 B 8 2

Resources