R Repeat values based on value in dataframe - r

I am trying to create a long data frame whose values are created from a lookup dataframe
# Lookup table: for every id, how many rows of each bin ("one", "two",
# "three") should be generated.
df_lookup <- data.frame(
  id = c(1, 2, 3),
  one = c(10, 9, 7),
  two = c(0, 1, 2),
  three = c(0, 0, 1)
)
df_lookup
#> id one two three
#> 1 1 10 0 0
#> 2 2 9 1 0
#> 3 3 7 2 1
The output I am looking for is a data frame with 30 rows, where first ten rows are all 1s, for the next 10 rows the values are nine 1s and one 2, for the last 10 rows the values are seven 1s, two 2s and one 3.
df_output
id bin
1: 1 1
2: 1 1
3: 1 1
4: 1 1
5: 1 1
6: 1 1
7: 1 1
8: 1 1
9: 1 1
10: 1 1
11: 2 1
12: 2 1
13: 2 1
14: 2 1
15: 2 1
16: 2 1
17: 2 1
18: 2 1
19: 2 1
20: 2 2
21: 3 1
22: 3 1
23: 3 1
24: 3 1
25: 3 1
26: 3 1
27: 3 1
28: 3 2
29: 3 2
30: 3 3
Based on some online search for similar questions like here
I was able to come up with the following code
# setDT() and the DT[i, j, by] syntax below come from data.table.
library(data.table)

df_lookup <- data.frame(
  id = c(1, 2, 3),
  one = c(10, 9, 7),
  two = c(0, 1, 2),
  three = c(0, 0, 1)
)
# Count columns, in bin order: position j in this vector is the bin value.
col_names <- c("one", "two", "three")
setDT(df_lookup)
df_output <- data.frame()
# One pass per count column: for every id, repeat the bin number j as many
# times as that column says. seq_along() is safe even if col_names is empty.
# NOTE: the result is still grown with rbind() on every pass, which is the
# slow part this question is asking about.
for (j in seq_along(col_names)) {
  temp_df <- df_lookup[, .(rep(j, get(col_names[j]))), by = .(id)]
  df_output <- rbind(df_output, temp_df)
}
names(df_output) <- c("id", "bin")
# Sort so that each id's rows are together, lowest bin first.
df_output <- df_output[order(df_output$id, df_output$bin), ]
While this solves the purpose, it can take some time to run when there are many 'id' or many 'df_lookup' tables which I need to loop through.
So I wanted to check whether there is a faster, more efficient way to build 'df_output'.

A data.table solution using melt() and rep()
library(data.table)
df_lookup <- data.frame(
  id = c(1, 2, 3),
  one = c(10, 9, 7),
  two = c(0, 1, 2),
  three = c(0, 0, 1)
)
dt <- as.data.table(df_lookup)
# Reshape wide -> long: one row per (id, count column), with the column name
# in `variable` and the count in `value`.
dt_melt <- melt(dt, id.vars = "id")
dt_melt
#> id variable value
#> 1: 1 one 10
#> 2: 2 one 9
#> 3: 3 one 7
#> 4: 1 two 0
#> 5: 2 two 1
#> 6: 3 two 2
#> 7: 1 three 0
#> 8: 2 three 0
#> 9: 3 three 1
# Within each id, repeat every bin label `value` times; the unnamed result of
# j is stored in column V1.
dt_exploded <- dt_melt[, rep(variable, times = value), by = id]
# Map the label to its numeric bin and print the updated table.
dt_exploded[, bin := data.table::fcase(
  V1 == "one", 1,
  V1 == "two", 2,
  V1 == "three", 3
)][]
#> id V1 bin
#> 1: 1 one 1
#> 2: 1 one 1
#> 3: 1 one 1
#> 4: 1 one 1
#> 5: 1 one 1
#> 6: 1 one 1
#> 7: 1 one 1
#> 8: 1 one 1
#> 9: 1 one 1
#> 10: 1 one 1
#> 11: 2 one 1
#> 12: 2 one 1
#> 13: 2 one 1
#> 14: 2 one 1
#> 15: 2 one 1
#> 16: 2 one 1
#> 17: 2 one 1
#> 18: 2 one 1
#> 19: 2 one 1
#> 20: 2 two 2
#> 21: 3 one 1
#> 22: 3 one 1
#> 23: 3 one 1
#> 24: 3 one 1
#> 25: 3 one 1
#> 26: 3 one 1
#> 27: 3 one 1
#> 28: 3 two 2
#> 29: 3 two 2
#> 30: 3 three 3
#> id V1 bin
You could do the mapping of one to 1 at any time though

library(tidyverse)
df_lookup <- data.frame(
  id = c(1, 2, 3),
  one = c(10, 9, 7),
  two = c(0, 1, 2),
  three = c(0, 0, 1)
)
df_lookup %>%
  # long format: one row per (id, count column)
  pivot_longer(cols = -id, names_to = "name", values_to = "value") %>%
  # repeat every row `value` times (rows with a count of 0 disappear)
  uncount(weights = value) %>%
  # turn the column label into the integer bin; .keep = "unused" drops `name`
  mutate(
    bin = case_when(
      name == "one" ~ 1L,
      name == "two" ~ 2L,
      name == "three" ~ 3L
    ),
    .keep = "unused"
  )
#> # A tibble: 30 x 2
#> id bin
#> <dbl> <int>
#> 1 1 1
#> 2 1 1
#> 3 1 1
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 1
#> 9 1 1
#> 10 1 1
#> # ... with 20 more rows
#> # i Use `print(n = ...)` to see more rows
Created on 2022-08-19 with reprex v2.0.2

Related

R data.table group by continuous values

I need some help with grouping data by continuous values.
If I have this data.table
# Example data: `a` holds the values to group by consecutive runs; `b` is an
# integer 1..10 and `c` is the same sequence shifted up by one (a double).
dt <- data.table::data.table(
  a = c(1, 1, 1, 2, 2, 2, 2, 1, 1, 2),
  b = 1:10,
  c = 1:10 + 1
)
a b c
1: 1 1 2
2: 1 2 3
3: 1 3 4
4: 2 4 5
5: 2 5 6
6: 2 6 7
7: 2 7 8
8: 1 8 9
9: 1 9 10
10: 2 10 11
I need one group for every run of consecutive equal values in column a. For each group I need the first (which is also the smallest) value of column b and the last (which is also the largest) value of column c.
Like this:
a b c
1: 1 1 4
2: 2 4 8
3: 1 8 10
4: 2 10 11
Thank you very much for your help. I have not been able to solve this on my own.
Probably we can try
> dt[, .(a = first(a), b = first(b), c = last(c)), by = rleid(a)][, -1]
a b c
1: 1 1 4
2: 2 4 8
3: 1 8 10
4: 2 10 11
An option with dplyr
library(dplyr)
dt %>%
  # a new group starts at the first row and wherever `a` differs from the
  # value in the previous row
  group_by(grp = cumsum(c(TRUE, a[-1] != a[-length(a)]))) %>%
  # first a and b of each run, last c of each run
  summarise(a = first(a), b = first(b), c = last(c)) %>%
  select(-grp)
-output
# A tibble: 4 × 3
a b c
<dbl> <int> <dbl>
1 1 1 4
2 2 4 8
3 1 8 10
4 2 10 11

Group variable by "n" consecutive integers in data.table

library(data.table)
DT <- data.table(var = 1:100)
I want to create a second variable, group that groups the values in var by n consecutive integers. So if n is equal to 1, it would return the same column as var. If n=2, it would return me:
var group
1: 1 1
2: 2 1
3: 3 2
4: 4 2
5: 5 3
6: 6 3
If n=3, it would return me:
var group
1: 1 1
2: 2 1
3: 3 1
4: 4 2
5: 5 2
6: 6 2
and so on. I would like to do this as flexibly as possible.
Note that there could be repeated values:
var group
1: 1 1
2: 1 1
3: 2 1
4: 3 2
5: 3 2
6: 4 2
Here, group corresponds to n=2. Thank you!
I think we can use findInterval for this:
DT <- data.table(var = c(1L, 1:10))
n <- 2
# Break points every n units starting at the smallest value; the group of a
# value is the number of break points that are <= it.
DT[, group := findInterval(var, seq(from = min(var), to = max(var) + n, by = n))]
# var group
# <int> <int>
# 1: 1 1
# 2: 1 1
# 3: 2 1
# 4: 3 2
# 5: 4 2
# 6: 5 3
# 7: 6 3
# 8: 7 4
# 9: 8 4
# 10: 9 5
# 11: 10 5
n <- 3
# Same grouping as before, now with break points every 3 units.
DT[, group := findInterval(var, seq(from = min(var), to = max(var) + n, by = n))]
# var group
# <int> <int>
# 1: 1 1
# 2: 1 1
# 3: 2 1
# 4: 3 1
# 5: 4 2
# 6: 5 2
# 7: 6 2
# 8: 7 3
# 9: 8 3
# 10: 9 3
# 11: 10 4
(The +n in the call to seq is so that we always have a little more than we need; if we did just seq(min(.),max(.),by=n), it would be possible the highest values of var would be outside of the sequence. One could also do c(seq(min(.), max(.), by=n), Inf) for the same effect.)

Counter based on ID and value in a column

I have a dataframe that contains an ID and Type column. I want a counter that if the Type is "T" then the counter in the next row would be counter + 1 for every ID. Basically, the counter is the Output_column in this example.
# Example input: six rows for ID 1, two for ID 3 and four for ID 4.
ID <- rep(c(1, 3, 4), times = c(6, 2, 4))
Type <- c(
  "A", "A", "T", "A", "A", "A",
  "A", "A",
  "T", "A", "T", "A"
)
# Expected counter for each row.
Output_Column <- c(
  1, 1, 1, 2, 2, 2,
  1, 1,
  1, 2, 2, 3
)
ID Type Output_Column
1 1 A 1
2 1 A 1
3 1 T 1
4 1 A 2
5 1 A 2
6 1 A 2
7 3 A 1
8 3 A 1
9 4 T 1
10 4 A 2
11 4 T 2
12 4 A 3
d <- data.frame(ID,Type, Output_Column)
baseR solution
# Per ID: shift Type down one row (dropping the last element and putting a
# "T" in front), then count the "T"s seen so far. The leading "T" makes every
# ID start at 1. ave() on a character vector returns character, hence the
# as.numeric().
output_col <- as.numeric(
  ave(
    Type,
    ID,
    FUN = function(x) cumsum(c("T", head(x, -1)) == "T")
  )
)
output_col
[1] 1 1 1 2 2 2 1 1 1 2 2 3
Here's data.table version :
library(data.table)
# Convert d to a data.table by reference.
setDT(d)
# Per ID: running count of "T" rows plus one, lagged by one row so that the
# increment only takes effect on the row after a "T"; the first row gets 1.
d[, res := shift(cumsum(Type == "T") + 1, n = 1L, fill = 1, type = "lag"), by = ID]
d
# ID Type Output_Column res
# 1: 1 A 1 1
# 2: 1 A 1 1
# 3: 1 T 1 1
# 4: 1 A 2 2
# 5: 1 A 2 2
# 6: 1 A 2 2
# 7: 3 A 1 1
# 8: 3 A 1 1
# 9: 4 T 1 1
#10: 4 A 2 2
#11: 4 T 2 2
#12: 4 A 3 3
Here is a way to achieve it using group_by, lag, and cumsum
library(dplyr)
d %>%
  # every ID gets its own independent counter
  group_by(ID) %>%
  mutate(
    # A row adds 1 to the running total when the previous row of the same ID
    # is "T". default = "T" treats the first row of each ID that way too, so
    # the counter starts at 1. as.numeric() keeps the column a double.
    output_column_calculated =
      cumsum(as.numeric(lag(Type, default = "T") == "T"))
  ) %>%
  ungroup()
#> # A tibble: 12 x 4
#> ID Type Output_Column output_column_calculated
#> <dbl> <chr> <dbl> <dbl>
#> 1 1 A 1 1
#> 2 1 A 1 1
#> 3 1 T 1 1
#> 4 1 A 2 2
#> 5 1 A 2 2
#> 6 1 A 2 2
#> 7 3 A 1 1
#> 8 3 A 1 1
#> 9 4 T 1 1
#> 10 4 A 2 2
#> 11 4 T 2 2
#> 12 4 A 3 3
Created on 2021-04-26 by the reprex package (v2.0.0)

Count length of sequential consecutive values per group in R

I have a dataset with consecutive values and I would like to know how many times each sequence length occurs.
More specifically, I want to find out how many id's have a sequence running from 1:2, from 1:3, from 1:4 etc.
Only sequences starting from 1 are of interest.
In this example, id1 would have a "full" sequence running from 1:3 (as the number 4 is missing), id2 has a sequence running from 1:5, id3 has a sequence running from 1:6, id4 is not counted since it does not start with a value of 1 and id 5 has a sequence running from 1:3.
So we end up with two sequences until 3, one until 5 and one until 6.
Is there a clever way to calculate this, without resorting to inefficient loops?
Example data:
# Example data: ids 1-5 with 4, 7, 6, 7 and 4 rows respectively.
data <- data.table(
  id = rep(c(1, 2, 3, 4, 5), times = c(4, 7, 6, 7, 4)),
  value = c(
    1, 2, 3, 5,
    1, 2, 3, 4, 5, 10, 11,
    1, 2, 3, 4, 5, 6,
    2, 3, 4, 5, 6, 7, 8,
    1, 2, 3, 7
  )
)
> data
id value
1: 1 1
2: 1 2
3: 1 3
4: 1 5
5: 2 1
6: 2 2
7: 2 3
8: 2 4
9: 2 5
10: 2 10
11: 2 11
12: 3 1
13: 3 2
14: 3 3
15: 3 4
16: 3 5
17: 3 6
18: 4 2
19: 4 3
20: 4 4
21: 4 5
22: 4 6
23: 4 7
24: 4 8
25: 5 1
26: 5 2
27: 5 3
28: 5 7
id value
# Step 1 (adds column len0 to `data` by reference): number the runs of
# increase-by-1 values within each id. A new run starts at the first row and
# wherever the step from the previous value is not exactly +1; cumsum() over
# those "break" flags gives every row of one unbroken run the same number.
# (rleid() on the "is a +1 step" flag would not do this: after a break every
# row opens a new group, so e.g. 10, 11 ends up as two runs of length 1.)
# Step 2: one row per (id, run) with the run's first value and its length.
out <- data[, len0 := cumsum(c(TRUE, diff(value) != 1)), by = .(id)][
  , .(value1 = first(value), len = .N), by = .(id, len0)]
out
# id len0 value1 len
# <num> <int> <num> <int>
# 1: 1 1 1 3
# 2: 1 2 5 1
# 3: 2 1 1 5
# 4: 2 2 10 2
# 5: 3 1 1 6
# 6: 4 1 2 7
# 7: 5 1 1 3
# 8: 5 2 7 1
Walk-through:
within each id, the len0 is created to identify the increase-by-1 steps
within id,len0, summarize with the first value (in case you only want those starting at 1, see below) and the length of the run
If you just want to know those whose sequences begin at one, filter on value1:
out[ value1 == 1L, ]
# id len0 value1 len
# <num> <int> <num> <int>
# 1: 1 1 1 3
# 2: 2 1 1 5
# 3: 3 1 1 6
# 4: 5 1 1 3
(I think you only need id and len at this point.)
Here is another option:
# Keep the rows whose value equals their position within the id, then take
# the largest such value per id.
data[rowid(id) == value, max(value), by = id]
output:
id V1
1: 1 3
2: 2 5
3: 3 6
4: 5 3
library(data.table)
dt <- data.table(
  id = rep(c(1, 2, 3, 4, 5), times = c(4, 7, 6, 7, 4)),
  value = c(
    1, 2, 3, 5,
    1, 2, 3, 4, 5, 10, 11,
    1, 2, 3, 4, 5, 6,
    2, 3, 4, 5, 6, 7, 8,
    1, 2, 3, 7
  )
)
# n = row position within the id minus the value; it is 0 exactly where the
# value equals its position.
dt[, n := seq_len(.N) - value, by = id]
# Among those rows, keep the one(s) with the largest value per id.
res <- dt[n == 0][, .SD[value == max(value)], by = id]
# The helper column is not needed in the result.
res[, n := NULL]
head(res)
#> id value
#> 1: 1 3
#> 2: 2 5
#> 3: 3 6
#> 4: 5 3
Created on 2021-02-04 by the reprex package (v1.0.0)
One option utilizing dplyr might be:
data %>%
  group_by(id) %>%
  # Number the stretches of rows by whether the step from the previous value
  # is at most 1 (the first row counts as a step of 0).
  mutate(
    run_id = with(
      rle(c(0, diff(value)) <= 1),
      rep(seq_along(values), lengths)
    )
  ) %>%
  # keep only the first stretch, and only for ids whose smallest value is 1
  filter(run_id == 1 & min(value) == 1) %>%
  # spell each id's sequence out as one string, e.g. "123"
  summarise(value = paste(value, collapse = "")) %>%
  # count how many ids share each sequence and list them
  group_by(value) %>%
  summarise(
    n = n(),
    ids = toString(id)
  )
value n ids
<chr> <int> <chr>
1 123 2 1, 5
2 12345 1 2
3 123456 1 3

data.table fill missing values from other rows by group

# have
> aDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,NA,NA,1,4,3,NA,NA,4,NA,2,NA))
> aDT
colA colB
1: 1 4
2: 1 NA
3: 1 NA
4: 1 1
5: 2 4
6: 2 3
7: 2 NA
8: 2 NA
9: 3 4
10: 3 NA
11: 3 2
12: 3 NA
# want
> bDT <- data.table(colA = c(1,1,1,1,2,2,2,2,3,3,3,3), colB = c(4,1,1,1,4,3,3,3,4,2,2,2))
> bDT
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Would like to fill missing values according to the algorithm below:
within each group ('colA'),
use the value from the row below; if that is also NA, keep going down until the last row within that group
if all NAs in rows below, look at rows above (go up 1 row at a time)
if all NAs, then NA
Since the dataset is quite large, algorithmic efficiency is part of consideration. Not sure if there's any package for this type of operation already. How to do it?
With data.table and zoo:
library(data.table)
library(zoo)
# Pass 1 - next observation carried backward: fill each NA from the nearest
# non-NA value below it within the group. `:=` updates dt by reference, so no
# re-assignment is needed.
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA]
# Pass 2 - last observation carried forward: fill what is left (NAs with no
# value below them) from the nearest value above. na.locf0() always returns a
# vector of the same length as its input, so a group that is entirely NA
# stays NA instead of breaking the := assignment.
dt[, colB := na.locf0(colB), by = colA][]
Or in a single chain:
# Fill from below first, then from above; na.locf0() keeps the length in both
# passes, so an all-NA group simply stays NA.
dt[, colB := na.locf0(colB, fromLast = TRUE), by = colA][
  , colB := na.locf0(colB), by = colA][]
Both return:
colA colB
1: 1 4
2: 1 1
3: 1 1
4: 1 1
5: 2 4
6: 2 3
7: 2 3
8: 2 3
9: 3 4
10: 3 2
11: 3 2
12: 3 2
Data:
text <- "colA colB
1 4
1 NA
1 NA
1 1
2 4
2 3
2 NA
2 NA
3 4
3 NA
3 2
3 NA"
dt <- fread(input = text, stringsAsFactors = FALSE)
Here is one way using tidyverse and zoo::na.locf:
library(tidyverse)
library(zoo)
df %>%
  group_by(colA) %>%
  arrange(colA) %>%
  # fill from below first (na.rm = FALSE keeps NAs that cannot be filled, so
  # the length is unchanged) ...
  mutate(colB = na.locf(colB, na.rm = FALSE, fromLast = TRUE)) %>%
  # ... then fill whatever is left from above
  mutate(colB = na.locf(colB, na.rm = FALSE))
## A tibble: 12 x 2
## Groups: colA [3]
# colA colB
# <dbl> <dbl>
# 1 1.00 4.00
# 2 1.00 1.00
# 3 1.00 1.00
# 4 1.00 1.00
# 5 2.00 4.00
# 6 2.00 3.00
# 7 2.00 3.00
# 8 2.00 3.00
# 9 3.00 4.00
#10 3.00 2.00
#11 3.00 2.00
#12 3.00 2.00
Or the data.table way:
library(data.table)
# Inner call fills from below, outer call fills the rest from above;
# na.rm = FALSE keeps unfillable NAs so each group keeps its length.
dt[
  , .(na.locf(na.locf(colB, na.rm = FALSE, fromLast = TRUE), na.rm = FALSE)),
  by = .(colA)
]
# colA V1
# 1: 1 4
# 2: 1 1
# 3: 1 1
# 4: 1 1
# 5: 2 4
# 6: 2 3
# 7: 2 3
# 8: 2 3
# 9: 3 4
#10: 3 2
#11: 3 2
#12: 3 2
The key in both cases is to apply na.locf twice: First to replace NAs from the bottom, then replace the remaining NAs from the top.
Sample data
# As data.frame
df <- data.frame(
  colA = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
  colB = c(4, NA, NA, 1, 4, 3, NA, NA, 4, NA, 2, NA)
)
# As data.table
dt <- data.table(
  colA = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
  colB = c(4, NA, NA, 1, 4, 3, NA, NA, 4, NA, 2, NA)
)
library(tidyverse)
aDT %>%
  group_by(colA) %>%
  # "updown": fill each NA from the value below first, then fill what is left
  # from the value above, all within the colA group
  fill(colB, .direction = "updown")
# A tibble: 12 x 2
# Groups: colA [3]
colA colB
<dbl> <dbl>
1 1 4
2 1 1
3 1 1
4 1 1
5 2 4
6 2 3
7 2 3
8 2 3
9 3 4
10 3 2
11 3 2
12 3 2

Resources