extend data with data.table - r

I have this data.table:
library(data.table)
dt <- data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)
I would like to have this output:
data.table(
id = c(1, 2),
m1 = c("A", "B"),
m2 = c("A", "B"),
m3 = c("A", "B"),
m4 = c("B", "B"),
m5 = c("B", "B"),
m6 = c("B", "B"),
m7 = c("B", "B"),
m8 = c("A", "B"),
m9 = c("A", "B"),
m10 = c("A", "B"),
m11 = c("A", "A"),
m12 = c("A", "A")
)
Those who do sequence analysis may have recognized that I'm trying to do what seqformat in the TraMineR package does, but with higher performance thanks to data.table.
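For reference, the TraMineR call this is meant to replace would look roughly like the sketch below (argument names from memory, so check them against your TraMineR version):
library(TraMineR)
seqformat(dt, from = "SPELL", to = "STS",
id = "id", begin = "begin", end = "end",
status = "state", process = FALSE)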

One option with data.table would be to melt the dataset after creating a sequence column; then, grouped by 'i1', 'id' and 'state', expand to the positions from the first to the last 'value', and dcast the result from 'long' to 'wide':
dt1 <- melt(dt[, i1 := seq_len(.N)], id.vars = c("i1", "id", "state"))[,
paste0("m", seq(first(value), last(value))), .(i1, id, state)]
dcast(dt1, id ~ V1, value.var = "state")[]
# id m1 m10 m11 m12 m2 m3 m4 m5 m6 m7 m8 m9
#1: 1 A A A A A A B B B B A A
#2: 2 B B A A B B B B B B B B
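Note that dcast sorts the new columns lexicographically (m1, m10, m11, ...). If you want them in natural order, one extra step with setcolorder (assuming 12 periods, as here) fixes it:
res <- dcast(dt1, id ~ V1, value.var = "state")
setcolorder(res, c("id", paste0("m", 1:12)))
res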

A solution using the tidyverse.
library(tidyverse)
library(data.table)
dat <- data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)
dat2 <- dat %>%
mutate(Index = map2(begin, end, `:`)) %>%
unnest() %>%
mutate(Index = str_c("m", Index)) %>%
select(id, state, Index) %>%
spread(Index, state) %>%
select(id, str_c("m", 1:(ncol(.) - 1)))
dat2
# id m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12
# 1 1 A A A B B B B A A A A A
# 2 2 B B B B B B B B B B A A
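Side note: unnest() without a cols argument and spread() are superseded in current tidyr; a sketch of the same pipeline with the newer verbs (assuming tidyr >= 1.0) would be:
dat %>%
mutate(Index = map2(begin, end, `:`)) %>%
unnest(Index) %>%
mutate(Index = str_c("m", Index)) %>%
select(id, state, Index) %>%
pivot_wider(names_from = Index, values_from = state)
pivot_wider() keeps the columns in order of first appearance, so the final select() to reorder them is no longer needed.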

An alternative solution:
dt[, unlist(Map(`:`, begin, end)), by = .(id, state)
][, dcast(.SD, id ~ sprintf("m%02d", V1), value.var = "state")]
which gives:
id m01 m02 m03 m04 m05 m06 m07 m08 m09 m10 m11 m12
1: 1 A A A B B B B A A A A A
2: 2 B B B B B B B B B B A A
It is possibly better to keep the data in long format. Long format is often easier to work with in R later on in your data processing/analysis.
You could achieve that with just:
dt[, unlist(Map(`:`, begin, end)), by = .(id, state)][order(id, V1)]
which gives:
id state V1
1: 1 A 1
2: 1 A 2
3: 1 A 3
4: 1 B 4
5: 1 B 5
6: 1 B 6
7: 1 B 7
8: 1 A 8
9: 1 A 9
10: 1 A 10
11: 1 A 11
12: 1 A 12
13: 2 B 1
14: 2 B 2
15: 2 B 3
16: 2 B 4
17: 2 B 5
18: 2 B 6
19: 2 B 7
20: 2 B 8
21: 2 B 9
22: 2 B 10
23: 2 A 11
24: 2 A 12
(the [order(id, V1)] part isn't strictly necessary)
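Keeping it long also makes later summaries trivial. For example, the number of periods each id spends in each state:
long <- dt[, unlist(Map(`:`, begin, end)), by = .(id, state)]
long[, .N, by = .(id, state)]
#    id state  N
# 1:  1     A  8
# 2:  1     B  4
# 3:  2     B 10
# 4:  2     A  2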
Used data:
dt <- data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)

Related

Generate table with count of all combinations by group in an efficient way

I have the following dataset example:
df <- tibble(group = c(rep(1, 6), rep(2, 6)),
class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))
df
I would like to build a table of all combinations between class1 and class2, by group, in a fast way.
I tried the code below, but it is painfully slow for my data (which is huge: > 10 million rows). It takes more than 30 minutes.
output <- df %>% table() %>% as.data.table()
Desired output:
output <- tibble(group = c(rep(1, 9), rep(2, 9)),
class1 = c(rep("A", 3), rep("B", 3), rep("C", 3),
rep("A", 3), rep("B", 3), rep("C", 3)),
class2 = rep(c("A", "B", "C"), 6),
N = c(1, 1, 0, 0, 2, 1, 0, 1, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1))
output
Thanks for any help
Does this work? Counting the observed combinations first and then completing the missing ones preserves the duplicate counts:
library(dplyr)
library(tidyr)
df %>%
count(group, class1, class2, name = "N") %>%
complete(group, class1, class2, fill = list(N = 0L))
# A tibble: 18 × 4
group class1 class2 N
<dbl> <chr> <chr> <int>
1 1 A A 1
2 1 A B 1
3 1 A C 0
4 1 B A 0
5 1 B B 2
6 1 B C 1
7 1 C A 0
8 1 C B 1
9 1 C C 0
10 2 A A 0
11 2 A B 0
12 2 A C 0
13 2 B A 1
14 2 B B 2
15 2 B C 0
16 2 C A 1
17 2 C B 1
18 2 C C 1
This can be a bit faster than table:
library(data.table)
df <- data.table(group = c(rep(1, 6), rep(2, 6)),
class1 = c("A", "A", "B", "B", "B", "C", "B", "B", "B", "C", "C", "C"),
class2 = c("A", "B", "B", "B", "C", "B", "B", "B", "A", "C", "A", "B"))
u <- lapply(df, function(x) sort(unique(x)))  # sorted unique values per column
m <- rev(cumprod(c(1, rev(lengths(u)))))      # mixed-radix place values; m[1] = total number of combinations
do.call(CJ, u)[                               # all combinations, in CJ's sort order
, N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])  # encode each row as a single integer index, then count
][]
#> group class1 class2 N
#> 1: 1 A A 1
#> 2: 1 A B 1
#> 3: 1 A C 0
#> 4: 1 B A 0
#> 5: 1 B B 2
#> 6: 1 B C 1
#> 7: 1 C A 0
#> 8: 1 C B 1
#> 9: 1 C C 0
#> 10: 2 A A 0
#> 11: 2 A B 0
#> 12: 2 A C 0
#> 13: 2 B A 1
#> 14: 2 B B 2
#> 15: 2 B C 0
#> 16: 2 C A 1
#> 17: 2 C B 1
#> 18: 2 C C 1
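As a quick sanity check of the encoding on the toy data: lengths(u) is c(2, 3, 3), so m is c(18, 9, 3, 1), and the row (group = 1, class1 = "B", class2 = "C") maps to (1-1)*9 + (2-1)*3 + (3-1)*1 + 1 = 6, which is exactly the position of (1, B, C) in the CJ() output above.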
Timing a much larger data set:
library(stringi)
df <- data.table(
group = sample(20, 2e7, TRUE),
class1 = stri_rand_strings(2e7, 2, "[A-Za-z]"),
class2 = stri_rand_strings(2e7, 2, "[A-Za-z]")
)
system.time({
u <- lapply(df, function(x) sort(unique(x)))
m <- rev(cumprod(c(1, rev(lengths(u)))))
output <- do.call(CJ, u)[
, N := tabulate(rowSums(mapply(function(i) (match(df[[i]], u[[i]]) - 1)*m[i + 1], 1:ncol(df))) + 1, m[1])
]
})
#> user system elapsed
#> 3.98 0.68 4.41
Compared to table:
system.time({output <- setorder(as.data.table(table(df)))})
#> user system elapsed
#> 28.40 3.64 13.77
Even with 20M rows, table finishes within seconds. My guess is that the > 30 minute timing experienced by the OP is due to a large number of distinct combinations of group, class1, and class2.
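The cost is driven by the number of distinct levels rather than the row count, because table() allocates one cell per combination of observed levels. A rough check for the benchmark data above (assuming essentially all 52^2 two-letter strings occur):
uniqueN(df$group) * uniqueN(df$class1) * uniqueN(df$class2)
# 20 * 2704 * 2704, i.e. roughly 146 million cells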
With data.table (cross-join all level combinations, left-join the data back, and count the matched rows per combination):
setDT(df)[CJ(group = unique(group), class1 = unique(class1), class2 = unique(class2)),
.(group, x.group, class1, class2), on = .(group, class1, class2)][
, .(N = sum(!is.na(x.group))), by = .(group, class1, class2)]
group class1 class2 N
<num> <char> <char> <int>
1: 1 A A 1
2: 1 A B 1
3: 1 A C 0
4: 1 B A 0
5: 1 B B 2
6: 1 B C 1
7: 1 C A 0
8: 1 C B 1
9: 1 C C 0
10: 2 A A 0
11: 2 A B 0
12: 2 A C 0
13: 2 B A 1
14: 2 B B 2
15: 2 B C 0
16: 2 C A 1
17: 2 C B 1
18: 2 C C 1
However, this is much slower than your initial solution:
microbenchmark::microbenchmark(
table = df %>% table() %>% as.data.table(),
data.table = setDT(df)[CJ(group = unique(group), class1 = unique(class1), class2 = unique(class2)),
.(group, x.group, class1, class2), on = .(group, class1, class2)][
, .(N = sum(!is.na(x.group))), by = .(group, class1, class2)]
)
Unit: microseconds
expr min lq mean median uq max neval
table 546.501 615.9015 737.100 697.6505 775.152 1619.901 100
data.table 4242.001 4495.0010 5038.249 4766.6005 5192.601 14618.100 100

parsing column names when joining data.tables

I need to join the information in column h of data.table Y into data.table X. The code below shows the desired output.
library(data.table)
X <- data.table(
a1 = rep("A", 6),
b1 = rep(1,6),
c1 = rep(c(0,1), 1, each = 3),
d = letters[1:6]
)
Y <- data.table(
a2 = rep(c("A","B", "C"), 1, each = 2),
b2 = rep(c(1, 2, 3), 1, each = 2),
c2 = rep(c(0,1), 3),
h = letters[7:12]
)
# final result
X[Y,
on = .(a1 = a2,
b1 = b2,
c1 = c2),
h := i.h
][]
#> a1 b1 c1 d h
#> 1: A 1 0 a g
#> 2: A 1 0 b g
#> 3: A 1 0 c g
#> 4: A 1 1 d h
#> 5: A 1 1 e h
#> 6: A 1 1 f h
Created on 2020-08-03 by the reprex package (v0.3.0)
The problem, however, is that the names of the columns I use for the join vary depending on information stored elsewhere. So let's assume the name of the column c1 in X is stored in var, say var <- "c1". Now, when I try to do the join, nothing seems to work.
# None of the attempts below works
var <- "c1"
# attempt 1
X[Y,
on = .(a1 = a2,
b1 = b2,
eval(var) = c2),
h := i.h
][]
# attempt 2
X[Y,
on = .(a1 = a2,
b1 = b2,
get(var) = c2),
h := i.h
][]
# attempt 3
cond <- paste0(deparse(var), " = c2")
parcond <- parse(text = cond)
X[Y,
on = .(a1 = a2,
b1 = b2,
eval(parcond)),
h := i.h
][]
In the end, the only way I found to solve it is very inelegant, but it seems to work.
var <- "c1"
setnames(X, var, "c2")
X[Y,
on = c("a1" = "a2",
"b1" = "b2",
"c2"),
h := i.h
][]
setnames(X, "c2", var)
However, I wonder if there is a better way to do this programmatically.
I checked several related questions, but I could not find a solution that works for me.
Thank you so much for your help.
Thanks to @chinsoon12 for their comment, the solution to the problem is as follows:
library(data.table)
X <- data.table(
a1 = rep("A", 6),
b1 = rep(1,6),
c1 = rep(c(0,1), 1, each = 3),
d = letters[1:6]
)
Y <- data.table(
a2 = rep(c("A","B", "C"), 1, each = 2),
b2 = rep(c(1, 2, 3), 1, each = 2),
c2 = rep(c(0,1), 3),
h = letters[7:12]
)
var <- "c1"
onkey <- c("a1==a2", "b1==b2", paste0(var,"==c2"))
X[Y,
on=onkey,
h := i.h
][]
#> a1 b1 c1 d h
#> 1: A 1 0 a g
#> 2: A 1 0 b g
#> 3: A 1 0 c g
#> 4: A 1 1 d h
#> 5: A 1 1 e h
#> 6: A 1 1 f h
Created on 2020-08-11 by the reprex package (v0.3.0)
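The same idea extends to the case where several key names are dynamic, since on= accepts a character vector of "lhs==rhs" pairs. For example (with hypothetical key vectors lhs and rhs):
lhs <- c("a1", "b1", var)
rhs <- c("a2", "b2", "c2")
X[Y, on = paste0(lhs, "==", rhs), h := i.h][]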

find column value and name based on minimum value in other column

I have a data.table that looks like this
library( data.table )
dt <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1) )
# p1 p2 p3 t1 t2 t3
# 1: a b z 1 7 8
# 2: b c x 2 6 3
# 3: c d <NA> 3 5 NA
# 4: d a <NA> NA NA NA
# 5: e f y 5 3 2
# 6: f g <NA> 6 2 NA
# 7: g h s 7 NA 1
It has p-columns, representing names, and t-columns, representing values.
t1 is the value corresponding to p1, t2 to p2, etc..
On each row, values of p-columns are unique (or NA). The same goes for the values in the t-columns.
What I want to do is to create three new columns:
t_min, the minimum value of all t-columns for each row (excluding NA's)
p_min, if t_min exists (is not NA), the corresponding value of the p-column; so if column t2 holds the row minimum, the corresponding value of column p2
p_col_min, the name of the column the p_min value comes from; so if p_min comes from column p2, then "p2"
I prefer a data.table approach, since my actual data contains many more rows and columns. I know melting is an option, but I would like to conserve memory, so the less memory used the better (production data contains several million rows and >200 columns).
So far I've found a way to create the t_min-column using the following:
t_cols = dt[ , .SD, .SDcols = grep( "t[1-3]", names( dt ), value = TRUE ) ]
dt[ !all( is.na( t_cols ) ),
t_min := do.call( pmin, c( .SD, list( na.rm = TRUE ) ) ),
.SDcols = names( t_cols ) ]
But I cannot wrap my head around creating the p_min and p_col_min columns. I suppose which.min() comes into play somewhere, but I cannot figure it out. Probably something simple I'm overlooking (it always seems to be.. ;-) ).
desired output
dt.desired <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1),
t_min = c(1,2,3,NA,2,2,1),
p_min = c("a","b","c",NA,"y","g","s"),
p_col_min = c("p1","p1","p1",NA,"p3","p2","p3") )
# p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
I cannot guarantee that this solution is efficient enough for your working data, but it is what I would try first:
m1 <- as.matrix(dt[, grep('^t', names(dt)), with = FALSE])
m2 <- as.matrix(dt[, grep('^p', names(dt)), with = FALSE])
t_min <- apply(m1, 1, min, na.rm = TRUE)
t_min[is.infinite(t_min)] <- NA_real_
p_min_index <- rep(NA_integer_, length(t_min))
p_min_index[!is.na(t_min)] <- apply(m1[!is.na(t_min), ], 1, which.min)
dt[, t_min := t_min]
dt[, p_min := m2[cbind(seq_len(nrow(m2)), p_min_index)] ]  # matrix indexing: pick one (row, column) element per row
dt[, p_min_col := grep('^p', names(dt), value = TRUE)[p_min_index] ]
# p1 p2 p3 t1 t2 t3 t_min p_min p_min_col
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
In addition, it looks like the 2nd row in your desired output might be incorrect?
A simple and efficient approach is to loop through the "t*" columns and track all respective values in a single pass.
First initialize appropriate vectors:
p.columns = which(startsWith(names(dt), "p"))
t.columns = which(startsWith(names(dt), "t"))
p_col_min = integer(nrow(dt))
p_min = character(nrow(dt))
t_min = rep_len(Inf, nrow(dt))
and iterate while updating:
for(i in seq_along(p.columns)) {
cur.min = which(dt[[t.columns[i]]] < t_min)
p_col_min[cur.min] = p.columns[i]
t_min[cur.min] = dt[[t.columns[i]]][cur.min]
p_min[cur.min] = dt[[p.columns[i]]][cur.min]
}
Finally fill with NAs where needed:
whichNA = is.infinite(t_min)
is.na(t_min) = is.na(p_min) = is.na(p_col_min) = whichNA
t_min
#[1] 1 2 3 NA 2 2 1
p_min
#[1] "a" "b" "c" NA "y" "g" "s"
p_col_min
#[1] 1 1 1 NA 3 2 3
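Since p_col_min holds column positions in dt, mapping them to the names used in the desired output is one extra lookup:
names(dt)[p_col_min]
#[1] "p1" "p1" "p1" NA "p3" "p2" "p3"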
Here's another route:
dt[, t_min := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = patterns('t[[:digit:]]')]
dt[!is.na(t_min),
c('p_min', 'p_min_col') := {
# (row, column) positions of the minimum within the t-columns
arr_ind = .SD[, which(t_min == .SD, arr.ind = TRUE), .SDcols = patterns('t[[:digit:]]')]
arr_ind = arr_ind[order(arr_ind[, 1]), ]
# reuse the same (row, column) index to pick from the p-columns
p_m = .SD[, as.matrix(.SD)[arr_ind], .SDcols = patterns('p')]
p_m_c = grep('^p', names(.SD), value = TRUE)[arr_ind[, 2]]
list(p_m, p_m_c)
}
]
Here is another option:
ri <- dt[, .I[rowSums(is.na(.SD))==ncol(.SD)], .SDcols=t1:t3]  # rows where every t-column is NA
dt[-ri, c("t_min", "p_min", "p_col_min") := {
pmat <- .SD[, .SD, .SDcols=p1:p3]
tmat <- as.matrix(.SD[, .SD, .SDcols=t1:t3])
i <- max.col(-replace(tmat, is.na(tmat), Inf), "first")  # position of the row minimum (NAs pushed to the end)
y <- cbind(seq_len(.N), i)
.(t_min = tmat[y],
p_min = as.matrix(pmat)[y],
p_col_min = names(pmat)[i])
}]
dt
output:
p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
1: a b z 1 7 8 1 a p1
2: b c x 2 6 3 2 b p1
3: c d <NA> 3 5 NA 3 c p1
4: d a <NA> NA NA NA NA <NA> <NA>
5: e f y 5 3 2 2 y p3
6: f g <NA> 6 2 NA 2 g p2
7: g h s 7 NA 1 1 s p3

How to return first element of a group excluding NA's when non-NA values exist

I have a data frame named df which looks like:
x y
A NA
B d1
L d2
F c1
L s2
A c4
B NA
B NA
A c1
F a5
G NA
H NA
I want to group by x and fill in NA values with the first non-NA element in that group if possible. Note that some groups will not have a non-NA element so returning NA is fine for that case.
df %>% group_by(x) %>% mutate(new_y = first(y))
returns the first value including NA's even when non-NA values exist for that group.
We can use replace
df %>%
group_by(x) %>%
mutate(y = replace(y, is.na(y), y[!is.na(y)][1]))
# x y
# <chr> <chr>
#1 A c4
#2 B d1
#3 L d2
#4 F c1
#5 L s2
#6 A c4
#7 B d1
#8 B d1
#9 A c1
#10 F a5
#11 G <NA>
#12 H <NA>
Or we can do a join in data.table
library(data.table)
library(dplyr)  # coalesce() comes from dplyr, not tidyr
setDT(df)[df[order(x, is.na(y)), .SD[1L], x], y := coalesce(y, i.y), on = .(x)]
df
# x y
# 1: A c4
# 2: B d1
# 3: L d2
# 4: F c1
# 5: L s2
# 6: A c4
# 7: B d1
# 8: B d1
# 9: A c1
#10: F a5
#11: G NA
#12: H NA
Or using base R
df$y <- with(df, ave(y, x, FUN = function(x) replace(x, is.na(x), x[!is.na(x)][1])))
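The same replace() logic can also be run grouped in data.table, if you prefer to stay in one framework:
library(data.table)
setDT(df)[, y := replace(y, is.na(y), y[!is.na(y)][1]), by = x]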
data
df <- structure(list(x = c("A", "B", "L", "F", "L", "A", "B", "B",
"A", "F", "G", "H"), y = c(NA, "d1", "d2", "c1", "s2", "c4",
NA, NA, "c1", "a5", NA, NA)), .Names = c("x", "y"), class = "data.frame",
row.names = c(NA, -12L))

Compare two data.frames to find the rows in data.frame 1 and data.frame 2 which have equal values in selected columns

I have 2 data frames (a1 and a2)
a1
A B C D
1 A 6 8
2 D 7 3 #**
3 X 3 3
a2
A B C D
4 D 2 3 #**
5 Z 3 5
6 X 3 4
a1 <- data.frame(
A = 1:3,
B = c("A", "D", "X"),
C = c(6, 7, 3),
D = c(8, 3, 3)
)
a2 <- data.frame(
A = 4:6,
B = c("D", "Z", "X"),
C = c(2, 3, 3),
D = c(3, 5, 4)
)
I want to get the tuples (a1$A, a2$A) for the rows which have the same values in columns B and D.
In this example, I would get
(2, 4), because those rows (marked #** above) have the same values in columns B and D: "D" and 3, respectively.
Use merge to join the data frames on those columns, then pull out the two A columns:
merged <- merge(a1, a2, by = c("B", "D"))
subset(merged, select = c(A.x, A.y))
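This returns A.x = 2 and A.y = 4, i.e. the tuple (2, 4). For completeness, a sketch of the equivalent data.table join:
library(data.table)
setDT(a1); setDT(a2)
a1[a2, on = .(B, D), nomatch = NULL, .(A.x = x.A, A.y = i.A)]
#    A.x A.y
# 1:   2   4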
