I have a data.table that looks like this
library( data.table )
dt <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1) )
# p1 p2 p3 t1 t2 t3
# 1: a b z 1 7 8
# 2: b c x 2 6 3
# 3: c d <NA> 3 5 NA
# 4: d a <NA> NA NA NA
# 5: e f y 5 3 2
# 6: f g <NA> 6 2 NA
# 7: g h s 7 NA 1
It has p-columns, representing names, and t-columns, representing values.
t1 is the value corresponding to p1, t2 to p2, etc..
On each row, values of p-columns are unique (or NA). The same goes for the values in the t-columns.
What I want to do is to create three new columns:
t_min, the minimum value of all t-columns for each row (exclude NA's)
p_min, if t_min exists (is not NA), the corresponding value of the p-column... so if the t2-column has the t-min value, the corresponding value of column p2.
p_col_min, the name of the column with the value if p_min. So if the p_min value comes from colum p2, then "p2".
I prefer a data.table, since my actual data contains a lot more rows and columns. I know melting is an option, but I would like to preserve my memory with this data, so lesser memory used is better (production data contains several million rows and >200 columns).
So far I've found a way to create the t_min-column using the following:
t_cols = dt[ , .SD, .SDcols = grep( "t[1-3]", names( dt ), value = TRUE ) ]
dt[ !all( is.na( t_cols ) ),
t_min := do.call( pmin, c( .SD, list( na.rm = TRUE ) ) ),
.SDcols = names( t_cols ) ]
But I cannot wrap my head around creating the p_min and p_col_min columns. I suppose which.min() comes into play somewhere, but I cannot figure it out. Probably something simple I'm overlooking (it always seems to be.. ;-) ).
desired output
dt.desired <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1),
t_min = c(1,2,3,NA,2,2,1),
p_min = c("a","b","c",NA,"y","g","s"),
p_col_min = c("p1","p1","p1",NA,"p3","p2","p3") )
# p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
I cannot guarantee whether this is a solution efficient enough for your working data, but this is what I would try first:
m1 <- as.matrix(dt[, grep('^t', names(dt)), with = FALSE])
m2 <- as.matrix(dt[, grep('^p', names(dt)), with = FALSE])
t_min <- apply(m1, 1, min, na.rm = TRUE)
t_min[is.infinite(t_min)] <- NA_real_
p_min_index <- rep(NA_integer_, length(t_min))
p_min_index[!is.na(t_min)] <- apply(m1[!is.na(t_min), ], 1, which.min)
dt[, t_min := t_min]
dt[, p_min := m2[cbind(seq_len(nrow(m2)), p_min_index)] ]
dt[, p_min_col := grep('^p', names(dt), value = TRUE)[p_min_index] ]
# p1 p2 p3 t1 t2 t3 t_min p_min p_min_col
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
In addition, It looks like that the 2nd row in your desired output is incorrect?
A simple and efficient approach is to loop through the "t*" columns and track all respective values in a single pass.
First initialize appropriate vectors:
p.columns = which(startsWith(names(dt), "p"))
t.columns = which(startsWith(names(dt), "t"))
p_col_min = integer(nrow(dt))
p_min = character(nrow(dt))
t_min = rep_len(Inf, nrow(dt))
and iterate while updating:
for(i in seq_along(p.columns)) {
cur.min = which(dt[[t.columns[i]]] < t_min)
p_col_min[cur.min] = p.columns[i]
t_min[cur.min] = dt[[t.columns[i]]][cur.min]
p_min[cur.min] = dt[[p.columns[i]]][cur.min]
}
Finally fill with NAs where needed:
whichNA = is.infinite(t_min)
is.na(t_min) = is.na(p_min) = is.na(p_col_min) = whichNA
t_min
#[1] 1 2 3 NA 2 2 1
p_min
#[1] "a" "b" "c" NA "y" "g" "s"
p_col_min
#[1] 1 1 1 NA 3 2 3
Here's another route:
dt[, t_min := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = patterns('t[[:digit:]]')]
dt[!is.na(t_min),
c('p_min', 'p_min_col') := {
arr_ind = .SD[, which(t_min == .SD, arr.ind = TRUE), .SDcols = patterns('t[[:digit:]]')]
arr_ind = arr_ind[order(arr_ind[, 1]), ]
p_m = .SD[, as.matrix(.SD)[arr_ind], .SDcols = patterns('p')]
p_m_c = grep('^p', names(.SD), value = TRUE)[arr_ind[, 2]]
list(p_m, p_m_c)
}
]
Here is another option:
ri <- dt[, .I[rowSums(is.na(.SD))==ncol(.SD)], .SDcols=t1:t3]
dt[-ri, c("t_min", "p_min", "p_col_min") := {
pmat <- .SD[, .SD, .SDcols=p1:p3]
tmat <- as.matrix(.SD[, .SD, .SDcols=t1:t3])
i <- max.col(-replace(tmat, is.na(tmat), Inf), "first")
y <- cbind(seq_len(.N), i)
.(t_min = tmat[y],
p_min = as.matrix(pmat)[y],
p_col_min = names(pmat)[i])
}]
dt
output:
p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
1: a b z 1 7 8 1 a p1
2: b c x 2 6 3 2 b p1
3: c d <NA> 3 5 NA 3 c p1
4: d a <NA> NA NA NA NA <NA> <NA>
5: e f y 5 3 2 2 y p3
6: f g <NA> 6 2 NA 2 g p2
7: g h s 7 NA 1 1 s p3
Related
Say I have a df like so:
T1 <- c("a","b","c","d","e")
T2 <- c("f","g","h","i","j")
score1 <- c(NA,0.01,0.5,0.78,NA)
score2 <- c(1, 2, 3, NA, 6)
df <- data.frame(T1, T2, score1, score2)
df
T1 T2 score1 score2
1 a f NA 1
2 b g 0.01 2
3 c h 0.50 3
4 d i 0.78 NA
5 e j NA 6
If I want to randomly create new T1-T2 pairs, how can I see if these new pairs are in the df but only if score1 column is not NA?
In other words, I randomly sample, say, 2 values from T1 and T2:
(l1 <- sample(df$T1, 2))
(l2 <- sample(df$T2, 2))
and get:
> l1
[1] "c" "d"
> l2
[1] "h" "g"
How would one go about to get the score2 of the c-h and d-g pairs from df but only if score1 is not NA?
My first instinct would be to create a new df2 without NAs in the score1 column:
df2 <- df[which(!is.na(df$score1)), ]
Then I can create a new df for the new pairs:
df3$X1 <- l1
df3$X2 <- l2
df3$X3 <- l2
df3$X4 <- l1
#stack X3 with X1 and X4 with X2 (considering that T1-T2 pair is the same as T2-T1 pair)
df4 <- data.frame(T1 = c(df3[,"X1"], df3[,"X3"]),
T2 = c(df3[,"X2"], df3[,"X4"]))
> df4
T1 T2
1 c h
2 d g
3 h c
4 g d
But I'm missing the last step of how to get see if the paired columns from df4 match the paired columns in df2. In the end, I want to get something like:
df
T1 T2 score1 score2
1 c h 0.50 3
2 d g NA NA
I think a merge/join operation makes sense here:
res <- merge(df, data.frame(T1=l1, T2=l2, found=TRUE), by = c("T1","T2"), all = TRUE)
subset(res, found, select = -found)
# T1 T2 score1 score2
# 3 c h 0.5 3
# 4 d g NA NA
Data
df <- structure(list(T1 = c("a", "b", "c", "d", "e"), T2 = c("f", "g", "h", "i", "j"), score1 = c(NA, 0.01, 0.5, 0.78, NA), score2 = c(1, 2, 3, NA, 6)), class = "data.frame", row.names = c(NA, -5L))
l1 <- c("c", "d"); l2 <- c("h", "g")
Something like this?
set.seed(2022)
(l1 <- sample(df$T1, 2))
#> [1] "d" "c"
(l2 <- sample(df$T2, 2))
#> [1] "h" "i"
mapply(\(x1, x2, data){
i <- match(x1, data$T1)
j <- match(x2, data$T2)
if(any(is.na(c(data$score1[i], data$score1[i])))) {
NA_real_
} else {
sum(c(data$score2[i], -1*data$score2[j]), na.rm = TRUE)
}
}, l1, l2, MoreArgs = list(data = df))
#> d c
#> -3 3
Created on 2022-01-30 by the reprex package (v2.0.1)
I want to replace the NA values for observations within a particular sub-group, but the sequence of the observations in that group is not ordered properly. So I am wondering if there exists some dplyr or plyr command that would allow me to replace missing values in a column belonging to one dataframe using the values from the same column from another dataframe while matching on the values of that "key" column.
Here's what I got. Hope someone could shed light on this. Thanks.
## data frame that contains missing values in "diff" column
df <- data.frame(type = c(1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3),
diff = c(0.1, 0.3, NA, NA, NA, NA, NA, 0.2, 0.7, NA, 0.5, NA),
name = c("A", "B", "C", "D", "E", "A", "B", "C", "F", "A", "B", "C"))
## replace with values from this smaller data frame
df2 <- data.frame(diff_rep = c(0.3, 0.2, 0.4), name = c("A", "B", "C"))
## replace using ifelse
df$diff <- ifelse(is.na(df$diff) & (df$type == 2), df2$diff_rep , df$diff)
df
type diff name
1 1 0.1 A
2 1 0.3 B
3 1 NA C
4 2 0.3 D
5 2 0.2 E
6 2 0.4 A
7 2 0.3 B
8 2 0.2 C
9 2 0.7 F
10 3 NA A
11 3 0.5 B
12 3 NA C
## desired output
type diff name
1 1 0.1 A
2 1 0.3 B
3 1 NA C
4 2 NA D
5 2 NA E
6 2 0.3 A
7 2 0.2 B
8 2 0.4 C
9 2 0.7 F
10 3 NA A
11 3 0.5 B
12 3 NA C
Assuminhg row 9 is a mistake, you can use a left join first and then use ifelse() and coalesce() to get your desired result. coalesce() returns the first non-missing value
left_join(df, df2, by = "name") %>%
mutate(diff_wanted = if_else(type == 2,
coalesce(diff, diff_rep),
diff),
diff_wanted = ifelse(name %in% df2$name,
diff_wanted,
NA)) %>%
select(type, diff_wanted, name)
I have this data.table
library(data.table)
data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)
I would like to have this output :
data.table(
id = c(1, 2),
m1 = c("A", "B"),
m2 = c("A", "B"),
m3 = c("A", "B"),
m4 = c("B", "B"),
m5 = c("B", "B"),
m6 = c("B", "B"),
m7 = c("B", "B"),
m8 = c("A", "B"),
m9 = c("A", "B"),
m10 = c("A", "B"),
m11 = c("A", "A"),
m12 = c("A", "A")
)
Those who used to do sequence analysis may have recognized that I'm trying to do what seqformat do in the TRaMiNeR package would do, but with higher performance due to use of data.table
One option with data.table would be to melt the dataset after creating a sequence column, then grouped by 'i1', 'id', 'state', get the sequence of first and last 'value', dcast it from 'long' to 'wide'
dt1 <- melt(dt[, i1 := seq_len(.N)], id.vars = c("i1", "id", "state"))[,
paste0("m", seq(first(value), last(value))), .(i1, id, state)]
dcast(dt1, id ~ V1, value.var = "state")[]
# id m1 m10 m11 m12 m2 m3 m4 m5 m6 m7 m8 m9
#1: 1 A A A A A A B B B B A A
#2: 2 B B A A B B B B B B B B
A solution using the tidyverse.
library(tidyverse)
library(data.table)
dat <- data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)
dat2 <- dat %>%
mutate(Index = map2(begin, end, `:`)) %>%
unnest() %>%
mutate(Index = str_c("m", Index)) %>%
select(id, state, Index) %>%
spread(Index, state) %>%
select(id, str_c("m", 1:(ncol(.) - 1)))
dat2
# id m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12
# 1 1 A A A B B B B A A A A A
# 2 2 B B B B B B B B B B A A
An alternative solution:
dt[, unlist(Map(`:`, begin, end)), by = .(id, state)
][, dcast(.SD, id ~ sprintf("m%02d", V1), value.var = "state")]
which gives:
id m01 m02 m03 m04 m05 m06 m07 m08 m09 m10 m11 m12
1: 1 A A A B B B B A A A A A
2: 2 B B B B B B B B B B A A
It is possibly better to keep the data in long format. Long format is often easier to work with in R later on in your data processing/analysis.
You could achieve that with just:
dt[, unlist(Map(`:`, begin, end)), by = .(id, state)][order(id, V1)]
which gives:
id state V1
1: 1 A 1
2: 1 A 2
3: 1 A 3
4: 1 B 4
5: 1 B 5
6: 1 B 6
7: 1 B 7
8: 1 A 8
9: 1 A 9
10: 1 A 10
11: 1 A 11
12: 1 A 12
13: 2 B 1
14: 2 B 2
15: 2 B 3
16: 2 B 4
17: 2 B 5
18: 2 B 6
19: 2 B 7
20: 2 B 8
21: 2 B 9
22: 2 B 10
23: 2 A 11
24: 2 A 12
(where the [order(id, V1)]-part isn't necessary)
Used data:
dt <- data.table(
id = c(rep(1, 3), rep(2, 2)),
begin = c(1, 4, 8, 1, 11),
end = c(3, 7, 12, 10, 12),
state = c("A", "B", "A", "B", "A")
)
I have a data frame named df which looks like.
x y
A NA
B d1
L d2
F c1
L s2
A c4
B NA
B NA
A c1
F a5
G NA
H NA
I want to group by x and fill in NA values with the first non-NA element in that group if possible. Note that some groups will not have a non-NA element so returning NA is fine for that case.
df %>% group_by(x) %>% mutate(new_y = first(y))
returns the first value including NA's even when non-NA values exist for that group.
We can use replace
df %>%
group_by(x) %>%
mutate(y = replace(y, is.na(y), y[!is.na(y)][1]))
# x y
# <chr> <chr>
#1 A c4
#2 B d1
#3 L d2
#4 F c1
#5 L s2
#6 A c4
#7 B d1
#8 B d1
#9 A c1
#10 F a5
#11 G <NA>
#12 H <NA>
Or we can do a join in data.table
library(data.table)
library(tidyr)
setDT(df)[df[order(x, is.na(y)), .SD[1L], x], y := coalesce(y, i.y),on = .(x)]
df
# x y
# 1: A c4
# 2: B d1
# 3: L d2
# 4: F c1
# 5: L s2
# 6: A c4
# 7: B d1
# 8: B d1
# 9: A c1
#10: F a5
#11: G NA
#12: H NA
Or using base R
df$y <- with(df, ave(y, x, FUN = function(x) replace(x, is.na(x), x[!is.na(x)][1])))
data
df <- structure(list(x = c("A", "B", "L", "F", "L", "A", "B", "B",
"A", "F", "G", "H"), y = c(NA, "d1", "d2", "c1", "s2", "c4",
NA, NA, "c1", "a5", NA, NA)), .Names = c("x", "y"), class = "data.frame",
row.names = c(NA, -12L))
I have 2 data frames (a1 and a2)
a1
A B C D
1 A 6 8
2 D 7 3 #**
3 X 3 3
a2
A B C D
4 D 2 3 #**
5 Z 3 5
6 X 3 4
a1 <- data.frame(
A = 1:3,
B = c("A", "D", "X"),
C = c(6, 7, 3),
D = c(8, 3, 3)
)
a2 <- data.frame(
A = 4:6,
B = c("D", "Z", "X"),
C = c(2, 3, 3),
D = c(3, 5, 4)
)
I want to get the tuples (a1$A,a2$A) for the rows which have the same values in colums B and D
In this example, I would get
(2,4) because they have the same values in colums B and D, respectively D and 3
Use merge to merge the data frames.
merged <- merge(a1, a2, c("B", "D"))
subset(merged, select = c(A.x, A.y))