how to see if random paired sample is in dataframe (with conditions) - r

Say I have a df like so:
T1 <- c("a","b","c","d","e")
T2 <- c("f","g","h","i","j")
score1 <- c(NA,0.01,0.5,0.78,NA)
score2 <- c(1, 2, 3, NA, 6)
df <- data.frame(T1, T2, score1, score2)
df
T1 T2 score1 score2
1 a f NA 1
2 b g 0.01 2
3 c h 0.50 3
4 d i 0.78 NA
5 e j NA 6
If I want to randomly create new T1-T2 pairs, how can I see if these new pairs are in the df but only if score1 column is not NA?
In other words, I randomly sample, say, 2 values from T1 and T2:
(l1 <- sample(df$T1, 2))
(l2 <- sample(df$T2, 2))
and get:
> l1
[1] "c" "d"
> l2
[1] "h" "g"
How would one go about to get the score2 of the c-h and d-g pairs from df but only if score1 is not NA?
My first instinct would be to create a new df2 without NAs in the score1 column:
df2 <- df[which(!is.na(df$score1)), ]
Then I can create a new df for the new pairs:
df3$X1 <- l1
df3$X2 <- l2
df3$X3 <- l2
df3$X4 <- l1
#stack X3 with X1 and X4 with X2 (considering that T1-T2 pair is the same as T2-T1 pair)
df4 <- data.frame(T1 = c(df3[,"X1"], df3[,"X3"]),
T2 = c(df3[,"X2"], df3[,"X4"]))
> df4
T1 T2
1 c h
2 d g
3 h c
4 g d
But I'm missing the last step of how to get see if the paired columns from df4 match the paired columns in df2. In the end, I want to get something like:
df
T1 T2 score1 score2
1 c h 0.50 3
2 d g NA NA

I think a merge/join operation makes sense here:
res <- merge(df, data.frame(T1=l1, T2=l2, found=TRUE), by = c("T1","T2"), all = TRUE)
subset(res, found, select = -found)
# T1 T2 score1 score2
# 3 c h 0.5 3
# 4 d g NA NA
Data
df <- structure(list(T1 = c("a", "b", "c", "d", "e"), T2 = c("f", "g", "h", "i", "j"), score1 = c(NA, 0.01, 0.5, 0.78, NA), score2 = c(1, 2, 3, NA, 6)), class = "data.frame", row.names = c(NA, -5L))
l1 <- c("c", "d"); l2 <- c("h", "g")

Something like this?
set.seed(2022)
(l1 <- sample(df$T1, 2))
#> [1] "d" "c"
(l2 <- sample(df$T2, 2))
#> [1] "h" "i"
mapply(\(x1, x2, data){
i <- match(x1, data$T1)
j <- match(x2, data$T2)
if(any(is.na(c(data$score1[i], data$score1[i])))) {
NA_real_
} else {
sum(c(data$score2[i], -1*data$score2[j]), na.rm = TRUE)
}
}, l1, l2, MoreArgs = list(data = df))
#> d c
#> -3 3
Created on 2022-01-30 by the reprex package (v2.0.1)

Related

find column value and name based on minimum value in other column

I have a data.table that looks like this
library( data.table )
dt <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1) )
# p1 p2 p3 t1 t2 t3
# 1: a b z 1 7 8
# 2: b c x 2 6 3
# 3: c d <NA> 3 5 NA
# 4: d a <NA> NA NA NA
# 5: e f y 5 3 2
# 6: f g <NA> 6 2 NA
# 7: g h s 7 NA 1
It has p-columns, representing names, and t-columns, representing values.
t1 is the value corresponding to p1, t2 to p2, etc..
On each row, values of p-columns are unique (or NA). The same goes for the values in the t-columns.
What I want to do is to create three new columns:
t_min, the minimum value of all t-columns for each row (exclude NA's)
p_min, if t_min exists (is not NA), the corresponding value of the p-column... so if the t2-column has the t-min value, the corresponding value of column p2.
p_col_min, the name of the column with the value if p_min. So if the p_min value comes from colum p2, then "p2".
I prefer a data.table, since my actual data contains a lot more rows and columns. I know melting is an option, but I would like to preserve my memory with this data, so lesser memory used is better (production data contains several million rows and >200 columns).
So far I've found a way to create the t_min-column using the following:
t_cols = dt[ , .SD, .SDcols = grep( "t[1-3]", names( dt ), value = TRUE ) ]
dt[ !all( is.na( t_cols ) ),
t_min := do.call( pmin, c( .SD, list( na.rm = TRUE ) ) ),
.SDcols = names( t_cols ) ]
But I cannot wrap my head around creating the p_min and p_col_min columns. I suppose which.min() comes into play somewhere, but I cannot figure it out. Probably something simple I'm overlooking (it always seems to be.. ;-) ).
desired output
dt.desired <- data.table( p1 = c("a", "b", "c", "d", "e", "f", "g"),
p2 = c("b", "c", "d", "a", "f", "g", "h"),
p3 = c("z", "x", NA, NA, "y", NA, "s"),
t1 = c(1, 2, 3, NA, 5, 6, 7),
t2 = c(7, 6, 5, NA, 3, 2, NA),
t3 = c(8, 3, NA, NA, 2, NA, 1),
t_min = c(1,2,3,NA,2,2,1),
p_min = c("a","b","c",NA,"y","g","s"),
p_col_min = c("p1","p1","p1",NA,"p3","p2","p3") )
# p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
I cannot guarantee whether this is a solution efficient enough for your working data, but this is what I would try first:
m1 <- as.matrix(dt[, grep('^t', names(dt)), with = FALSE])
m2 <- as.matrix(dt[, grep('^p', names(dt)), with = FALSE])
t_min <- apply(m1, 1, min, na.rm = TRUE)
t_min[is.infinite(t_min)] <- NA_real_
p_min_index <- rep(NA_integer_, length(t_min))
p_min_index[!is.na(t_min)] <- apply(m1[!is.na(t_min), ], 1, which.min)
dt[, t_min := t_min]
dt[, p_min := m2[cbind(seq_len(nrow(m2)), p_min_index)] ]
dt[, p_min_col := grep('^p', names(dt), value = TRUE)[p_min_index] ]
# p1 p2 p3 t1 t2 t3 t_min p_min p_min_col
# 1: a b z 1 7 8 1 a p1
# 2: b c x 2 6 3 2 b p1
# 3: c d <NA> 3 5 NA 3 c p1
# 4: d a <NA> NA NA NA NA <NA> <NA>
# 5: e f y 5 3 2 2 y p3
# 6: f g <NA> 6 2 NA 2 g p2
# 7: g h s 7 NA 1 1 s p3
In addition, It looks like that the 2nd row in your desired output is incorrect?
A simple and efficient approach is to loop through the "t*" columns and track all respective values in a single pass.
First initialize appropriate vectors:
p.columns = which(startsWith(names(dt), "p"))
t.columns = which(startsWith(names(dt), "t"))
p_col_min = integer(nrow(dt))
p_min = character(nrow(dt))
t_min = rep_len(Inf, nrow(dt))
and iterate while updating:
for(i in seq_along(p.columns)) {
cur.min = which(dt[[t.columns[i]]] < t_min)
p_col_min[cur.min] = p.columns[i]
t_min[cur.min] = dt[[t.columns[i]]][cur.min]
p_min[cur.min] = dt[[p.columns[i]]][cur.min]
}
Finally fill with NAs where needed:
whichNA = is.infinite(t_min)
is.na(t_min) = is.na(p_min) = is.na(p_col_min) = whichNA
t_min
#[1] 1 2 3 NA 2 2 1
p_min
#[1] "a" "b" "c" NA "y" "g" "s"
p_col_min
#[1] 1 1 1 NA 3 2 3
Here's another route:
dt[, t_min := do.call(pmin, c(.SD, na.rm = TRUE)), .SDcols = patterns('t[[:digit:]]')]
dt[!is.na(t_min),
c('p_min', 'p_min_col') := {
arr_ind = .SD[, which(t_min == .SD, arr.ind = TRUE), .SDcols = patterns('t[[:digit:]]')]
arr_ind = arr_ind[order(arr_ind[, 1]), ]
p_m = .SD[, as.matrix(.SD)[arr_ind], .SDcols = patterns('p')]
p_m_c = grep('^p', names(.SD), value = TRUE)[arr_ind[, 2]]
list(p_m, p_m_c)
}
]
Here is another option:
ri <- dt[, .I[rowSums(is.na(.SD))==ncol(.SD)], .SDcols=t1:t3]
dt[-ri, c("t_min", "p_min", "p_col_min") := {
pmat <- .SD[, .SD, .SDcols=p1:p3]
tmat <- as.matrix(.SD[, .SD, .SDcols=t1:t3])
i <- max.col(-replace(tmat, is.na(tmat), Inf), "first")
y <- cbind(seq_len(.N), i)
.(t_min = tmat[y],
p_min = as.matrix(pmat)[y],
p_col_min = names(pmat)[i])
}]
dt
output:
p1 p2 p3 t1 t2 t3 t_min p_min p_col_min
1: a b z 1 7 8 1 a p1
2: b c x 2 6 3 2 b p1
3: c d <NA> 3 5 NA 3 c p1
4: d a <NA> NA NA NA NA <NA> <NA>
5: e f y 5 3 2 2 y p3
6: f g <NA> 6 2 NA 2 g p2
7: g h s 7 NA 1 1 s p3

Find first match to column 1 in columns 2:4 - R

Comparing "x1", "x2", an "x3" to "target", how do I return the first index of the column that matches "target"? An NA can result for no match.
pop <- c("A", "B", "C", "D")
target <- pop
x1 <- sample(pop)
x2 <- sample(pop)
x3 <- sample(pop)
df <- data.frame(target,x1,x2,x3)
> df
target x1 x2 x3
1 A B B D
2 B D C C
3 C C A A
4 D A D B
I have tried using something along the lines of:
min(which(df[3, 1] == df[3, 2:ncol(df)]))
...(row 3 being used as an example), but I don't know how to gracefully handle cases where there is no match, which is probably why I am having trouble using this in a function with apply(). The goal is either a new column on df or a vector of the returned values.
Thanks!
Here's a solution using match -
> df
target x1 x2 x3
1 A C A C
2 B A B B
3 C D D D
4 D B C A
apply(df, 1, function(x) match(TRUE, x[-1] == x[1]))
[1] 2 2 NA NA
Data -
df <- structure(list(target = c("A", "B", "C", "D"), x1 = c("C", "A",
"D", "B"), x2 = c("A", "B", "D", "C"), x3 = c("C", "B", "D",
"A")), .Names = c("target", "x1", "x2", "x3"), row.names = c(NA,
-4L), class = "data.frame")
There are many ways to do this. Loop through the columns 2:4, compare with the target and get the index of first match with which
sapply(df[-1], function(x) which(x == df$target)[1])
x1 x2 x3
#1 3 NA
If it is for comparing the rows
m1 <- df$target == df[-1]
max.col(m1, 'first') * NA^!rowSums(m1)
Or
apply(m1, 1, function(x) which(x)[1])
data
df <- data.frame(target,x1,x2,x3, stringsAsFactors = FALSE)

Copy selective row values from 1 dataframe to another in R

I have a dataframe:df <- data.frame(id = c('1','2','3'), b = c('b1', 'NA', 'b3'), c = c('c1', 'c2', 'NA'), d = c('d1', 'NA', 'NA'))
id b c d
1 b1 c1 d1
2 NA c2 NA
3 b3 NA NA
I have extracted values with id = 1 from df to another dataframe say df2 so df2 has 1 row
id b c d
1 b1 c1 d1
I need to copy all values from df2 to df1 wherever there is not an NA in df1
Result Table:
id b c d
1 b1 c1 d1
2 b1 c2 d1
3 b3 c1 d1
Thank you in advance. I asked similar question before but deleting it.
Based on your last comment that df2[3,3] should be c2 and not c1, a straightforward answer is to use zoo::na.locf.
library(zoo)
df2 <- na.locf(df)
# id b c d
# 1 1 b1 c1 d1
# 2 2 b1 c2 d1
# 3 3 b3 c2 d1
Data
df <- structure(list(id = c(1, 2, 3), b = c("b1", NA, "b3"), c = c("c1",
"c2", NA), d = c("d1", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Assuming that there is a mistake in your question -> df2 will be equal to b1-c1-d1 not b1-c2-d1, here is the solution :
Initialize dataframe
df <- data.frame(id = c('1','2','3'), b = c('b1', 'NA', 'b3'), c = c('c1', 'c2', 'NA'), d = c('d1', 'NA', 'NA'))
Converting string NAs to actual detectable NAs
df <- data.frame(lapply(df, function(x) { gsub("NA", NA, x) }))
Obtaining default value row
df2<-df[df$id==1,]
For all rows, check if the column cell is na, then fill it with the df2 cell of the same column
for (r in 1:nrow(df)) for( c in colnames(df)) df[r,c]<-ifelse(is.na(df[r,c]),as.character(df2[1,c]),as.character(df[r,c]))

How get NA for values which are not found in a dataframe

I have a vector of values and a dataframe which I can find each item of a vector in a specific column of dataframe with the following command:
lapply(l, function(x) df[which(df$col1==x),col2])
How can I get NA for values which are not available in my dataframe?
For example:
df: col1 col2
1 a
1 b
2 c
l=c(1,3)
output: col1 col2
1 a,b
3 NA
Using data.table you could achieve this efficiently by running a binary join to l (your vector)
library(data.table)
setDT(df)[.(l), # join between `df` & `l`
on = .(col1), # using `col1`
.(col2 = toString(col2)), # paste the values in `col2` (you can add `unique`)
by = .EACHI] # do this per each value in `l`
# col1 col2
# 1: 1 a, b
# 2: 3 NA
DATA:
df <- structure(list(col1 = c(1L, 1L, 2L), col2 = c("a", "b", "c")), .Names = c("col1","col2"), class = "data.frame", row.names = c(NA, -3L))
l <- c(1, 3)
CODE:
library(magrittr)
lapply(l, function(x){
res<-df[[2]][df[[1]]==x] %>% paste(collapse=",")
if(res=="") res = NA
return(cbind(x,res))
}) %>% do.call(rbind,.)
Result:
x res
[1,] "1" "a,b"
[2,] "3" NA
Function which gives TRUE if sth is NOT integer(0), character(0), etc.
(they have in common that their length is zero):
non.zero.vec <- function(x) length(x) > 0
Any vector with such zero-length-value elements can be converted to NA using
zero2na <- function(vec) sapply(vec, function(x) ifelse(non.zero.vec(x), x, NA))
## e.g.
zero2na(c(1, 2, integer(0)) ## [1] 1 2 NA
Finally, this function does exactly what you want:
lookup <- function(df, key.col, val.col, keys) {
idxs <- lapply(keys, function(x) which(df[, key.col] == x))
lookups <- lapply(idxs, function(vec) if(length(vec) > 0) {df[vec , val.col]} else {NA})
lookupstrings <- unlist(lapply(lookups,
function(v) suppressWarnings(if(is.na(v)) {"NA"} else {paste(v, collapse = ", ")})))
res.df <- data.frame(unlist(keys), lookupstrings)
colnames(res.df) <- c(key.col, val.col)
res.df
}
df <- data.frame(col1 = c(1,1,2), col2 = c("a", "b", "c"))
lookup(df, "col1", "col2", c(1, 2, 3))
## output:
col1 col2
1 1 a, b
2 2 c
3 3 NA

Get elements by position from one data frame to another

Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
requiere(dplyr)
toVec <- function(df) df %>% as.matrix %>% as.vector
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
matchdf <- function(values, dfin, dfout){
unlist(sapply(values,
function(val) dfout[dfin == val],
USE.NAMES = F)
)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1

Resources