Get elements by position from one data frame to another - r

Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
requiere(dplyr)
toVec <- function(df) df %>% as.matrix %>% as.vector
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)

matchdf <- function(values, dfin, dfout){
unlist(sapply(values,
function(val) dfout[dfin == val],
USE.NAMES = F)
)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1

Related

how to see if random paired sample is in dataframe (with conditions)

Say I have a df like so:
T1 <- c("a","b","c","d","e")
T2 <- c("f","g","h","i","j")
score1 <- c(NA,0.01,0.5,0.78,NA)
score2 <- c(1, 2, 3, NA, 6)
df <- data.frame(T1, T2, score1, score2)
df
T1 T2 score1 score2
1 a f NA 1
2 b g 0.01 2
3 c h 0.50 3
4 d i 0.78 NA
5 e j NA 6
If I want to randomly create new T1-T2 pairs, how can I see if these new pairs are in the df but only if score1 column is not NA?
In other words, I randomly sample, say, 2 values from T1 and T2:
(l1 <- sample(df$T1, 2))
(l2 <- sample(df$T2, 2))
and get:
> l1
[1] "c" "d"
> l2
[1] "h" "g"
How would one go about to get the score2 of the c-h and d-g pairs from df but only if score1 is not NA?
My first instinct would be to create a new df2 without NAs in the score1 column:
df2 <- df[which(!is.na(df$score1)), ]
Then I can create a new df for the new pairs:
df3$X1 <- l1
df3$X2 <- l2
df3$X3 <- l2
df3$X4 <- l1
#stack X3 with X1 and X4 with X2 (considering that T1-T2 pair is the same as T2-T1 pair)
df4 <- data.frame(T1 = c(df3[,"X1"], df3[,"X3"]),
T2 = c(df3[,"X2"], df3[,"X4"]))
> df4
T1 T2
1 c h
2 d g
3 h c
4 g d
But I'm missing the last step of how to get see if the paired columns from df4 match the paired columns in df2. In the end, I want to get something like:
df
T1 T2 score1 score2
1 c h 0.50 3
2 d g NA NA
I think a merge/join operation makes sense here:
res <- merge(df, data.frame(T1=l1, T2=l2, found=TRUE), by = c("T1","T2"), all = TRUE)
subset(res, found, select = -found)
# T1 T2 score1 score2
# 3 c h 0.5 3
# 4 d g NA NA
Data
df <- structure(list(T1 = c("a", "b", "c", "d", "e"), T2 = c("f", "g", "h", "i", "j"), score1 = c(NA, 0.01, 0.5, 0.78, NA), score2 = c(1, 2, 3, NA, 6)), class = "data.frame", row.names = c(NA, -5L))
l1 <- c("c", "d"); l2 <- c("h", "g")
Something like this?
set.seed(2022)
(l1 <- sample(df$T1, 2))
#> [1] "d" "c"
(l2 <- sample(df$T2, 2))
#> [1] "h" "i"
mapply(\(x1, x2, data){
i <- match(x1, data$T1)
j <- match(x2, data$T2)
if(any(is.na(c(data$score1[i], data$score1[i])))) {
NA_real_
} else {
sum(c(data$score2[i], -1*data$score2[j]), na.rm = TRUE)
}
}, l1, l2, MoreArgs = list(data = df))
#> d c
#> -3 3
Created on 2022-01-30 by the reprex package (v2.0.1)

Rename columns of a dataframe based on another dataframe except columns not in that dataframe in R

Given two dataframes df1 and df2 as follows:
df1:
df1 <- structure(list(A = 1L, B = 2L, C = 3L, D = 4L, G = 5L), class = "data.frame", row.names = c(NA,
-1L))
Out:
A B C D G
1 1 2 3 4 5
df2:
df2 <- structure(list(Col1 = c("A", "B", "C", "D", "X"), Col2 = c("E",
"Q", "R", "Z", "Y")), class = "data.frame", row.names = c(NA,
-5L))
Out:
Col1 Col2
1 A E
2 B Q
3 C R
4 D Z
5 X Y
I need to rename columns of df1 using df2, except column G since it not in df2's Col1.
I use df2$Col2[match(names(df1), df2$Col1)] based on the answer from here, but it returns "E" "Q" "R" "Z" NA, as you see column G become NA. I hope it keep the original name.
The expected result:
E Q R Z G
1 1 2 3 4 5
How could I deal with this issue? Thanks.
By using na.omit(it's little bit messy..)
colnames(df1)[na.omit(match(names(df1), df2$Col1))] <- df2$Col2[na.omit(match(names(df1), df2$Col1))]
df1
E Q R Z G
1 1 2 3 4 5
I have success to reproduce your error with
df2 <- data.frame(
Col1 = c("H","I","K","A","B","C","D"),
Col2 = c("a1","a2","a3","E","Q","R","Z")
)
The problem is location of df2$Col1 and names(df1) in match.
na.omit(match(names(df1), df2$Col1))
gives [1] 4 5 6 7, which index does not exist in df1 that has length 5.
For df1, we should change order of terms in match, na.omit(match(df2$Col1,names(df1))) gives [1] 1 2 3 4
colnames(df1)[na.omit(match(df2$Col1, names(df1)))] <- df2$Col2[na.omit(match(names(df1), df2$Col1))]
This will works.
A solution using the rename_with function from the dplyr package.
library(dplyr)
df3 <- df2 %>%
filter(Col1 %in% names(df1))
df4 <- df1 %>%
rename_with(.cols = df3$Col1, .fn = function(x) df3$Col2[df3$Col1 %in% x])
df4
# E Q R Z G
# 1 1 2 3 4 5

dplyr mutate to replace specific values in a data frame

I have a data frame that consists of characters "a", "b", "x", "y".
df <- data.frame(v1 = c("a", "b", "x", "y"),
v2 = c("a", "b", "a", "y"))
Now I want to replace all values with the following scheme and also convert the whole data frame to numeric.
"a" -> 0
"b" -> 1
"x" -> 1
"y" -> 2
I know this must be somehow possible with mutate_all but I cannot figure out how
df %>% mutate_all(replace("a", 1)) %>%
mutate_all(is.character, as.numeric)
One solution could be with case_when:
df %>%
mutate_all(funs(case_when(. == "a" ~ 0,
. %in% c("b", "x") ~ 1,
. == "y" ~ 2,
TRUE ~ NA_real_)))
# v1 v2
# 1 0 0
# 2 1 1
# 3 1 0
# 4 2 2
Create a named vector with mappings and then subset it using mutate_all
vec <- c(a = 0, b = 1, x = 1, y = 2)
library(dplyr)
df %>% mutate_all(~vec[.])
# v1 v2
#1 0 0
#2 1 1
#3 1 0
#4 2 2
In base R that would be just
df[] <- vec[unlist(df)]
data
df <- data.frame(v1 = c("a", "b", "x", "y"),
v2 = c("a", "b", "a", "y"), stringsAsFactors = FALSE)

How get NA for values which are not found in a dataframe

I have a vector of values and a dataframe which I can find each item of a vector in a specific column of dataframe with the following command:
lapply(l, function(x) df[which(df$col1==x),col2])
How can I get NA for values which are not available in my dataframe?
For example:
df: col1 col2
1 a
1 b
2 c
l=c(1,3)
output: col1 col2
1 a,b
3 NA
Using data.table you could achieve this efficiently by running a binary join to l (your vector)
library(data.table)
setDT(df)[.(l), # join between `df` & `l`
on = .(col1), # using `col1`
.(col2 = toString(col2)), # paste the values in `col2` (you can add `unique`)
by = .EACHI] # do this per each value in `l`
# col1 col2
# 1: 1 a, b
# 2: 3 NA
DATA:
df <- structure(list(col1 = c(1L, 1L, 2L), col2 = c("a", "b", "c")), .Names = c("col1","col2"), class = "data.frame", row.names = c(NA, -3L))
l <- c(1, 3)
CODE:
library(magrittr)
lapply(l, function(x){
res<-df[[2]][df[[1]]==x] %>% paste(collapse=",")
if(res=="") res = NA
return(cbind(x,res))
}) %>% do.call(rbind,.)
Result:
x res
[1,] "1" "a,b"
[2,] "3" NA
Function which gives TRUE if sth is NOT integer(0), character(0), etc.
(they have in common that their length is zero):
non.zero.vec <- function(x) length(x) > 0
Any vector with such zero-length-value elements can be converted to NA using
zero2na <- function(vec) sapply(vec, function(x) ifelse(non.zero.vec(x), x, NA))
## e.g.
zero2na(c(1, 2, integer(0)) ## [1] 1 2 NA
Finally, this function does exactly what you want:
lookup <- function(df, key.col, val.col, keys) {
idxs <- lapply(keys, function(x) which(df[, key.col] == x))
lookups <- lapply(idxs, function(vec) if(length(vec) > 0) {df[vec , val.col]} else {NA})
lookupstrings <- unlist(lapply(lookups,
function(v) suppressWarnings(if(is.na(v)) {"NA"} else {paste(v, collapse = ", ")})))
res.df <- data.frame(unlist(keys), lookupstrings)
colnames(res.df) <- c(key.col, val.col)
res.df
}
df <- data.frame(col1 = c(1,1,2), col2 = c("a", "b", "c"))
lookup(df, "col1", "col2", c(1, 2, 3))
## output:
col1 col2
1 1 a, b
2 2 c
3 3 NA

Merging dataframes by a vector of dataframe names

I am trying to merge approx 30 dataframes.
I have saved the global environment as a vector, comma separated, as below;
df_names <- (df1, df2, df3, df4)
Now I am trying to merge all of these dataframes
total <- merge(df_names, by = 'ID')
But I am getting an error;
Error in as.data.frame(y) : argument "y" is missing, with no default
Converting comments to an answer, you're probably looking for a combination of mget and Reduce along with merge.
Demo:
df1 <- data.frame(ID = 1:3, var = c("a", "b", "c"))
df2 <- data.frame(ID = c(1, 3, 4), var = c("A", "B", "X"))
df3 <- data.frame(ID = c(2, 3, 4, 5), var = c("X", "Y", "Z", "A"))
df4 <- data.frame(ID = 1:5, var = letters[1:5])
Reduce(function(x, y) merge(x, y, by = "ID", all = TRUE), mget(paste0("df", 1:4)))
# ID var.x var.y var.x var.y
# 1 1 a A <NA> a
# 2 2 b <NA> X b
# 3 3 c B Y c
# 4 4 <NA> X Z d
# 5 5 <NA> <NA> A e
# Warning message:
# In merge.data.frame(x, y, by = "ID", all = TRUE) :
# column names ‘var.x’, ‘var.y’ are duplicated in the result

Resources