mapping values between data frames R - r

let's create example data:
df <- data.frame(date=c("2017-01-01","2017-01-02", "2017-01-03", "2017-01-04", "2017-01-05"), X1=c("A", "B", "C", "D", "F"),
X2=c("B", "A", "D", "F", "C"))
df2 <- data.frame(date=c("2017-01-01","2017-01-02", "2017-01-03", "2017-01-04", "2017-01-05"),
A=c("3", "4", "2", "1", "5"),
B=c("6", "2", "5", "1", "1"),
C=c("1", "4", "5", "2", "3"),
D=c("67", "67", "63", "61", "62"),
F=c("31", "33", "35", "31", "38"))
So I have two data frames and I want to match values from df2 to df by date and X1 and X2 and create new variables for those. What makes this tricky for me is that matched values in df2 are in colnames. End result should look like this:
> result
date X1 X2 Var1 Var2
1 2017-01-01 A B 3 6
2 2017-01-02 B A 2 4
3 2017-01-03 C D 5 63
4 2017-01-04 D F 61 31
5 2017-01-05 F C 38 3
result <- data.frame(date=c("2017-01-01","2017-01-02", "2017-01-03", "2017-01-04", "2017-01-05"),
X1=c("A", "B", "C", "D", "F"),
X2=c("B", "A", "D", "F", "C"),
Var1=c("3", "2", "5", "61", "38"),
Var2=c("6", "4", "63", "31", "3"))
I wanted to use mapvalues, but couldn't figure it out. Second thought was to go long format (melt) with df2 and try then, but failed there as well.
Ok, here is my best try, just feels that there could be more efficient way, if you have to create multiple (>50) new variables to data frame.
df2.long <- melt(df2, id.vars = c("date"))
df$Var1 <- na.omit(merge(df, df2.long, by.x = c("date", "X1"), by.y = c("date", "variable"), all.x = FALSE, all.y = TRUE))[,4]
df$Var2 <- na.omit(merge(df, df2.long, by.x = c("date", "X2"), by.y = c("date", "variable"), all.x = FALSE, all.y = TRUE))[,5]

Using dplyr and tidyr:
df2_m <- group_by(df2, date) %>%
gather('X1', 'var', -date)
left_join(df, df2_m) %>%
left_join(df2_m, by = c('date', 'X2' = 'X1')) %>%
rename(Var1 = var.x, Var2 = var.y) -> result

A possibility with mapply:
df$Var1 <- mapply(function(day, col) df2[df2$date==day, as.character(col)],
day=df$date, col=df$X1)
df$Var2 <- mapply(function(day, col) df2[df2$date==day, as.character(col)],
day=df$date, col=df$X2)
df
# date X1 X2 Var1 Var2
#1 2017-01-01 A B 3 6
#2 2017-01-02 B A 2 4
#3 2017-01-03 C D 5 63
#4 2017-01-04 D F 61 31
#5 2017-01-05 F C 38 3
NB:
If you have more columns to modify (not just 2 like in your example), you can use lapply to loop over the columns X.:
df[, paste0("Var", 1:2)] <- lapply(df[,paste0("X", 1:2)],
function(value) {
mapply(function(day, col) df2[df2$date==day, as.character(col)],
day=df$date, col=value)})

An double melt > join > dcast option using data.table
library(data.table) # v>=1.10.0
dcast(
melt(setDT(df), 1L)[ # melt the first table by date
melt(setDT(df2), 1L), # melt the second table by date
on = .(date, value = variable), # join by date and the letters
nomatch = 0L], # remove everything that wasn't matched
date ~ variable, # convert back to long format
value.var = c("value", "i.value")) # take both values columns
# date value_X1 value_X2 i.value_X1 i.value_X2
# 1: 2017-01-01 A B 3 6
# 2: 2017-01-02 B A 2 4
# 3: 2017-01-03 C D 5 63
# 4: 2017-01-04 D F 61 31
# 5: 2017-01-05 F C 38 3

We can use match to get the column index of 'df2' from the 'X1' and 'X2' columns, cbind with the sequence of rows, use the row/column index to extract the values in 'df2', and assign the output to create the 'Var' columns
df[paste0("Var", 1:2)] <- lapply(df[2:3], function(x)
df2[-1][cbind(1:nrow(df2), match(x, names(df2)[-1]))])
df
# date X1 X2 Var1 Var2
#1 2017-01-01 A B 3 6
#2 2017-01-02 B A 2 4
#3 2017-01-03 C D 5 63
#4 2017-01-04 D F 61 31
#5 2017-01-05 F C 38 3

Using melt and match:
df2l<-melt(df2, measure=c("A","B","C","D","F"))
Indices <- match(paste(df$date, df$X1), paste(df2l$date,df2l$variable))
df$Var1 <- df2l$value[Indices]
Indices2 <- match(paste(df$date, df$X2), paste(df2l$date,df2l$variable))
df$Var2 <- df2l$value[Indices2]

Related

How to add a common number to rows that have same value in another column?

After years of using your advices to another users, here is my for now unsolvable issue...
I have a dataset with thousands of rows and hundreds of column, that have one column with a possible value in common. Here is a subset of my dataset :
ID <- c("A", "B", "C", "D", "E")
Dose <- c("1", "5", "3", "4", "5")
Value <- c("x1", "x2", "x3", "x2", "x3")
mat <- cbind(ID, Dose, Value)
What I want is to assign a unique value to the rows that have the "Value" column in common, like that :
ID <- c("A", "B", "C", "D", "E")
Dose <- c("1", "5", "3", "4", "5")
Value <- c("153254", "258634", "896411", "258634", "896411")
Code <- c("1", "2", "3", "2", "3")
mat <- cbind(ID, Dose, Value, Code)
Does anyone have an idea that could help me a little ?
Thanks !
We may use match here
library(dplyr)
mat %>%
mutate(Code = match(Value, unique(Value)))
-output
ID Dose Value Code
1 A 1 153254 1
2 B 5 258634 2
3 C 3 896411 3
4 D 4 258634 2
5 E 5 896411 3
data
mat <- data.frame(ID, Dose, Value)
You should consider using a data.frame:
mat <- data.frame(ID, Dose, Value)
Using dplyr you could create the desired output:
library(dplyr)
mat %>%
group_by(Value) %>%
mutate(Code = cur_group_id()) %>%
ungroup()
This returns
# A tibble: 5 x 4
ID Dose Value Code
<chr> <chr> <chr> <int>
1 A 1 153254 1
2 B 5 258634 2
3 C 3 896411 3
4 D 4 258634 2
5 E 5 896411 3

How do I split the value of a column based on a lookup table?

I have a dataset with ids and associated values:
df <- data.frame(id = c("1", "2", "3"), value = c("12", "20", "16"))
I have a lookup table that matches the id to another reference label ref:
lookup <- data.frame(id = c("1", "1", "1", "2", "2", "3", "3", "3", "3"), ref = c("a", "b", "c", "a", "d", "d", "e", "f", "a"))
Note that id to ref is a many-to-many match: the same id can be associated with multiple ref, and the same ref can be associated with multiple id.
I'm trying to split the value associated with the df$id column equally into the associated ref columns. The output dataset would look like:
output <- data.frame(ref = "a", "b", "c", "d", "e", f", value = "18", "4", "4", "14", "4", "4")
ref
value
a
18
b
4
c
4
d
14
e
4
f
4
I tried splitting this into four steps:
calling pivot_wider on lookup, turning rows with the same id value into columns (e.g., a, b, c.)
merging the two datasets based on id
dividing each df$value equally into a, b, c, etc. columns that are not empty
transposing the dataset and summing across the id columns.
I can't figure out how to make step (3) work, though, and I suspect there's a much easier approach.
A variation of #thelatemail's answer with base pipes.
merge(df, lookup) |> type.convert(as.is=TRUE) |>
transform(value=ave(value, id, FUN=\(x) x/length(x))) |>
with(aggregate(list(value=value), list(ref=ref), sum))
# ref value
# 1 a 18
# 2 b 4
# 3 c 4
# 4 d 14
# 5 e 4
# 6 f 4
Here's a potential logic. Merge value from df into lookup by id, divide value by number of matching rows, then group by ref and sum. Then take your pick of how you want to do it.
Base R
tmp <- merge(lookup, df, by="id", all.x=TRUE)
tmp$value <- ave(as.numeric(tmp$value), tmp$id, FUN=\(x) x/length(x) )
aggregate(value ~ ref, tmp, sum)
dplyr
library(dplyr)
lookup %>%
left_join(df, by="id") %>%
group_by(id) %>%
mutate(value = as.numeric(value) / n() ) %>%
group_by(ref) %>%
summarise(value = sum(value))
data.table
library(data.table)
setDT(df)
setDT(lookup)
lookup[df, on="id", value := as.numeric(value)/.N, by=.EACHI][
, .(value = sum(value)), by=ref]
# ref value
#1: a 18
#2: b 4
#3: c 4
#4: d 14
#5: e 4
#6: f 4
This may work
lookup %>%
left_join(lookup %>%
group_by(id) %>%
summarise(n = n()) %>%
left_join(dummy, by = "id") %>%
mutate(value = as.numeric(value)) %>%
mutate(repl = value/n) %>%
select(id, repl) ,
by = "id"
) %>% select(ref, repl) %>%
group_by(ref) %>% summarise(value = sum(repl))
ref value
<chr> <dbl>
1 a 18
2 b 4
3 c 4
4 d 14
5 e 4
6 f 4

Merge if non duplicates

I am trying to merge two data frames (df1 and df2) based on two KEY (KEY1, and KEY2). However in df1, KEY1 is not unique. I want to merge df1 and df2 if KEY1 is unique. I generated a count variable which counts the number of occurence of KEY1, hence I want to merge df1 and df2 only if count equals 1.
Here is an example data frame:
df1$KEY1 <- as.data.frame(c("a", "a", "b", "c", "d"))
df1$count <- as.data.frame(c("2", "2", "1", "1", "1"))
df2$KEY2 <- as.data.frame(c("a", "b", "c", "d", "e"))
df2$value <- as.data.frame(c("85", "25", "581", "12", "4"))
My question is: how to perform the merge only if count equals 1?
df1 <- if(count==1,merge(df1, df2, by.x=KEY1, by.y=KEY2, all.x=TRUE), ?)
My goal is to get this:
df1$KEY1 <- as.data.frame(c("a", "a", "b", "c", "d"))
df1$count <- as.data.frame(c("2", "2", "1", "1", "1"))
df1$value <- as.data.frame(c("NA", "NA", "25", "581", "12"))
You can perform a join and change the values to NA if count is not 1.
library(dplyr)
inner_join(df1, df2, by = c('KEY1' = 'KEY2')) %>%
mutate(value = replace(value, count != 1, NA))
# KEY1 count value
#1 a 2 <NA>
#2 a 2 <NA>
#3 b 1 25
#4 c 1 581
#5 d 1 12
Similarly, in base R -
merge(df1, df2, by.x = 'KEY1', by.y = 'KEY2') |>
transform(value = replace(value, count != 1, NA))
data
df1 <- data.frame(KEY1 = c("a", "a", "b", "c", "d"),
count = c("2", "2", "1", "1", "1"))
df2 <- data.frame(KEY2 = c("a", "b", "c", "d", "e"),
value = c("85", "25", "581", "12", "4"))
If you insist on using base, what you are looking for is the incomparables argument in merge. Values of the key included in it aren't mathched
tab <- table(df1$KEY1)
tab
merge(df1, df2, by.x="KEY1", by.y="KEY2", all.x=TRUE,
incomparables = names(tab)[tab>1])
The output is:
KEY1 count value
1 a 2 <NA>
2 a 2 <NA>
3 b 1 25
4 c 1 581
5 d 1 12
You could use:
library(dplyr)
df1 %>%
mutate(
value = if_else(count == "1" & KEY1 %in% df2$KEY2,
tibble::deframe(df2)[KEY1],
NA_character_)
)
which returns
KEY1 count value
1 a 2 <NA>
2 a 2 <NA>
3 b 1 25
4 c 1 581
5 d 1 12
Or the same as base R:
transform(
df1,
value = ifelse(df1$count == 1,
`names<-`(df2$value, df2$KEY2)[df1$KEY1],
NA_character_)
)
Using data.table
library(data.table)
setDT(df1)[df2, value := NA^(count != 1) * value, on = .(KEY1 = KEY2)]
-output
> df1
KEY1 count value
1: a 2 NA
2: a 2 NA
3: b 1 25
4: c 1 581
5: d 1 12
NOTE: The numeric columns are created as character. Assuming they are of class numeric, do a join on by KEY columns and assign the value to 'df1' after converting to NA based on 'count' column values

String matching two dataframes with a sliding window

I have two df.
df1
col1
1 a
2 b
3 c
4 c
df2
setID col1
1 1 a
2 1 b
3 1 b
4 1 a
5 2 w
6 2 v
7 2 c
8 2 b
9 3 a
10 3 a
11 3 b
12 3 a
13 4 a
14 4 b
15 4 c
16 4 a
I'm using the following code to match them.
scorematch <- function ()
{
require("dplyr")
#to make sure every element is preceded by the one before that element
combm <- rev(sapply(rev(seq_along(df1$col1)), function(i) paste0(df1$col1[i-1], df1$col1[i])));
tempdf <- df2
#group the history by their ID
tempdf <- group_by(tempdf, setID)
#collapse strings in history
tempdf <- summarise(tempdf, ss = paste(col1, collapse = ""))
tempdf <- rowwise(tempdf)
#add score based on how it matches compared to path
tempdf <- mutate(tempdf, score = sum(sapply(combm, function(x) sum(grepl(x, ss)))))
tempdf <- ungroup(tempdf)
#filter so that only IDs with scores more than 0 are available
tempdf <- filter(tempdf, score != 0)
tempdf <- pull(tempdf, setID)
#filter original history to reflect new history
tempdf2 <- filter(df2, setID %in% tempdf)
tempdf2
}
This code works great. But I want to take this further. I want to apply a sliding window function to get the df1 values I want to match against df2. So far I'm using this function as my sliding window.
slidingwindow <- function(data, window, step)
{
#data is dataframe with colname
total <- length(data)
#spots are start of each window
spots <- seq(from=1, to=(total-step), by=step)
result <- vector(length = length(spots))
for(i in 1:length(spots)){
...
}
return(result)
}
The scorematch function will be nested inside slidingwindow function. I'm unsure how to proceed from there though. Ideally df1 will be split into windows. Starting from the first window it will be matched against df2 using the scorematch function to get a filtered out df2. Then I want the second window of df1 to match against the newly filtered df2 and so on. The loop should end when df2 has been filtered down so that it contains only 1 distinct setID value. The final output can either be the whole filtered df2 or just the remaining setID.
Ideal output would be either
setID col1
1 4 a
2 4 b
3 4 c
4 4 a
or
[1] "4"
Here is a solution without using a for-loop. I use stringr because of its nice consistent syntax, purrr for map (although lapply would be sufficient in this case) and dplyr to group_by setID and collapse the strings for each group.
library(dplyr)
library(purrr)
library(stringr)
First I collapse the string for each group. This makes it easier to use pattern-matching with str_detect-later:
df2_collapse <- df2 %>%
group_by(setID) %>%
summarise(string = str_c(col1, collapse = ""))
df2_collapse
# A tibble: 4 x 2
# setID string
# <int> <chr>
# 1 1 abba
# 2 2 wvcb
# 3 3 aaba
# 4 4 abca
The "look-up" string is collapse as well and then the substrings (i.e. slding windows) are extract with str_sub. Here I work along the length of the string str_length and extract all possible groups following each letter in the string.
string <- str_c(df1$col1, collapse = "")
string
# [1] "abcc"
substrings <-
unlist(map(1:str_length(string), ~ str_sub(string, start = .x, end = .x:str_length(string))))
Store the substrings in a tibble with their length as score.
substrings
# [1] "a" "ab" "abc" "abcc" "b" "bc" "bcc" "c" "cc" "c"
substrings <- tibble(substring = substrings,
score = str_length(substrings))
substrings
# A tibble: 10 x 2
# substring score
# <chr> <int>
# 1 a 1
# 2 ab 2
# 3 abc 3
# 4 abcc 4
# 5 b 1
# 6 bc 2
# 7 bcc 3
# 8 c 1
# 9 cc 2
# 10 c 1
For each setID with extract the maximum score it matches in the substring-data and the filter out the row with the maximum score of all setIDs.
df2_collapse %>%
mutate(score = map_dbl(string,
~ max(substrings$score[str_detect(.x, substrings$substring)]))) %>%
filter(score == max(score))
# A tibble: 1 x 3
# setID string score
# <int> <chr> <dbl>
# 1 4 abca 3
Data
df1 <- structure(list(col1 = c("a", "b", "c", "c")),
class = "data.frame", row.names = c("1", "2", "3", "4"))
df2 <-
structure(list(setID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L),
col1 = c("a", "b", "b", "a", "w", "v", "c", "b", "a", "a", "b", "a", "a", "b", "c", "a")),
class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16"))

Combine character columns into new column

I'd be very grateful if you could help me with the following as after a few tests I haven't still been able to get the right outcome.
I've got this data:
dd_1 <- data.frame(ID = c("1","2", "3", "4", "5"),
Class_a = c("a",NA, "a", NA, NA),
Class_b = c(NA, "b", "b", "b", "b"))
And I'd like to produce a new column 'CLASS':
dd_2 <- data.frame(ID = c("1","2", "3", "4", "5"),
Class_a = c("a",NA, "a", NA, NA),
Class_b = c(NA, "b", "b", "b", "b"),
CLASS = c("a", "b", "a-b", "b", "b"))
Thanks a lot!
Here it is:
tmp <- paste(dd_1$Class_a, dd_1$Class_b, sep='-')
tmp <- gsub('NA-|-NA', '', tmp)
(dd_2 <- cbind(dd_1, tmp))
First we concatenate (join as strings) the 2 columns. paste treats NAs as ordinary strings, i.e. "NA", so we either get NA-a, NA-b, or a-b. Then we substitute NA- or -NA with an empty string.
Which results in:
## ID Class_a Class_b tmp
## 1 1 a <NA> a
## 2 2 <NA> b b
## 3 3 a b a-b
## 4 4 <NA> b b
## 5 5 <NA> b b
Another option:
dd_1$CLASS <- with(dd_1, ifelse(is.na(Class_a), as.character(Class_b),
ifelse(is.na(Class_b), as.character(Class_a),
paste(Class_a, Class_b, sep="-"))))
This way you would check if any of the classes is NA and return the other, or, if none is NA, return both separated by "-".
Here's a short solution with apply:
dd_2 <- cbind(dd_1, CLASS = apply(dd_1[2:3], 1,
function(x) paste(na.omit(x), collapse = "-")))
The result
ID Class_a Class_b CLASS
1 1 a <NA> a
2 2 <NA> b b
3 3 a b a-b
4 4 <NA> b b
5 5 <NA> b b

Resources