Doing data mapping for each value in a df - r

I have a data set like this
df1<-data.frame(ID=c(1,2,3,4),colA=c(101,102,103,104),colB=c(201,202,203,204))
df2<-data.frame(var_id=c(101,102,103,104,201,202,203,204),var_value=c("A","B","C","D","E","F","G","H"))
I want to map any value in df1 that is in df2$var_id with the corresponding string in df2$var_value.
Desired output
df1<-data.frame(ID=c(1,2,3,4),colA=c("A","B","C","D"),colB=c("E","F","G","H"))
I have tried write a function, and then do lapply, but it only display one var_value

cols = c("colA", "colB")
df1[cols] <- lapply(df1[cols], \(x) df2$var_value[match(x, df2$var_id)])
df1
# ID colA colB
# 1 1 A E
# 2 2 B F
# 3 3 C G
# 4 4 D H

You can join twice.
library(dplyr)
df1 %>%
left_join(df2, by = c("colA" = "var_id")) %>%
left_join(df2, by = c("colB" = "var_id")) %>%
select(ID, colA = var_value.x, colB = var_value.y)
# ID colA colB
# 1 1 A E
# 2 2 B F
# 3 3 C G
# 4 4 D H

with the tidyverse you can apply a function across several columns:
library(tidyverse)
df1 |>
mutate(across(colA:colB, \(x) map_chr(x, \(y) with(df2,var_value[y == var_id] ))))
#> ID colA colB
#> 1 1 A E
#> 2 2 B F
#> 3 3 C G
#> 4 4 D H
#or
df1 |>
mutate(across(colA:colB, \(x) with(df2, var_value[match(x, var_id)])))
#> ID colA colB
#> 1 1 A E
#> 2 2 B F
#> 3 3 C G
#> 4 4 D H

Related

Sort a dataframe based on a specific column in R with dplyr

I have a dataframe that looks like this
df <- data.frame(time=seq(1,4,1),col1=c("a","b","d","c"), col2=c("d","a","c","b"))
df
#> time col1 col2
#> 1 1 a d
#> 2 2 b a
#> 3 3 d c
#> 4 4 c b
Created on 2021-11-06 by the reprex package (v2.0.1)
I want to sort my data frame based on col2 and look like this
time col1 col2
3 d d
1 a a
4 c c
2 b b
Any ideas or help is highly appreciated!
Don't know, if this makes any sense, but you could do a self join:
library(tidyr)
library(dplyr)
df %>%
select(col2) %>%
inner_join(df %>% mutate(col2 = col1), by = "col2") %>%
select(time, col1, col2)
This returns
time col1 col2
1 3 d d
2 1 a a
3 4 c c
4 2 b b
A solution in base R:
df <- data.frame(time = match(df$col2,df$col1), col1 = df$col2, col2=df$col2)
#> time col1 col2
#> 1 3 d d
#> 2 1 a a
#> 3 4 c c
#> 4 2 b b

Extract rows where value appears in any of multiple columns

Let' say I have two data.frames
name_df = read.table(text = "player_name
a
b
c
d
e
f
g", header = T)
game_df = read.table(text = "game_id winner_name loser_name
1 a b
2 b a
3 a c
4 a d
5 b c
6 c d
7 d e
8 e f
9 f a
10 g f
11 g a
12 f e
13 a d", header = T)
name_df contains a unique list of all the winner_name or loser_name values in game_df. I want to create a new data.frame that has, for each person in the name_df a row if a given name (e.g. a) appears in either the winner_name or loser_name column
So I essentially want to merge game_df with name_df, but the key column (name) can appear in either winner_name or loser_name.
So, for just a and b the final output would look something like:
final_df = read.table(text = "player_name game_id winner_name loser_name
a 1 a b
a 2 b a
a 3 a c
a 4 a d
a 9 f a
a 11 g a
a 13 a d
b 1 a b
b 2 b a
b 5 b c", header = T)
We can loop over the elements in 'name_df' for 'player_name', filter the rows from 'game_df' for either the 'winner_name' or 'loser_name'
library(dplyr)
library(purrr)
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ game_df %>%
filter(winner_name %in% .x|loser_name %in% .x), .id = 'player_name')
Or if there are many columns, use if_any
map_dfr(setNames(name_df$player_name, name_df$player_name),
~ {
nm1 <- .x
game_df %>%
filter(if_any(c(winner_name, loser_name), ~ . %in% nm1))
}, .id = 'player_name')
Dedicated to our teacher and mentor dear #akrun
I think we can also make use of the add_row() function you first taught me the other day. Unbelievable!!!
library(dplyr)
library(purrr)
library(tibble)
game_df %>%
rowwise() %>%
mutate(player_name = winner_name) %>%
group_split(game_id) %>%
map_dfr(~ add_row(.x, game_id = .x$game_id, winner_name = .x$winner_name,
loser_name = .x$loser_name, player_name = .x$loser_name)) %>%
arrange(player_name) %>%
relocate(player_name)
# A tibble: 26 x 4
player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
This can be directly expressed in SQL:
library(sqldf)
sqldf("select *
from name_df
left join game_df on winner_name = player_name or loser_name = player_name")
Without using purrr. I think this is appropriate use case of tidyr::unite with argument remove = F where we can first unite the winners' and losers' names and then use tidyr::separate_rows to split new column into rows.
library(tidyr)
library(dplyr)
game_df %>% unite(Player_name, winner_name, loser_name, remove = F, sep = ', ') %>%
separate_rows(Player_name) %>%
relocate(Player_name) %>%
arrange(Player_name)
# A tibble: 26 x 4
Player_name game_id winner_name loser_name
<chr> <int> <chr> <chr>
1 a 1 a b
2 a 2 b a
3 a 3 a c
4 a 4 a d
5 a 9 f a
6 a 11 g a
7 a 13 a d
8 b 1 a b
9 b 2 b a
10 b 5 b c
# ... with 16 more rows
A Base R approach :
result <- do.call(rbind, lapply(name_df$player_name, function(x)
cbind(plaername = x,
subset(game_df, winner_name == x | loser_name == x))))
rownames(result) <- NULL
result
# playername game_id winner_name loser_name
#1 a 1 a b
#2 a 2 b a
#3 a 3 a c
#4 a 4 a d
#5 a 9 f a
#6 a 11 g a
#7 a 13 a d
#8 b 1 a b
#...
#...

Fill in cells with alternating pattern

I am trying to fill in blank cells with the value of rows above. Similar to na.locf function, but I have a pattern that needs to be matched. I don't necessarily know how many rows between new values (i.e betweem a,b and c,d).
I have used the na.locf and searched around for a solution to no avail.
df <- df <- data.frame(col1 = c("a","b", NA, NA, NA, NA, "c", "d", NA, NA))
df
# col1
# 1 a
# 2 b
# 3 <NA>
# 4 <NA>
# 5 <NA>
# 6 <NA>
# 7 c
# 8 d
# 9 <NA>
# 10 <NA>
Solution I would like:
df
col1
a
b
a
b
a
b
c
d
c
d
ave(df$col1,
with(rle(!is.na(df$col1)), rep(cumsum(values), lengths)),
FUN = function(x){
rep(x[!is.na(x)], length.out = length(x))
})
# [1] a b a b a b c d c d
Here's way with dplyr. You can drop the group column if needed. -
df %>%
group_by(group = cumsum(is.na(lag(col1)) & !is.na(col1))) %>%
mutate(
col1 = rep(col1[!is.na(col1)], length.out = n())
) %>%
ungroup()
# A tibble: 10 x 2
col1 group
<chr> <int>
1 a 1
2 b 1
3 a 1
4 b 1
5 a 1
6 b 1
7 c 2
8 d 2
9 c 2
10 d 2

Add together 2 dataframes in R without losing columns

I have 2 dataframes in R (df1, df2).
A C D
1 1 1
2 2 2
df2 as
A B C
1 1 1
2 2 2
How can I merge these 2 dataframes to produce the following output?
A B C D
2 1 2 1
4 2 4 2
Columns are sorted and column values are added. Both DFs have same number of rows. Thank you in advance.
Code to create DF:
df1 <- data.frame("A" = 1:2, "C" = 1:2, "D" = 1:2)
df2 <- data.frame("A" = 1:2, "B" = 1:2, "C" = 1:2)
nm1 = names(df1)
nm2 = names(df2)
nm = intersect(nm1, nm2)
if (length(nm) == 0){ # if no column names in common
cbind(df1, df2)
} else { # if column names in common
cbind(df1[!nm1 %in% nm2], # columns only in df1
df1[nm] + df2[nm], # add columns common to both
df2[!nm2 %in% nm1]) # columns only in df2
}
# D A C B
#1 1 2 2 1
#2 2 4 4 2
You can try:
library(tidyverse)
list(df2, df1) %>%
map(rownames_to_column) %>%
bind_rows %>%
group_by(rowname) %>%
summarise_all(sum, na.rm = TRUE)
# A tibble: 2 x 5
rowname A B C D
<chr> <int> <int> <int> <int>
1 1 2 1 2 1
2 2 4 2 4 2
By using left_join() from dplyr you won't lose the column
library(tidyverse)
dat1 <- tibble(a = 1:10,
b = 1:10,
c = 1:10)
dat2 <- tibble(c = 1:10,
d = 1:10,
e = 1:10)
left_join(dat1, dat2, by = "c")
#> # A tibble: 10 x 5
#> a b c d e
#> <int> <int> <int> <int> <int>
#> 1 1 1 1 1 1
#> 2 2 2 2 2 2
#> 3 3 3 3 3 3
#> 4 4 4 4 4 4
#> 5 5 5 5 5 5
#> 6 6 6 6 6 6
#> 7 7 7 7 7 7
#> 8 8 8 8 8 8
#> 9 9 9 9 9 9
#> 10 10 10 10 10 10
Created on 2019-01-16 by the reprex package (v0.2.1)
allnames <- sort(unique(c(names(df1), names(df2))))
df3 <- data.frame(matrix(0, nrow = nrow(df1), ncol = length(allnames)))
names(df3) <- allnames
df3[,allnames %in% names(df1)] <- df3[,allnames %in% names(df1)] + df1
df3[,allnames %in% names(df2)] <- df3[,allnames %in% names(df2)] + df2
df3
A B C D
1 2 1 2 1
2 4 2 4 2
Here is a fun base R method with Reduce.
Reduce(cbind,
list(Reduce("+", list(df1[intersect(names(df1), names(df2))],
df2[intersect(names(df1), names(df2))])), # sum results
df1[setdiff(names(df1), names(df2))], # in df1, not df2
df2[setdiff(names(df2), names(df1))])) # in df2, not df1
This returns
A C D B
1 2 2 1 1
2 4 4 2 2
This assumes that both df1 and df2 have columns that are not present in the other. If this is not true, you'd have to adjust the list.
Note also that you could replace Reduce with do.call in both places and you'd get the same result.

Tidyverse friendly hack on combn

Problem is simple and in many other posts, but I haven't found satisfactory answer.
Say you have a tibble with one column of labels (here letters) and other values in other columns (here just one 'value').
data <- tibble(letter = letters[1:5], value = 1:5)
Now what you want is generate all the pairs without permutations and keep the value attached to each of the pair element. Here's the solution I have and which I believe is valid but...inelegant.
combn(data$letter, m = 2) %>%
t() %>%
as_tibble() %>%
rename(letter_1 = V1, letter_2 = V2) %>%
left_join(data, by = c("letter_1" = "letter")) %>%
left_join(data, by = c("letter_2" = "letter"), suffix = c("_1", "_2"))
Which outputs the desired result:
# A tibble: 10 x 4
letter_1 letter_2 value_1 value_2
<chr> <chr> <int> <int>
1 a b 1 2
2 a c 1 3
3 a d 1 4
4 a e 1 5
5 b c 2 3
6 b d 2 4
7 b e 2 5
8 c d 3 4
9 c e 3 5
10 d e 4 5
I'm really looking for a tidyverse approach. I'm a fan boy :)
Thank you in advance for any help.
Here is a tidyverse solution using expand (instead of combn):
data %>%
expand(letter_1 = letter, letter_2 = letter) %>%
mutate(
value_1 = match(letter_1, letters),
value_2 = match(letter_2, letters)) %>%
filter(letter_1 != letter_2) %>%
rowwise() %>%
mutate(id = paste0(sort(c(letter_1, letter_2)), collapse = " ")) %>%
distinct(id, .keep_all = TRUE) %>%
select(-id)
## A tibble: 15 x 4
# letter_1 letter_2 value_1 value_2
# <chr> <chr> <int> <int>
# 2 a b 1 2
# 3 a c 1 3
# 4 a d 1 4
# 5 a e 1 5
# 7 b c 2 3
# 8 b d 2 4
# 9 b e 2 5
#11 c d 3 4
#12 c e 3 5
#13 d d 4 4
#14 d e 4 5
One option could be using combn as:
data <- tibble(letter = letters[1:5], value = 1:5)
res <- cbind(data.frame(t(combn(data$letter, 2))), data.frame(t(combn(data$value, 2))))
names(res) <- c("letter_1", "letter_2", "value_1", "value_2")
res
# letter_1 letter_2 value_1 value_2
# 1 a b 1 2
# 2 a c 1 3
# 3 a d 1 4
# 4 a e 1 5
# 5 b c 2 3
# 6 b d 2 4
# 7 b e 2 5
# 8 c d 3 4
# 9 c e 3 5
# 10 d e 4 5
I find the rowwise() function to work inconsistently in my machine. You might want to try map() functions in the purrr pacakge.
Here's a way to implement this:
library(purrr)
data %>%
expand(letter_1 = letter, letter_2 = letter) %>%
mutate(
value_1 = match(letter_1, letters),
value_2 = match(letter_2, letters)) %>%
filter(letter_1 != letter_2) %>%
mutate(
id = map2_chr(letter_1, letter_2, function(x, y) {
paste(sort(c(x, y)), collapse = " ")
})
) %>%
distinct(id, .keep_all = TRUE) %>%
select(-id)
# # A tibble: 10 x 4
# letter_1 letter_2 value_1 value_2
# <chr> <chr> <int> <int>
# 1 a b 1 2
# 2 a c 1 3
# 3 a d 1 4
# 4 a e 1 5
# 5 b c 2 3
# 6 b d 2 4
# 7 b e 2 5
# 8 c d 3 4
# 9 c e 3 5
# 10 d e 4 5

Resources