Merge where some ids are concatenated in single column

Merge where some ids are concatenated in single column - r

I have a dataframe with a column of ids, but for some rows there are multiple ids concatenated together. I want to merge this onto another dataframe using the id, and when the ids are concatenated it handles that and reflects it by having the values in the new columns added also concatenated.
For example I have dataframes
data <- data.frame(
id = c(1, 4, 3, "2,3", "1,4"),
value = c(1:5)
)
> data
id value
1 1 1
2 4 2
3 3 3
4 2,3 4
5 1,4 5
mapping <- data.frame(
id = 1:4,
name = c("one", "two", "three", "four")
)
> mapping
id name
1 1 one
2 2 two
3 3 three
4 4 four
I would like to end up with
id value name
1 1 1 one
2 4 2 four
3 3 3 three
4 2,3 4 two,three
5 1,4 5 one,four

I don't think there's a good way to do this other than to separate, join, and re-concatenate:
library(dplyr)
library(tidyr)
# Split the comma-separated ids into one row per id, look each id up in
# `mapping`, then collapse back to one row per original record.
data %>%
  # Row index, so records that share a `value` are not merged on regrouping.
  mutate(true_id = row_number()) %>%
  separate_rows(id, convert = TRUE) %>%
  left_join(mapping, by = "id") %>%
  group_by(true_id, value) %>%
  summarize(
    # paste(collapse = ",") rather than toString(): no space after the comma,
    # which is the format asked for in the question.
    id = paste(id, collapse = ","),
    name = paste(name, collapse = ","),
    .groups = "drop"
  ) %>%
  # Drop the helper index and restore the requested column order.
  select(id, value, name)
# # A tibble: 5 × 3
#   id    value name
#   <chr> <int> <chr>
# 1 1         1 one
# 2 4         2 four
# 3 3         3 three
# 4 2,3       4 two,three
# 5 1,4       5 one,four
I wasn't sure if your value column would actually be unique, so I added a true_id just in case.

What about something like this. I could think of a few ways. One is longer, but much easier to follow and the other is short, but kind of a mess.
library(tidyverse)
#long and readable
# Explode the comma-separated ids into one row each, join on a character key,
# then fold every original row (tracked by `tmp`) back into a single record.
data |>
  mutate(
    tmp = row_number(),
    id = str_split(id, ",")
  ) |>
  unnest_longer(id) |>
  left_join(mutate(mapping, id = as.character(id)), by = "id") |>
  group_by(tmp) |>
  summarise(
    id = paste(id, collapse = ","),
    value = first(value),
    name = paste(name, collapse = ",")
  )
#> # A tibble: 5 x 4
#> tmp id value name
#> <int> <chr> <int> <chr>
#> 1 1 1 1 one
#> 2 2 4 2 four
#> 3 3 3 3 three
#> 4 4 2,3 4 two,three
#> 5 5 1,4 5 one,four
#short and ugly
# For every id string, collect the mapping names whose id appears in it.
data |>
  mutate(
    name = map_chr(id, \(x) {
      wanted <- str_split(x, ",")[[1]]
      hits <- as.character(mapping$id) %in% wanted
      paste(mapping$name[hits], collapse = ",")
    })
  )
#> id value name
#> 1 1 1 one
#> 2 4 2 four
#> 3 3 3 three
#> 4 2,3 4 two,three
#> 5 1,4 5 one,four

greping the data$ids out of the mapping$ids.
# Turn each id string ("2,3") into an anchored alternation ("^(2|3)$") so that
# only whole ids match. A character class such as "[23]" matches single
# characters and would also hit multi-digit ids like 12 or 23.
# Assumes the ids contain no regex metacharacters (true for numeric ids).
mapply(
  \(x, y) {
    pattern <- sprintf("^(%s)$", gsub("\\s*,\\s*", "|", trimws(as.character(x))))
    toString(mapping$name[grep(pattern, y)])
  },
  data$id, list(mapping$id)
)
# 1 4 3 2,3 1,4
# "one" "four" "three" "two, three" "one, four"
In order not to have a space after the comma, use paste(., collapse=',') instead of toString.

Related

How to use R summarise with multiple numeric and text-based conditional subsets

I have a table containing two rows for each ID.
table <- tibble(
id = c(1,1,2,2,3,3,4,4,5,5),
row1 = c(2,5,2,5,1,3,2,5,3,2),
row2 = c("foo", "other foo", "bar", "bar", "bar", "bar other", "other", "foo", "other", "other")
)
> table
# A tibble: 10 × 3
id row1 row2
<dbl> <dbl> <chr>
1 1 2 foo
2 1 5 other foo
3 2 2 bar
4 2 5 bar
5 3 1 bar
6 3 3 bar other
7 4 2 other
8 4 5 foo
9 5 3 other
10 5 2 other
I would like to resolve the table to a single row for each ID based on three rules in succession:
If, for each ID, there is one row in which row1 is 5 or more, then choose the row where row1 is less than 5.
Else if, for each ID, there is a row in which row2 contains the word 'other', choose the row where row2 does not contain the word 'other'
Otherwise, for each ID, pick the first row.
I feel there must be a more straightforward way of doing this. This is my attempt so far, but I can't work out how to resolve the NA so that it returns 'bar'.
# Asker's attempt: collapse each id to one row with nested ifelse() calls.
# NOTE(review): two properties of this code explain the NA reported for id 2:
#   * grep("other", row2) returns match positions, not a logical vector; when
#     nothing matches it is integer(0), so the ifelse() that uses it as its
#     test returns a zero-length result, which the enclosing ifelse() turns
#     into NA.
#   * summarise() evaluates its expressions in order, and a summary named
#     like an existing column replaces it for later expressions. When `row2 =`
#     is evaluated, `row1` is therefore the single summarised value from the
#     first expression, not the original rows of the group.
table %>%
group_by(id) %>%
summarise(
# Rule 1, then rule 2, then fall back to the first row (values of row1).
row1 = ifelse(max(row1) >= 5,
first(row1[row1 < 5]),
ifelse(
# Test is a vector of positions (possibly empty), not TRUE/FALSE.
grep("other", row2),
ifelse(
!is.na(first(row1[grep("other", row2, invert = T)])),
first(row1[grep("other", row2, invert = T)]),
first(row1)),
first(row1))
),
# Same rule chain for row2; `row1` here refers to the value computed above.
row2 = ifelse(
max(row1) >= 5,
first(row2[row1 < 5]),
ifelse(
# integer(0) when no row2 in the group contains "other" (e.g. id 2).
grep("other", row2),
ifelse(
!is.na(first(row2[grep("other", row2, invert = T)])),
first(row2[grep("other", row2, invert = T)]),
first(row2)),
first(row2)
)
)
)
# A tibble: 5 × 3
id row1 row2
<dbl> <dbl> <chr>
1 1 2 foo
2 2 2 NA
3 3 1 bar
4 4 2 foo
5 5 3 other
Desired output:
id
row1
row2
1
2
foo
2
2
bar
3
1
bar
4
2
other
5
3
other
Many thanks for your help.

Here is how we can do it:
library(dplyr)
library(tidyr)
library(stringr)
# Resolve each id to one row by applying the rules in order:
#   1. if the id has both a row with row1 >= 5 and one with row1 < 5,
#      keep the rows with row1 < 5;
#   2. else if some but not all rows mention "other", keep the rows that
#      do not;
#   3. otherwise keep every row.
# The first remaining row per id is returned.
table %>%
  group_by(id) %>%
  mutate(
    low = row1 < 5,
    plain = !str_detect(row2, "other"),
    # A rule only applies when it would leave at least one row in the group,
    # so an id whose rows all match (or all fail) falls through to the next.
    keep = if (any(low) && !all(low)) {
      low
    } else if (any(plain) && !all(plain)) {
      plain
    } else {
      TRUE
    }
  ) %>%
  filter(keep) %>%
  slice(1) %>%
  ungroup() %>%
  select(-c(low, plain, keep))
id row1 row2
<dbl> <dbl> <chr>
1 1 2 foo
2 2 2 bar
3 3 1 bar
4 4 2 other
5 5 3 other

# Pick one row per id with a prioritised case_when().
# filter() is used instead of base subset(): subset() ignores the grouping,
# so any() would be evaluated over the whole table rather than per id.
table %>%
  group_by(id) %>%
  filter(
    case_when(
      # Rule 1 applies only if it leaves a row: some row1 >= 5 and some < 5.
      any(row1 >= 5) & any(row1 < 5) ~ row1 < 5,
      # Rule 2 applies only if some, but not all, rows contain "other".
      any(grepl("other", row2)) & !all(grepl("other", row2)) ~
        !grepl("other", row2),
      # Otherwise keep every row of the id.
      TRUE ~ TRUE
    )
  ) %>%
  # First surviving row of each id.
  filter(row_number() == 1) %>%
  ungroup()
This answer takes advantage of dplyr's grouping abilities to check for any() within each group, so it gets easy to know if a certain condition happens within a group.
It also uses case_when() to check for a series of conditions in a prioritized order, implementing what would be a series of if/else's.
Finally, since in whatever case we would like only the first row that matches the criteria, it uses the function row_number() to check whether we're on the first row within the group, in order to select it.
Output is:
# A tibble: 5 x 3
id row1 row2
<dbl> <dbl> <chr>
1 1 2 foo
2 2 2 bar
3 3 1 bar
4 4 2 other
5 5 3 other
>

Here is one solution, that leverages this small function, f() using tidyverse or data.table
#' Choose one (row1, row2) pair for a single id
#'
#' Rules, applied in order:
#' 1. if the group has both a value `>= 5` and a value `< 5` in `r1`, take
#'    the first element with `r1 < 5`;
#' 2. else if some but not all elements of `r2` contain "other", take the
#'    first element that does not;
#' 3. otherwise take the first element.
#'
#' @param r1 Numeric vector (the group's `row1` values).
#' @param r2 Character vector of the same length (the group's `row2` values).
#' @return A list with elements `row1` and `row2`, each of length one.
f <- function(r1, r2) {
  pick <- function(i) list("row1" = r1[i], "row2" = r2[i])

  # NA values satisfy neither side of the threshold.
  high <- !is.na(r1) & r1 >= 5
  low <- !is.na(r1) & r1 < 5
  if (any(high) && any(low)) {
    # which(...)[1] keeps the result length one even for groups of 3+ rows.
    return(pick(which(low)[1]))
  }

  has_other <- grepl("other", r2)
  if (any(has_other) && !all(has_other)) {
    return(pick(which(!has_other)[1]))
  }

  pick(1)
}
Usage
library(tidyverse)
table %>%
group_by(id) %>%
summarize(n=list(f(row1,row2))) %>%
unnest_wider(n)
or
library(data.table)
setDT(table)[, f(row1,row2), by=id]
Output:
id row1 row2
<num> <num> <char>
1: 1 2 foo
2: 2 2 bar
3: 3 1 bar
4: 4 2 other
5: 5 3 other

A dplyr solution:
# Apply the rules as two grouped filters, then keep the first remaining row
# of each id.
# NOTE(review): the filters run one after the other rather than as
# if / else-if, so the "other" filter is also applied to ids already reduced
# by the row1 filter.
table %>%
group_by(id) %>%
# Keep rows with row1 < 5; if the test gives the same answer for every row of
# the id (n_distinct(...) == 1), keep all of them so the id is not emptied.
filter(row1 < 5 | n_distinct(row1 < 5) == 1) %>%
# Same pattern for rows whose row2 does not contain "other".
filter(!grepl("other", row2) | n_distinct(grepl("other", row2)) == 1) %>%
slice(1) %>% ungroup()
# # A tibble: 5 × 3
# id row1 row2
# <dbl> <dbl> <chr>
# 1 1 2 foo
# 2 2 2 bar
# 3 3 1 bar
# 4 4 2 other
# 5 5 3 other
n_distinct(...) == 1 is used to determine if a condition is all TRUE or all FALSE.

Splitting a data.frame to a list of smaller data.frames containing a pair

I wonder how to split my data below such that I get a list of smaller data.frames, each of which contains a unique pair of type values?
My desired_output is shown below.
Note that this is just a toy data, so type can be any other variable. Also, note that if a particular type has just one row (like type == 4), I want to exclude that with a warning that says:
type 4 has just one row thus is excluded.
m=
"
obs type
1 1
2 1
3 a
4 a
5 3
6 3
7 4
"
data <- read.table(text = m, h=T)
desired_output <-list(
data.frame(obs=1:4, type=c(1,1,"a","a")),
data.frame(obs=c(1,2,5,6), type=c(1,1,3,3)),
data.frame(obs=3:6, type=c("a","a",3,3))
)
# warning: type 4 has just one row thus is excluded.

Here is base R function -
#' Split a data frame into one data frame per pair of values of a column
#'
#' Values of `data[[type]]` that occur in only one row are excluded, with a
#' warning that names them. Every pair of the remaining values yields one
#' data frame holding the rows for those two values.
#'
#' @param data A data frame.
#' @param type Name (string) of the column whose values are paired.
#' @return An unnamed list of data frames, one per pair; an empty list when
#'   fewer than two values occur more than once.
return_list_data <- function(data, type) {
  unique_counts <- table(data[[type]])

  single_count <- names(unique_counts[unique_counts == 1])
  if (length(single_count) > 0) {
    warning(sprintf('%s %s has just one row thus is excluded.', type, toString(single_count)))
  }

  multiple_count <- names(unique_counts[unique_counts > 1])
  # combn() raises an error when asked for pairs from fewer than two items.
  if (length(multiple_count) < 2) {
    return(list())
  }

  combn(multiple_count, 2, function(x) {
    data[data[[type]] %in% x, ]
  }, simplify = FALSE)
}
This returns -
return_list_data(data, 'type')
#[[1]]
# obs type
#1 1 1
#2 2 1
#5 5 3
#6 6 3
#[[2]]
# obs type
#1 1 1
#2 2 1
#3 3 a
#4 4 a
#[[3]]
# obs type
#3 3 a
#4 4 a
#5 5 3
#6 6 3
#Warning message:
#In return_list_data(data, "type") :
# type 4 has just one row thus is excluded.
No warning is generated if there is no type with single row i.e return_list_data(data[-7, ], 'type').

You may try using dplyr,
df1 <- read.table(text = m, h=T)
# Split `df1` into one grouped tibble per pair of `type` values.
# Types that occur in a single row are excluded and reported in one warning.
# Returns a list named V1, V2, ... (one element per pair), or an empty list
# when fewer than two types occur more than once.
fun <- function(df1) {
  war <- df1 %>%
    group_by(type) %>%
    filter(n() <= 1) %>%
    pull(type) %>%
    unique()
  if (length(war) > 0) {
    # toString() keeps this a single message when several types are dropped.
    warning(paste("type", toString(war), "has just one row thus is excluded"))
  }

  df2 <- df1 %>%
    group_by(type) %>%
    filter(n() > 1)

  types <- unique(df2$type)
  # combn() cannot form pairs from fewer than two values (and would expand a
  # single number n to 1:n), so stop here.
  if (length(types) < 2) {
    return(list())
  }

  # One column per pair; lapply() then iterates over the pairs.
  df3 <- as.data.frame(combn(types, 2))
  lapply(df3, function(x) {
    df2 %>%
      filter(type %in% x)
  })
}
fun(df1)
result:
$V1
# A tibble: 4 x 2
# Groups: type [2]
obs type
<int> <chr>
1 1 1
2 2 1
3 3 a
4 4 a
$V2
# A tibble: 4 x 2
# Groups: type [2]
obs type
<int> <chr>
1 1 1
2 2 1
3 5 3
4 6 3
$V3
# A tibble: 4 x 2
# Groups: type [2]
obs type
<int> <chr>
1 3 a
2 4 a
3 5 3
4 6 3
Warnings: In fun(df1) : type 4 has just one row thus is excluded

How to use paste within group_by (dplyr)?

I have following data (example):
id <- c(1, 1, 2, 2, 2)
x <- c(2, 2, 3, 3, 4)
dat <- data.frame(id, x)
Now I can count the occurrence of x by group (id) and save in dat2:
dat2 <- dat %>% group_by(id, x) %>% dplyr::mutate(count = n())
Now count cases for the id's:
dat2 <- dat2 %>% group_by(id) %>% dplyr::mutate(j = n())
This works all fine. Result:
dat2
# A tibble: 5 x 4
# Groups: id [2]
id x count j
<dbl> <dbl> <int> <int>
1 1 2 2 2
2 1 2 2 2
3 2 3 2 3
4 2 3 2 3
5 2 4 1 3
Now to my problem. I want to use paste within "group_by". To be more exact, i want to use two character-"placeholder" i (for id) and z (for x) to control the grouping. I don't want to use the "real" objects id and x:
i <- "id"
z <- "x"
dat2 <- dat %>% group_by(dat[[paste(i, sep = "")]], dat[[paste(z, sep = "")]]) %>% dplyr::mutate(count = n())
This first step also works, same as above. However, going into the next final step, an error occurs:
dat2 <- dat2 %>% group_by(dat[[paste(i, sep = "")]]) %>% dplyr::mutate(j = n ())
Error: Problem with `mutate()` input `..1`.
x Input `..1` can't be recycled to size 2.
i Input `..1` is `dat[[paste(i, sep = "")]]`.
i Input `..1` must be size 2 or 1, not 5.
i The error occured in group 1: dat[[paste(i, sep = "")]] = 1, dat[[paste(z, sep = "")]] = 2.
Run `rlang::last_error()` to see where the error occurred.
My question: How to avoid this error and get to the same result like before without using paste? Working with the paste command may look strange, but i need to work with a character-placeholder.
I am glad about any help!

We could use across instead of paste
library(dplyr)
dat %>%
group_by(across(all_of(c(i, z)))) %>%
mutate(count = n()) %>%
group_by(across(all_of(i))) %>%
mutate(j = n())
# A tibble: 5 x 4
# Groups: id [2]
id x count j
<dbl> <dbl> <int> <int>
1 1 2 2 2
2 1 2 2 2
3 2 3 2 3
4 2 3 2 3
5 2 4 1 3
Or instead of grouping, use add_count
dat %>%
add_count(across(all_of(c(i, z))), name = 'count') %>%
add_count(across(all_of(i)), name = 'j')
id x count j
1 1 2 2 2
2 1 2 2 2
3 2 3 2 3
4 2 3 2 3
5 2 4 1 3

Gather several columns at once in r

I am trying to gather() a data.frame, but somehow it is not doing what I want.
This is my data:
df <- data.frame("id" = c(1),
"reco_1"= c(2),
"sim_1" = c(2),
"title_1"= c(2),
"reco_2" = c(3),
"sim_2" = c(3),
"title_2"= c(3))
And this is what it looks like printed:
> df
id reco_1 sim_1 title_1 reco_2 sim_2 title_2
1 1 2 2 2 3 3 3
When I now gather() my df, it looks like this:
> df %>% gather(reco, sim, -id)
id reco sim
1 1 reco_1 2
2 1 sim_1 2
3 1 title_1 2
4 1 reco_2 3
5 1 sim_2 3
6 1 title_2 3
However, what I would like to have is the following structure:
id reco sim title
1 1 2 2 2
2 2 3 3 3
I would appreciate any help, since I do not even know whether gather() is even the right verb for it.

We can use pivot_longer
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-id, names_to = c(".value", "new_id"), names_sep = "_") %>%
select(-id)
# A tibble: 2 x 4
new_id reco sim title
<chr> <dbl> <dbl> <dbl>
1 1 2 2 2
2 2 3 3 3

Removing mirrored combinations of variables in a data frame

I'm looking to get each unique combination of two variables:
library(purrr)
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`)
# A tibble: 6 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 1 2
4 3 2
5 1 3
6 2 3
How do I remove out the mirrored combinations? That is, I want only one of rows 1 and 3 in the data frame above, only one of rows 2 and 5, and only one of rows 4 and 6. My desired output would be something like:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2
I don't care if a particular id value is in id1 or id2, so the below is just as acceptable as the output:
# A tibble: 3 x 2
id1 id2
<int> <int>
1 1 2
2 1 3
3 2 3

A tidyverse version of Dan's answer:
cross_df(list(id1 = seq_len(3), id2 = seq_len(3)), .filter = `==`) %>%
mutate(min = pmap_int(., min), max = pmap_int(., max)) %>% # Find the min and max in each row
unite(check, c(min, max), remove = FALSE) %>% # Combine them in a "check" variable
distinct(check, .keep_all = TRUE) %>% # Remove duplicates of the "check" variable
select(id1, id2)
# A tibble: 3 x 2
id1 id2
<int> <int>
1 2 1
2 3 1
3 3 2

A Base R approach:
# Build an order-independent key for each pair: smaller id, "_", larger id.
# The separator keeps multi-digit ids apart (1,12 and 11,2 would otherwise
# both give "112"). pmin()/pmax() work on the two id columns directly, so an
# existing `temp` column does not leak into the key when this is re-run.
df$temp <- paste(pmin(df[[1]], df[[2]]), pmax(df[[1]], df[[2]]), sep = "_")
# then you can simply keep rows with a unique key value
df[!duplicated(df$temp), 1:2]

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Merge where some ids are concatenated in single column - r

Related

How to use R summarise with multiple numeric and text-based conditional subsets

Splitting a data.frame to a list of smaller data.frames containing a pair

How to use paste within group_by (dplyr)?

Gather several columns at once in r

Removing mirrored combinations of variables in a data frame

Categories

Resources