Label distinct combinations across multiple columns in R - r

I want to create a new column that labels each unique combination of values across x, y, z columns. My current work-around to achieve that is this:
> library(tidyverse)
>
> set.seed(100)
> df = tibble(x = sample.int(5, 50, replace = T), y = sample.int(5, 50, replace = T), z = sample.int(5, 50, replace = T))
> df
# A tibble: 50 x 3
x y z
<int> <int> <int>
1 2 4 4
2 3 4 4
3 1 3 5
4 2 1 4
5 4 2 5
6 4 5 2
7 2 3 4
8 3 5 4
9 2 4 1
10 5 5 2
# … with 40 more rows
>
> df2 = df %>% distinct(x,y,z) %>% rowid_to_column("unique_id") %>% left_join(df)
Joining, by = c("x", "y", "z")
> df2
# A tibble: 50 x 4
unique_id x y z
<int> <int> <int> <int>
1 1 2 4 4
2 2 3 4 4
3 3 1 3 5
4 4 2 1 4
5 4 2 1 4
6 5 4 2 5
7 5 4 2 5
8 6 4 5 2
9 6 4 5 2
10 7 2 3 4
# … with 40 more rows
What is a better/more efficient way to do this on a fairly large dataset? I'd like to stay within tidyverse but also open to other suggestions.

You could use rleidv from data.table
df$unique_id <- data.table::rleidv(df)
In dplyr, we can use group_indices function for this purpose which generates a unique id for each group of values.
library(dplyr)
df %>% mutate(unique_id = group_indices(., x, y, z))

In the devel version of dplyr, we can use cur_group_id
library(dplyr)
df %>%
group_by_all() %>%
mutate(unique_id = cur_group_id())
Or using .GRP from data.table
library(data.table)
setDT(df)[, unique_id := .GRP, names(df)]

Related

Re-ordering a factor (in a table) with hierarchical groups in R

Suppose the following table with two factor variabels and one numerical variable:
df <- tibble(
x = as_factor(c("a", "a", "a", "b", "b", "b")),
y = as_factor(1:6),
val = c(10, 3, 8, 2, 6, 1)
)
> df
# A tibble: 6 x 3
x y val
<fct> <fct> <dbl>
1 a 1 10
2 a 2 3
3 a 3 8
4 b 4 2
5 b 5 6
6 b 6 1
I would like to re-order y such that the sum of val, when grouped by x, takes precedent, but y is still ordered by val. To illustrate the goal:
# A tibble: 6 x 4
# Groups: x [2]
x y val sum
<fct> <fct> <dbl> <dbl>
1 a 1 10 21 # all y for which x=="a" come first, because
2 a 3 8 21 # the sum of val for x=="a" is greater than
3 a 2 3 21 # for x=="b"
4 b 5 6 9 # within each group, y is ordered by val
5 b 4 2 9
6 b 6 1 9
But how do I get there? Within tidyverse, I tried to solve it with forcats::fct_reorder(), thinking that grouping might help (df |> group_by(x) |> mutate(y = fct_reorder(y, val))), but it doesn't.
Can fct_reorder() do that at all? What other approaches could work?
Edit: I have found a solution, but it feels rather hacky:
df |>
group_by(x) |>
mutate(sum = sum(val)) |>
arrange(desc(sum), desc(val)) |> ungroup() |>
tibble::rowid_to_column() |>
mutate(across(c(x, y), \(x) fct_reorder(x, rowid)))
Perhaps, we need to arrange
library(dplyr)
library(forcats)
df %>%
arrange(desc(ave(val, x, FUN = sum)), desc(val)) %>%
mutate(across(where(is.factor), fct_inorder))
-output
# A tibble: 6 × 3
x y val
<fct> <fct> <dbl>
1 a 1 10
2 a 3 8
3 a 2 3
4 b 5 6
5 b 4 2
6 b 6 1
Or use fct_reorder/reorder in arrange
df %>%
arrange(desc(fct_reorder(x, val, .fun = sum)), desc(val)) %>%
mutate(across(where(is.factor), fct_inorder)
Probably we can use the following data.table option along with fct_inorder
setorder(
setDT(df)[
,
sum := sum(val), x
],
-sum, -val
)[
,
lapply(
.SD,
function(x) ifelse(is.factor(x), fct_inorder, c)(x)
)
]
and you will obtain
x y val sum
1: a 1 10 21
2: a 3 8 21
3: a 2 3 21
4: b 5 6 9
5: b 4 2 9
6: b 6 1 9

dplyr: Mutate a new column with sequential repeated integers of n time in a dataframe

I am struggling with one maybe easy question. I have a dataframe of 1 column with n rows (n is a multiple of 3). I would like to add a second column with integers like: 1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,.. How can I achieve this with dplyr as a general solution for different length of rows (all multiple of 3).
I tried this:
df <- tibble(Col1 = c(1:12)) %>%
mutate(Col2 = rep(1:4, each=3))
This works. But I would like to have a solution for n rows, each = 3 . Many thanks!
You can specify each and length.out parameter in rep.
library(dplyr)
tibble(Col1 = c(1:12)) %>%
mutate(Col2 = rep(row_number(), each=3, length.out = n()))
# Col1 Col2
# <int> <int>
# 1 1 1
# 2 2 1
# 3 3 1
# 4 4 2
# 5 5 2
# 6 6 2
# 7 7 3
# 8 8 3
# 9 9 3
#10 10 4
#11 11 4
#12 12 4
We can use gl
library(dplyr)
df %>%
mutate(col2 = as.integer(gl(n(), 3, n())))
As integer division i.e. %/% 3 over a sequence say 0:n will result in 0, 0, 0, 1, 1, 1, ... adding 1 will generate the desired sequence automatically, so simply this will also do
df %>% mutate(col2 = 1+ (row_number()-1) %/% 3)
# A tibble: 12 x 2
Col1 col2
<int> <dbl>
1 1 1
2 2 1
3 3 1
4 4 2
5 5 2
6 6 2
7 7 3
8 8 3
9 9 3
10 10 4
11 11 4
12 12 4

Merging columns while ignoring NAs

I would like to merge multiple columns. Here is what my sample dataset looks like.
df <- data.frame(
id = c(1,2,3,4,5),
cat.1 = c(3,4,NA,4,2),
cat.2 = c(3,NA,1,4,NA),
cat.3 = c(3,4,1,4,2))
> df
id cat.1 cat.2 cat.3
1 1 3 3 3
2 2 4 NA 4
3 3 NA 1 1
4 4 4 4 4
5 5 2 NA 2
I am trying to merge columns cat.1 cat.2 and cat.3. It is a little complicated for me since there are NAs.
I need to have only one cat variable and even some columns have NA, I need to ignore them. The desired output is below:
> df
id cat
1 1 3
2 2 4
3 3 1
4 4 4
5 5 2
Any thoughts?
Another variation of Gregor's answer using dplyr::transmute:
library(dplyr)
df %>%
transmute(id = id, cat = coalesce(cat.1, cat.2, cat.3))
#> id cat
#> 1 1 3
#> 2 2 4
#> 3 3 1
#> 4 4 4
#> 5 5 2
With dplyr:
library(dplyr)
df %>%
mutate(cat = coalesce(cat.1, cat.2, cat.3)) %>%
select(-cat.1, -cat.2, -cat.3)
An option with fcoalesce from data.table
library(data.table)
setDT(df)[, .(id, cat = do.call(fcoalesce, .SD)), .SDcols = patterns('^cat')]
-output
# id cat
#1: 1 3
#2: 2 4
#3: 3 1
#4: 4 4
#5: 5 2
Does this work:
> library(dplyr)
> df %>% rowwise() %>% mutate(cat = mean(c(cat.1, cat.2, cat.3), na.rm = T)) %>% select(-(2:4))
# A tibble: 5 x 2
# Rowwise:
id cat
<dbl> <dbl>
1 1 3
2 2 4
3 3 1
4 4 4
5 5 2
Since values across rows are unique, mean of the rows will return the same unique value, can also go with max or min.
Here is a base R solution which uses apply:
df$cat <- apply(df, 1, function(x) unique(x[!is.na(x)][-1]))

Sum of individual elements in a vector

I would like to determine the sum for each individual element in a vector.
For example, suppose I have the vector
x <- c(2,3,2,2,5,5,3,3)
and I want to find the sum for each element.
The answer would be something like
2: 6
3: 9
5: 10
This is because there are three 2's (2+2+2 or 2*), etc.
In other words, I want to essentially multiply the number times the number of times that element is found in the vector.
Using base R tapply
tapply(x, x, sum)
# 2 3 5
# 6 9 10
If you need it as dataframe wrap it in stack
stack(tapply(x, x, sum))
# values ind
#1 6 2
#2 9 3
#3 10 5
If you convert this to a dataframe then this becomes (How to sum a variable by group)
library(dplyr)
tibble::tibble(x) %>%
group_by(x) %>%
summarise(n = sum(x))
# A tibble: 3 x 2
# x n
# <dbl> <dbl>
#1 2 6
#2 3 9
#3 5 10
A method with dplyr:
x <- c(2,3,2,2,5,5,3,3)
a = tibble(x)
a %>% count(x) %>% mutate(xn = x*n)
# A tibble: 3 x 3
x n xn
<dbl> <int> <dbl>
1 2 3 6
2 3 3 9
3 5 2 10
Lots of ways to do this. A couple of base approaches:
with(rle(sort(x)), data.frame(val = values, freq = lengths, prod = lengths*values))
val freq prod
1 2 3 6
2 3 3 9
3 5 2 10
Or:
transform(as.data.frame(table(x), stringsAsFactors = FALSE), sum = as.numeric(x) * Freq)
x Freq sum
1 2 3 6
2 3 3 9
3 5 2 10
library(tidyverse)
x <- c(2,3,2,2,5,5,3,3)
tibble(x) %>%
count(x) %>%
mutate(xn = x*n ) %>%
pull(xn)
We can use rowsum from base R
rowsum(x, group = x)
# [,1]
#2 6
#3 9
#5 10
Or with by
by(x, x, FUN = sum)
Or with split
sapply(split(x, x), sum)
# 2 3 5
# 6 9 10
Or another option with xtabs
xtabs(x1 ~ x, cbind(x1 = x, x))
# 2 3 5
# 6 9 10
Or with ave
unique(data.frame(x, Sum = ave(x, x, FUN = sum)))
# x Sum
#1 2 6
#2 3 9
#5 5 10
Or using data.table
library(data.table)
data.table(grp = x, x=x)[, .(Sum = sum(x)), grp]
# grp Sum
#1: 2 6
#2: 3 9
#3: 5 10

Dense Rank by Multiple Columns in R

How can I get a dense rank of multiple columns in a dataframe? For example,
# I have:
df <- data.frame(x = c(1,1,1,1,2,2,2,3,3,3),
y = c(1,2,3,4,2,2,2,1,2,3))
# I want:
res <- data.frame(x = c(1,1,1,1,2,2,2,3,3,3),
y = c(1,2,3,4,2,2,2,1,2,3),
r = c(1,2,3,4,5,5,5,6,7,8))
res
x y z
1 1 1 1
2 1 2 2
3 1 3 3
4 1 4 4
5 2 2 5
6 2 2 5
7 2 2 5
8 3 1 6
9 3 2 7
10 3 3 8
My hack approach works for this particular dataset:
df %>%
arrange(x,y) %>%
mutate(r = if_else(y - lag(y,default=0) == 0, 0, 1)) %>%
mutate(r = cumsum(r))
But there must be a more general solution, maybe using functions like dense_rank() or row_number(). But I'm struggling with this.
dplyr solutions are ideal.
Right after posting, I think I found a solution here. In my case, it would be:
mutate(df, r = dense_rank(interaction(x,y,lex.order=T)))
But if you have a better solution, please share.
data.table
data.table has you covered with frank().
library(data.table)
frank(df, x,y, ties.method = 'min')
[1] 1 2 3 4 5 5 5 8 9 10
You can df$r <- frank(df, x,y, ties.method = 'min') to add as a new column.
tidyr/dplyr
Another option (though clunkier) is to use tidyr::unite to collapse your columns to one plus dplyr::dense_rank.
library(tidyverse)
df %>%
# add a single column with all the info
unite(xy, x, y) %>%
cbind(df) %>%
# dense rank on that
mutate(r = dense_rank(xy)) %>%
# now drop the helper col
select(-xy)
You can use cur_group_id:
library(dplyr)
df %>%
group_by(x, y) %>%
mutate(r = cur_group_id())
# x y r
# <dbl> <dbl> <int>
# 1 1 1 1
# 2 1 2 2
# 3 1 3 3
# 4 1 4 4
# 5 2 2 5
# 6 2 2 5
# 7 2 2 5
# 8 3 1 6
# 9 3 2 7
# 10 3 3 8

Resources