Remove rows found in more than 3 groups - r

I have a dataframe and I am trying to remove the rows whose value in column b is present in 3 or more groups. In my example below, bike is the common value across 3 groups and I need to remove it. Please help me achieve this.
df <- data.frame(a = c("name1","name1","name1","name2","name2","name2","name3"), b=c("car","bike","bus","train","bike","tour","bike"))
df
a b
name1 car
name1 bike
name1 bus
name2 train
name2 bike
name2 tour
name3 bike
Expected Output:
a b
name1 car
name1 bus
name2 train
name2 tour

You can use dplyr::n_distinct():
library(dplyr)
n_gr <- 3
cn <- df %>%
  group_by(b) %>%
  summarise(na = n_distinct(a)) %>%
  filter(na >= n_gr) %>%
  pull(b)
df <- df %>% filter(!(b %in% cn))
Output
a b
1 name1 car
2 name1 bus
3 name2 train
4 name2 tour
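Equivalently, starting from the original df, the same logic can be written in a single step by filtering within groups (a minimal sketch, assuming dplyr is loaded):
df %>%
  group_by(b) %>%
  filter(n_distinct(a) < 3) %>%
  ungroup()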

In base R you could do this...
df[ave(as.numeric(as.factor(df$a)), # convert a to numbers (factor levels), as required by ave()
       df$b,                        # group by b
       FUN = length) < 3, ]         # keep rows whose b occurs in fewer than 3 groups
a b
1 name1 car
3 name1 bus
4 name2 train
6 name2 tour

Using data.table (here .N counts rows per b, which equals the number of groups because each a–b pair occurs only once):
library(data.table)
setDT(df)[, count := .N, by = b]  ## convert df to a data.table and add a row count per b
df <- df[count < 3]               ## keep rows whose b appears fewer than 3 times
df[, count := NULL]               ## drop the helper column
df
a b
1: name1 car
2: name1 bus
3: name2 train
4: name2 tour
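If the same a–b pair could occur more than once, counting rows would overcount the groups; a sketch (starting again from the original df) that counts distinct a values per b instead, using data.table's uniqueN():
library(data.table)
setDT(df)
df[df[, .I[uniqueN(a) < 3], by = b]$V1]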

Using base R (note the use of %in% rather than !=, so the subset is still correct when more than one value of b qualifies):
df <- data.frame(a = c("name1","name1","name1","name2","name2","name2","name3"),
                 b = c("car","bike","bus","train","bike","tour","bike"))
lst <- table(df$b)
df[!(df$b %in% names(lst)[lst >= 3]), ]
# a b
# 1 name1 car
# 3 name1 bus
# 4 name2 train
# 6 name2 tour

Related

R: Lookup value for calculation (similar to vlookup in Excel)?

I've got the following dataframes in R.
Name <- c("Tom", "Bill", "Jeffrey", "George", "David")
Value <- c("5.24", "5.48", "6.32", "6.07", "5.1")
df <- data.frame(Name, Value)
And:
Name1 <- c("Tom", "George", "David")
Name2 <- c("Jeffrey", "Bill", "Tom")
df2 <- data.frame(Name1, Name2)
I want to create another column in df2 which will produce the following:
Name1 * Name2 (based on value from df)
What is the best way to achieve this in R?
I know you can use the VLOOKUP function in Excel.
We could use match to get the index for subsetting the 'Value' column, and Reduce to do elementwise multiplication.
# as Value is created with quotes, it is of class character
# use type.convert to change the column types
df <- type.convert(df, as.is = TRUE)
df2$Value <- Reduce(`*`, lapply(df2, function(x) df$Value[match(x, df$Name)]))
-output
> df2
Name1 Name2 Value
1 Tom Jeffrey 33.1168
2 George Bill 33.2636
3 David Tom 26.7240
> with(df, Value[Name == 'Tom'] * Value[Name == "Jeffrey"])
[1] 33.1168
Or without Reduce
with(df2, df$Value[match(Name1, df$Name)] * df$Value[match(Name2, df$Name)])
[1] 33.1168 33.2636 26.7240
Or using tidyverse
library(dplyr)
library(purrr)
df2 %>%
  mutate(Value = across(everything(), ~ df$Value[match(.x, df$Name)]) %>%
           reduce(`*`))
-output
Name1 Name2 Value
1 Tom Jeffrey 33.1168
2 George Bill 33.2636
3 David Tom 26.7240
Using tapply, first convert Value to numeric:
df$Value <- as.numeric(df$Value)
Then:
df2$val <- tapply(df$Value[match(unlist(df2), df$Name)], row(df2), prod)
Name1 Name2 val
1 Tom Jeffrey 33.1168
2 George Bill 33.2636
3 David Tom 26.7240
library(dplyr)
df2 %>%
  left_join(df, join_by(Name1 == Name)) %>%
  left_join(df, join_by(Name2 == Name)) %>%
  mutate(mult = as.numeric(Value.x) * as.numeric(Value.y))
Result
Name1 Name2 Value.x Value.y mult
1 Tom Jeffrey 5.24 6.32 33.1168
2 George Bill 6.07 5.48 33.2636
3 David Tom 5.1 5.24 26.7240
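If you want something that reads like Excel's VLOOKUP, the match() idiom above can be wrapped in a small helper (a sketch; vlookup here is a made-up name, and Value is converted to numeric first):
vlookup <- function(keys, table, key_col, value_col) {
  table[[value_col]][match(keys, table[[key_col]])]
}
df$Value <- as.numeric(df$Value)
df2$Value <- vlookup(df2$Name1, df, "Name", "Value") * vlookup(df2$Name2, df, "Name", "Value")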

How to rename columns in R with dplyr using a character object?

My data frame is as such:
#generic dataset
datatest <- data.frame(col1 = c(1,2,3,4), col2 = c('A', 'B', 'C', 'D'))
#character objects
name1 <- 'A'
name2 <- 'B'
I want to rename my columns using the name1 and name2 objects. These dynamically change in the code so I can't use the following:
#I DON'T WANT THIS
datatest %>% rename(A = col1, B = col2)
I want to use this:
datatest %>% rename(name1 = col1, name2 = col2)
but then the data table columns end up becoming 'name1' and 'name2' respectively, when they should be A and B. Here is the data table at the moment.
(name1 should be A, name2 should be B)
  name1 name2
1     1     A
2     2     B
3     3     C
4     4     D
Any help is hugely appreciated. I have the same issue with kable tables too.
Thanks in advance!
Couple of options -
Using rename_with -
library(dplyr)
name1 <- 'A'
name2 <- 'B'
datatest %>% rename_with(~c(name1, name2), c(col1, col2))
#If there are only two columns in datatest
datatest %>% rename_with(~c(name1, name2))
# A B
#1 1 A
#2 2 B
#3 3 C
#4 4 D
Use a named vector
name <- c(A = 'col1', B = 'col2')
datatest %>% rename(!!name)
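The same named vector can also be passed through tidyselect's all_of(), which avoids the warning about using an external vector in selections (a sketch of an equivalent spelling):
datatest %>% rename(all_of(name))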
You may try
datatest %>% rename({{name1}} := col1, {{name2}} := col2)
A B
1 1 A
2 2 B
3 3 C
4 4 D
Here is one more option using !!!setNames
datatest %>%
  rename(!!!setNames(names(.), c(name1, name2)))
A B
1 1 A
2 2 B
3 3 C
4 4 D
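Since the question also mentions kable tables: there the same dynamic names can be passed to the col.names argument (a minimal sketch, assuming knitr is available):
library(knitr)
kable(datatest, col.names = c(name1, name2))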

tidyverse solution: is there a way to keep only rows when a certain word/value occurs e.g. 3x in a column

Lets say the data looks like this
A <- c("name1", "name2", "name3", "name1", "name1", "name4")
B <- c(10, 8, 7, 3, -1, -2)
C <- c(8, 3, -1, -10, -2, -2)
df <- data.frame(A, B, C)
df
A B C
1 name1 10 8
2 name2 8 3
3 name3 7 -1
4 name1 3 -10
5 name1 -1 -2
6 name4 -2 -2
Now there must be a smart way to "collect" ONLY the rows that have triplicated values in the first column (A) into a new dataframe. For this particular example that would be all rows with "name1", because it is repeated three times. How can you detect and keep rows whose value occurs a given number of times (three here, or any other arbitrary number) when the dataset is very large?
dplyr
library(dplyr)
df %>%
  group_by(A) %>%
  filter(n() == 3)
base R
df[df$A %in% names(which(table(df$A) == 3)), ]
output
A B C
1 name1 10 8
2 name1 3 -10
3 name1 -1 -2
Slightly different dplyr approach:
df %>%
  add_count(A, name = "A_count") %>%
  filter(A_count == 3) %>%
  select(-A_count)
add_count() adds a count of each value of A (named explicitly here, otherwise the new column would be called n); filter() keeps the rows whose count is 3, and select(-A_count) removes the helper column.
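To make the count (and the column) arbitrary rather than hard-coded to 3, the filter can be wrapped in a small function (a sketch; keep_repeated is a made-up name):
library(dplyr)
keep_repeated <- function(data, col, times) {
  data %>%
    group_by({{ col }}) %>%
    filter(n() == times) %>%
    ungroup()
}
keep_repeated(df, A, 3)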

How to create a new data table based on pairwise combinations of a subset of column names?

I am trying to define a function that takes a data frame or data table as input with a given number of ID columns (e.g., 2 or 3), while the remaining columns NAME1, NAME2, ..., NAMEK are numeric. The output should be a data table consisting of the same ID columns as before plus one additional ID column that identifies each unique pairwise combination of the column names (NAME1, NAME2, ...). In addition, the actual values of the numeric columns must be gathered into two new columns according to that ID column. Here is an example with two ID columns and three numeric columns:
ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DT <- data.table(ID1,ID2,NAME1,NAME2,NAME3)
I want the output of the function with DT as input to be
ID.new <- c("NAME1 - NAME2","NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME2",
"NAME1 - NAME2","NAME1 - NAME2", "NAME1 - NAME3", "NAME1 - NAME3",
"NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3","NAME1 - NAME3",
"NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3","NAME2 - NAME3",
"NAME2 - NAME3", "NAME2 - NAME3")
ID1 <- c("A","A","A","B","B","B","A","A","A","B","B","B","A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3)
value.left <- c(10,11,9,22,25,22,10,11,9,22,25,22,7,9,8,20,22,21)
value.right <- c(7,9,8,20,22,21,10,12,11,15,19,30,10,12,11,15,19,30)
DT.output <- data.table(ID.new,ID1,ID2,value.left,value.right)
I have found that fun() (see below) does the job, but is too slow for my liking:
fun <- function(data, ID.cols){
  data <- data.table(data)
  # which of the columns are ID columns
  ids <- which(colnames(data) %in% ID.cols)
  # obtain all pairwise combinations of numeric columns as a list
  numeric.combs <- combn(x = data[, !ids, with = FALSE], m = 2, simplify = FALSE)
  id.cols <- data[, ids, with = FALSE]
  # bind the ID columns to each pairwise combination of numeric columns
  bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols, x))
  # create generic names for the numeric columns so that rbindlist() can be applied,
  # and add a column identifying which pair of columns each block came from
  generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
    setattr(x = x[, ID.NEW := paste(colnames(x[, !ids, with = FALSE]), collapse = " - ")],
            name = 'names', value = c(ID.cols, "value.left", "value.right", "ID.NEW"))))
  return(rbindlist(l = generalize))
}
# Performance
library(microbenchmark)
print(microbenchmark(fun(DT, ID.cols = c("ID1", "ID2")), times = 1000))
Is there a faster and more elegant way to do this?
A melted, self-join option:
library(data.table)
DTlong <- melt(DT, id.vars = c("ID1", "ID2"), variable.factor = FALSE)
out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
              ][variable < i.variable,
              ][, .(ID.new = paste(variable, i.variable, sep = " - "),
                    ID1, ID2, value.left = value, value.right = i.value)]
out
# ID.new ID1 ID2 value.left value.right
# <char> <char> <num> <num> <num>
# 1: NAME1 - NAME2 A 1 10 7
# 2: NAME1 - NAME2 A 2 11 9
# 3: NAME1 - NAME2 A 3 9 8
# 4: NAME1 - NAME2 B 1 22 20
# 5: NAME1 - NAME2 B 2 25 22
# 6: NAME1 - NAME2 B 3 22 21
# 7: NAME1 - NAME3 A 1 10 10
# 8: NAME2 - NAME3 A 1 7 10
# 9: NAME1 - NAME3 A 2 11 12
# 10: NAME2 - NAME3 A 2 9 12
# 11: NAME1 - NAME3 A 3 9 11
# 12: NAME2 - NAME3 A 3 8 11
# 13: NAME1 - NAME3 B 1 22 15
# 14: NAME2 - NAME3 B 1 20 15
# 15: NAME1 - NAME3 B 2 25 19
# 16: NAME2 - NAME3 B 2 22 19
# 17: NAME1 - NAME3 B 3 22 30
# 18: NAME2 - NAME3 B 3 21 30
### validation
setorder(out, ID.new, ID1, ID2)
identical(DT.output, out)
# [1] TRUE
The methodology of combn is sound thinking, certainly; however, its main inefficiency is that it iterates once per combination of columns. That is, the function passed to combn(..., FUN=) is called once per pair of NAME columns; if your data has many more columns, it will be called many more times. In the case of a merge/join as here, though, everything is done in as vectorized a way as we can manage: the merge is done efficiently, the filtering is a single logical vector, and the paste(..) is one large vectorized call.
The merge concept does have its own inefficiency, to be fair: it initially produces 54 rows due to the cartesian join. This can cause memory-exhaustion problems with much larger data. If you run into this, it may help to include the variable < variable condition (LHS vs RHS) directly in the join, as in a non-equi join (or via fuzzyjoin), which should reduce (if not completely remove) the problem.
This last recommendation can be done in sqldf as well:
sqldf::sqldf("
select t1.variable || ' - ' || t2.variable as [ID.new], t1.ID1, t1.ID2,
t1.value as [value.left], t2.value as [value.right]
from DTlong t1
join DTlong t2 on t1.ID1=t2.ID1 and t1.ID2=t2.ID2
and t1.variable < t2.variable")
# ID.new ID1 ID2 value.left value.right
# 1 NAME1 - NAME2 A 1 10 7
# 2 NAME1 - NAME3 A 1 10 10
# 3 NAME1 - NAME2 A 2 11 9
# 4 NAME1 - NAME3 A 2 11 12
# 5 NAME1 - NAME2 A 3 9 8
# 6 NAME1 - NAME3 A 3 9 11
# 7 NAME1 - NAME2 B 1 22 20
# 8 NAME1 - NAME3 B 1 22 15
# 9 NAME1 - NAME2 B 2 25 22
# 10 NAME1 - NAME3 B 2 25 19
# 11 NAME1 - NAME2 B 3 22 21
# 12 NAME1 - NAME3 B 3 22 30
# 13 NAME2 - NAME3 A 1 7 10
# 14 NAME2 - NAME3 A 2 9 12
# 15 NAME2 - NAME3 A 3 8 11
# 16 NAME2 - NAME3 B 1 20 15
# 17 NAME2 - NAME3 B 2 22 19
# 18 NAME2 - NAME3 B 3 21 30
Benchmarking:
bench::mark(
  pernkf   = fun(DT, c("ID1", "ID2")),
  tjebo    = fun2(DT, c("ID1", "ID2")),
  r2evans  = fun3(DT, c("ID1", "ID2")), # native data.table
  r2evans2 = fun4(),                    # sqldf
  check = FALSE)
# # A tibble: 4 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
# 1 pernkf 5.38ms 6.06ms 161. 287KB 13.2 61 5 379ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 2 tjebo 5.08ms 5.63ms 172. 230KB 8.83 78 4 453ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 3 r2evans 2.97ms 3.48ms 280. 170KB 11.0 127 5 454ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
# 4 r2evans2 17.19ms 18.91ms 52.0 145KB 13.0 20 5 384ms <NULL> <Rprofmem[,3~ <bch:tm~ <tibble [~
(sqldf does take a performance hit in this example, I'd welcome improved queries :-)
If you can use a data frame, the approach below is currently the most speed- and memory-efficient one (see the benchmark wiki).
I think the approach using combn() is reasonable, and it iterates once per pair of columns rather than once per output row. Moreover, I personally find it easier to read than the data.table melt version, though that is probably because I'm not used to data.table syntax.
Note: using this on a data table is apparently not efficient. If you really need a data.table, r2evans' solution is better.
fun2 <- function(data, ID.cols){
  ids <- which(colnames(data) %in% ID.cols)
  ## you can loop over the combinations directly
  new_dat <- combn(data[-ids], 2, function(x) {
    new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
    ## use paste with collapse for the ID.new
    new_x$ID.new <- paste(names(x), collapse = " - ")
    new_x
  }, simplify = FALSE)
  ## bind it with the old ID columns, outside the loop (a bit faster)
  cbind(do.call(rbind, new_dat), data[ids])
}
fun2(DT,ID.cols = c("ID1", "ID2"))
#> value.left value.right ID.new ID1 ID2
#> 1 10 7 NAME1 - NAME2 A 1
#> 2 11 9 NAME1 - NAME2 A 2
#> 3 9 8 NAME1 - NAME2 A 3
#> 4 22 20 NAME1 - NAME2 B 1
#> 5 25 22 NAME1 - NAME2 B 2
#> 6 22 21 NAME1 - NAME2 B 3
#> 7 10 10 NAME1 - NAME3 A 1
#> 8 11 12 NAME1 - NAME3 A 2
#> 9 9 11 NAME1 - NAME3 A 3
#> 10 22 15 NAME1 - NAME3 B 1
#> 11 25 19 NAME1 - NAME3 B 2
#> 12 22 30 NAME1 - NAME3 B 3
#> 13 7 10 NAME2 - NAME3 A 1
#> 14 9 12 NAME2 - NAME3 A 2
#> 15 8 11 NAME2 - NAME3 A 3
#> 16 20 15 NAME2 - NAME3 B 1
#> 17 22 19 NAME2 - NAME3 B 2
#> 18 21 30 NAME2 - NAME3 B 3
For a benchmark, please see the community wiki.
Benchmarking (reprex). If you don't really need a data table, base R seems to do the trick here.
Note that this compares r2evans' and pernkf's functions on a data table with tjebo's and tarjae's functions on a data frame.
The methods suggested by Peace Wang are currently not included, as they are either not scalable to k columns or give an incorrect result.
bench::mark(
  pernkf  = fun(DT, c("ID1", "ID2")),
  tjebo   = fun2(DF, c("ID1", "ID2")),
  r2evans = fun3(DT, c("ID1", "ID2")),
  tarjae  = fun4(DF, c("ID1", "ID2")),
  check = FALSE)
#> # A tibble: 4 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 pernkf 2.95ms 3.2ms 302. 2.29MB 6.33
#> 2 tjebo 359.33µs 373.85µs 2423. 18.65KB 10.5
#> 3 r2evans 1.65ms 1.79ms 535. 756.16KB 6.30
#> 4 tarjae 26.49ms 27.74ms 34.3 4.75MB 7.35
m <- microbenchmark::microbenchmark(
  pernkf  = fun(DT, ID.cols = c("ID1", "ID2")),
  r2evans = fun3(DT, ID.cols = c("ID1", "ID2")),
  tjebo   = fun2(DF, ID.cols = c("ID1", "ID2")),
  tarjae  = fun4(DF, c("ID1", "ID2")),
  times = 1000
)
m
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> pernkf 2885.714 3055.1450 3439.1257 3150.457 3298.404 95391.80 1000
#> r2evans 1629.028 1739.5715 1949.8389 1829.696 1922.227 10843.33 1000
#> tjebo 354.714 410.0975 469.1457 427.948 443.237 4344.00 1000
#> tarjae 25854.416 26564.8420 29103.6948 27142.758 30982.328 118592.10 1000
ggplot2::autoplot(m)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.
Data and functions
library(tidyverse)
library(data.table)
ID1 <- c("A","A","A","B","B","B")
ID2 <- c(1,2,3,1,2,3)
NAME1 <- c(10,11,9,22,25,22)
NAME2 <- c(7,9,8,20,22,21)
NAME3 <- c(10,12,11,15,19,30)
DF <- data.frame(ID1,ID2,NAME1,NAME2,NAME3)
DT <- data.table(DF)
fun <- function(data, ID.cols){
  data <- data.table(data)
  ids <- which(colnames(data) %in% ID.cols)
  numeric.combs <- combn(x = data[, !ids, with = FALSE], m = 2, simplify = FALSE)
  id.cols <- data[, ids, with = FALSE]
  bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols, x))
  generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
    setattr(x = x[, ID.NEW := paste(colnames(x[, !ids, with = FALSE]), collapse = " - ")],
            name = 'names', value = c(ID.cols, "value.left", "value.right", "ID.NEW"))))
  return(rbindlist(l = generalize))
}
fun2 <- function(data, ID.cols){
  ids <- which(colnames(data) %in% ID.cols)
  new_dat <- combn(data[-ids], 2, function(x) {
    new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
    new_x$ID.new <- paste(names(x), collapse = " - ")
    new_x
  }, simplify = FALSE)
  cbind(do.call(rbind, new_dat), data[ids])
}
fun3 <- function(data, ID.cols) {
  DTlong <- melt(data, id.vars = ID.cols, variable.factor = FALSE)
  out <- DTlong[DTlong, on = .(ID1, ID2), allow.cartesian = TRUE
                ][variable < i.variable,
                ][, .(ID.new = paste(variable, i.variable, sep = " - "),
                      ID1, ID2, value.left = value, value.right = i.value)]
  out
}
fun4 <- function(x, id.cols){
  DT1 <- DT %>%
    pivot_longer(-id.cols) %>%
    mutate(name1 = lead(name, default = last(name)),
           value1 = lead(value, default = last(value))) %>%
    arrange(name, name1) %>%
    group_by(name) %>%
    mutate(n = n()) %>%
    mutate(name_nr = parse_number(name)) %>%
    ungroup()
  DT1 %>%
    mutate(name1 = lead(name, unique(n) * (max(name_nr) - min(name_nr)))) %>%
    mutate(value1 = lead(value, unique(n) * (max(name_nr) - min(name_nr)))) %>%
    slice(seq_len(first(n))) %>%
    bind_rows(DT1 %>% slice(1:(n() - unique(n))), .) %>%
    mutate(ID.new = paste(name, name1, sep = " - "), .before = 1) %>%
    select(ID.new, ID1, ID2, value.left = value, value.right = value1) %>%
    arrange(ID.new)
}
Check if the solutions are the same:
## convert all to data frame
## column names and order need to be the same
## rows need to be sorted in the same way (caveat row names!)
preparetocompare <- function(x){
  x <- data.frame(x)
  names(x) <- tolower(names(x))
  x <- x[c("id1", "id2", "value.left", "value.right", "id.new")]
  x <- x[with(x, order(id.new, id1, id2)), ]
  rownames(x) <- NULL
  x # return the cleaned data frame explicitly (a replacement call as the last line would return NULL)
}
compare_df <- function(...){
  # credit to https://stackoverflow.com/a/17244041/7941188
  ls_df <- c(as.list(environment()), list(...))
  ls_compare <- lapply(ls_df, preparetocompare)
  # inspired by https://stackoverflow.com/a/18814864/7941188
  all.identical <- function(l) mapply(all.equal, head(l, 1), tail(l, -1))
  all.identical(ls_compare)
}
compare_df(fun(DT, c("ID1", "ID2")),
           fun2(DF, c("ID1", "ID2")),
           fun3(DT, c("ID1", "ID2")),
           fun4(DF, c("ID1", "ID2")))
#> [1] TRUE TRUE TRUE
This post consists of 3 parts:
The original answer (non-equi self-join with two ID columns)
The 1st edit (non-equi self-join with a variable number of ID columns)
The 2nd edit (benchmark of 6 different approaches with varying problem sizes)
Original answer: Non-equi self-join with two ID columns
For the sake of completeness, here is a solution which uses a non-equi self-join of the molten data (reshaped to long format):
library(data.table)
mdt <- melt(DT, id.vars = c("ID1", "ID2"))
res <- mdt[mdt, on = .(ID1, ID2, variable < variable), nomatch = NULL,
           .(ID.new = paste(x.variable, i.variable, sep = " - "),
             ID1, ID2, value.left = x.value, value.right = i.value)]
all.equal(res, DT.output, ignore.row.order = TRUE)
[1] TRUE
This approach is similar to r2evans' answer but avoids the cartesian join. I have refrained from showing benchmark results, as benchmarking a sample data set of 6 rows and 5 columns has only limited relevance.
Edit 1: Non-equi self-join with a variable number of ID columns
The OP has requested that the number of ID columns may vary (as a matter of fact, the names of the ID columns are passed as a parameter to the OP's own function).
The non-equi self-join can be enhanced to handle an arbitrary number of ID columns:
library(data.table)
id_cols <- c("ID1", "ID2")
mdt <- melt(DT, id.vars = id_cols)
res <- mdt[mdt, on = c(id_cols, "variable < variable"), nomatch = NULL,
           c(.(ID.new = paste(x.variable, i.variable, sep = " - "),
               value.left = x.value, value.right = i.value), .SD),
           .SDcols = id_cols]
all.equal(res, DT.output, ignore.col.order = TRUE, ignore.row.order = TRUE)
[1] TRUE
Note that it is safe to use .SD here, as .SDcols picks only those columns which are already used in the join (specified by id_cols).
Edit 2: Benchmark with varying problem sizes
The benchmarks presented so far by r2evans and tjebo only use the original dataset with 2 id columns, 3 numeric columns, and 6 rows. Due to the small problem size, these benchmarks mostly compare overhead and are not representative of performance on larger problems.
There are 3 different parameters which describe the problem size:
The number of rows nr of the sample dataset DT,
the number of numeric columns nc from which pair-wise rows are created, and
the number of id columns ni.
The final result consists of nc * (nc - 1) / 2 * nr rows and ni + 3 columns.
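As a quick sanity check of that formula against the original example (nc = 3, nr = 6, ni = 2):
choose(3, 2) * 6  # nc * (nc - 1) / 2 * nr = 18 rows, matching DT.output
2 + 3             # ni + 3 = 5 columns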
By using the press() function from the bench package we can easily perform a set of benchmark runs with varying problem sizes.
6 different approaches are being included in the benchmark runs:
pernkf(): the function as shown in OP's question using combn(),
r2evans(): r2evans' melted self-join but modified to work with an arbitrary number of id columns,
tjebo(): tjebo's base R approach using combn() with a data.frame,
nej(): a non-equi self-join of molten data, similar to r2evans' approach but avoiding the cartesian join,
dtc(): a data.table version of tjebos's combn() approach,
mvl(): an implementation of هنروقتان's approach to call melt() with a made-up measure.vars list.
All approaches are implemented as functions which are called with 2 parameters, the dataset DT, or DF, resp., and a character vector with the names of arbitrary id columns.
pernkf <- function(data, ID.cols){
  data <- data.table(data)
  # which of the columns are ID columns
  ids <- which(colnames(data) %in% ID.cols)
  # obtain all pairwise combinations of numeric columns as a list
  numeric.combs <- combn(x = data[, !ids, with = FALSE], m = 2, simplify = FALSE)
  id.cols <- data[, ids, with = FALSE]
  # bind the ID columns to each pairwise combination of numeric columns
  bind.columns.each.numeric.comb <- lapply(X = numeric.combs, FUN = function(x) cbind(id.cols, x))
  # create generic names so that rbindlist() can be applied, and add a column
  # identifying which pair of columns each block came from
  generalize <- suppressWarnings(lapply(X = bind.columns.each.numeric.comb, FUN = function(x)
    setattr(x = x[, ID.new := paste(colnames(x[, !ids, with = FALSE]), collapse = " - ")],
            name = 'names', value = c(ID.cols, "value.left", "value.right", "ID.new"))))
  return(rbindlist(l = generalize))
}
r2evans <- \(DT, id_cols) {
  DTlong <- melt(DT, id.vars = id_cols, variable.factor = FALSE)
  DTlong[DTlong, on = c(id_cols), allow.cartesian = TRUE
         ][variable < i.variable,
         ][, .(ID.new = paste(variable, i.variable, sep = " - "), setnames(.SD, id_cols),
               value.left = value, value.right = i.value), .SDcols = id_cols
         ]
}
tjebo <- \(data, ID.cols) {
  ids <- which(colnames(data) %in% ID.cols)
  ## you can loop over the combinations directly
  new_dat <- combn(data[-ids], 2, function(x) {
    new_x <- setNames(x, paste("value", c("left", "right"), sep = "."))
    ## use paste with collapse for the ID.new
    new_x$ID.new <- paste(names(x), collapse = " - ")
    new_x
  }, simplify = FALSE)
  ## bind it with the old ID columns, outside the loop (a bit faster)
  cbind(do.call(rbind, new_dat), data[ids])
}
nej <- \(DT, id_cols) {
  mdt <- melt(DT, id.vars = id_cols)
  mdt[mdt, on = c(id_cols, "variable < variable"), nomatch = NULL,
      .(setnames(.SD, id_cols), ID.new = paste(x.variable, i.variable, sep = " - "),
        value.left = x.value, value.right = i.value),
      .SDcols = id_cols]
}
dtc <- \(DT, id_cols) {
  combn(setdiff(colnames(DT), id_cols), 2,
        \(x) DT[, ..x][, ID.new := paste(x, collapse = " - ")],
        simplify = FALSE) |>
    rbindlist(use.names = FALSE) |>
    setnames(1:2, c("value.left", "value.right")) |>
    cbind(DT[, ..id_cols])
}
mvl <- \(DT, id_cols) {
  num_cols <- setdiff(colnames(DT), id_cols)
  combos <- combn(num_cols, 2L, simplify = TRUE)
  id_new_levels <- apply(combos, 2, paste, collapse = " - ")
  melt(DT, measure.vars = list(combos[1L, ], combos[2L, ]),
       value.name = c("value.left", "value.right"), variable.name = "ID.new")[
    , ID.new := as.character(`levels<-`(ID.new, id_new_levels))]
}
The two approaches by Peace Wang and TarJae have been omitted as I was not able to turn these into scalable functions.
In the call to press() the number of rows nr is varied from 10 to 100,000 and the number of numeric columns nc from 3 to 10. Correspondingly, the resulting datasets vary from 30 to 4.5 million rows. All runs use 3 id columns in order to verify that ni is scalable (and not limited to 2).
The check function is set to ignore differences in row and column order, as these may vary between approaches.
library(bench)
bm <- press(
  nr = c(10L, 1000L, 100000L),
  nc = c(3L, 5L, 10L),
  {
    ni <- 3L
    DT <- data.table()
    id_cols <- sprintf("ID%01i", seq(ni))
    # append id cols
    for (id in id_cols) set(DT, , id, seq(nr))
    # append data cols
    for (j in seq(nc)) {
      col_name <- sprintf("NAME%04i", j)
      set(DT, , col_name, seq(nr) + (j / 1000))
    }
    DF <- as.data.frame(DT)
    mark(
      pernkf(DT, id_cols),
      r2evans(DT, id_cols),
      tjebo(DF, id_cols),
      nej(DT, id_cols),
      dtc(DT, id_cols),
      mvl(DT, id_cols),
      check = \(x, y) all.equal(x, setDT(y), ignore.row.order = TRUE, ignore.col.order = TRUE),
      min_iterations = 3L
    )
  }
)
The benchmark timings are visualised by
ggplot2::autoplot(bm)
(note the logarithmic time scale).
Almost always, mvl() is the fastest approach. Only for the smallest problem sizes, with 3 numeric columns and up to 1000 rows, is tjebo() slightly faster. For large problems with 100,000 rows, dtc() and pernkf() come second and third, respectively.
The next chart shows how performance varies with the number of numeric columns nc:
library(ggplot2)
ggplot(bm) +
  aes(nc, median, colour = attr(expression, "description")) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  labs(colour = "expression") +
  facet_wrap(~nr, scales = "free_y") +
  ggtitle("Median run time")
(note the log-log scales and the independent time scales of the facets)
tjebo()'s run times increase more steeply with nc than those of any other approach. For some use cases, mvl() is about an order of magnitude faster than any other approach.
An often neglected aspect is memory consumption. The next chart shows how memory allocation varies with problem size:
ggplot(bm) +
  aes(nc, mem_alloc, colour = attr(expression, "description")) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  labs(colour = "expression") +
  facet_wrap(~nr, scales = "free_y") +
  ggtitle("Memory allocation")
(note the log-log scales and the independent scales on the y-axes)
There is a remarkably large discrepancy in memory allocation between the best and worst approaches for each use case, roughly a factor of 7 to 8. Again, tjebo() shows the steepest increase in memory allocation with nc. For large problem sizes, mvl() allocates less memory than any other approach, followed by dtc() and pernkf().
UPDATE II (Removed wrong solution)
Now, after really hard work and good support from the community (thanks to @akrun and @tjebo), I think I have a correct and scalable tidyverse solution (hurray!):
library(tidyverse)
DT1 <- DT %>%
  pivot_longer(-c(ID1, ID2)) %>%
  mutate(name1 = lead(name, default = last(name)),
         value1 = lead(value, default = last(value))) %>%
  arrange(name, name1) %>%
  group_by(name) %>%
  mutate(n = n()) %>%
  mutate(name_nr = parse_number(name)) %>%
  ungroup()
DT1 %>%
  mutate(name1 = lead(name, unique(n) * (max(name_nr) - min(name_nr)))) %>%
  mutate(value1 = lead(value, unique(n) * (max(name_nr) - min(name_nr)))) %>%
  slice(seq_len(first(n))) %>%
  bind_rows(DT1 %>% slice(1:(n() - unique(n))), .) %>%
  mutate(ID.new = paste(name, name1, sep = " - "), .before = 1) %>%
  select(ID.new, ID1, ID2, value.left = value, value.right = value1) %>%
  arrange(ID.new)
ID.new ID1 ID2 value.left value.right
<chr> <chr> <dbl> <dbl> <dbl>
1 NAME1 - NAME2 A 1 10 7
2 NAME1 - NAME2 A 2 11 9
3 NAME1 - NAME2 A 3 9 8
4 NAME1 - NAME2 B 1 22 20
5 NAME1 - NAME2 B 2 25 22
6 NAME1 - NAME2 B 3 22 21
7 NAME1 - NAME3 A 1 10 10
8 NAME1 - NAME3 A 2 11 12
9 NAME1 - NAME3 A 3 9 11
10 NAME1 - NAME3 B 1 22 15
11 NAME1 - NAME3 B 2 25 19
12 NAME1 - NAME3 B 3 22 30
13 NAME2 - NAME3 A 1 7 10
14 NAME2 - NAME3 A 2 9 12
15 NAME2 - NAME3 A 3 8 11
16 NAME2 - NAME3 B 1 20 15
17 NAME2 - NAME3 B 2 22 19
18 NAME2 - NAME3 B 3 21 30
Attention:
Here is an inspiring idea which does not fully satisfy the OP's requirements (e.g., the ID.new pairs and their order), but I think it is worth recording here.
First turn DT into long format with melt().
Then shift value by -nrow(DT) (cyclically) to pair each value with the value of the next column, giving the pairs NAME1 - NAME2, NAME2 - NAME3, NAME3 - NAME1.
ds <- melt(DT,
           measure.vars = patterns("^NAME"),
           variable.name = "ID.new",
           value.name = "value.left")
group_len <- nrow(DT)
ds[, ID.new := paste(ID.new, shift(ID.new, n = -group_len, type = "cyclic"), sep = " - ")]
ds[, value.right := shift(value.left, n = -group_len, type = "cyclic")]
Output:
ID1 ID2 ID.new value.left value.right
<char> <num> <char> <num> <num>
1: A 1 NAME1 - NAME2 10 7
2: A 2 NAME1 - NAME2 11 9
3: A 3 NAME1 - NAME2 9 8
4: B 1 NAME1 - NAME2 22 20
5: B 2 NAME1 - NAME2 25 22
6: B 3 NAME1 - NAME2 22 21
7: A 1 NAME2 - NAME3 7 10
8: A 2 NAME2 - NAME3 9 12
9: A 3 NAME2 - NAME3 8 11
10: B 1 NAME2 - NAME3 20 15
11: B 2 NAME2 - NAME3 22 19
12: B 3 NAME2 - NAME3 21 30
13: A 1 NAME3 - NAME1 10 10
14: A 2 NAME3 - NAME1 12 11
15: A 3 NAME3 - NAME1 11 9
16: B 1 NAME3 - NAME1 15 22
17: B 2 NAME3 - NAME1 19 25
18: B 3 NAME3 - NAME1 30 22
Since the data are well structured, one may also use the following code. The idea is scalable, but for simplicity only the hard-coded variant is shown here; a programmatic spelling is sketched below.
melt(DT, measure.vars=list(c(3,3,4), c(4,5,5)))
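A sketch of the scalable spelling, building the two index vectors with combn() (this mirrors the mvl() function from the benchmark answer above):
num_cols <- grep("^NAME", names(DT), value = TRUE)
combos <- combn(num_cols, 2)
melt(DT, measure.vars = list(combos[1, ], combos[2, ]),
     value.name = c("value.left", "value.right"), variable.name = "ID.new")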

Replace values in one column by taking values from another column

After asking one question this morning, I would now like to ask about another way to do the replacement, since I am waiting for my teacher to confirm the species names.
I have a dataframe like this (the real df results from removing duplicated rows):
df <- data.frame(name1 = c("a" , "b", "c", "a"),
name2 = c("x", NA, NA, NA),
name3 = c(NA, "b1", "c1", NA),
name4 = c("x", "b1", "c1", "a"))
name1 name2 name3 name4
1 a x <NA> x
2 b <NA> b1 b1
3 c <NA> c1 c1
4 a <NA> <NA> a
Can we replace a with x by checking whether the value in the name4 column matches the name1 column?
I do not want to assign x directly here, since my data is supposed to have many cases like this. Any suggestions, please? (Base R is also fine for me, since I would love to learn more.)
Desired output
name1 name2 name3 name4
1 a x <NA> x
2 b <NA> b1 b1
3 c <NA> c1 c1
4 a <NA> <NA> x
My explanation of the table and my expectation:
I have 3 columns, name1, name2, and name3 (after removing duplicated rows). The name4 column is the final column that should contain the value I want from the 3 previous columns. The value in the name2 column is my first priority to use, then the value in name3.
In my fourth row, an NA appears in the name2 column, so I took the "a" from the name1 column. I am wondering whether I can replace that a with x without assigning x directly, i.e. if the value in name4 equals the value in name1, replace the a in name4 with the x that appears in name2/name4 of the other row with the same name1.
Your criteria for defining name4, as I understand them, are:
Use name2 from the same row if available
Use name3 from the same row if available
Leave it missing (for now)
Fill missing name4 values with name4 values from previous rows that share the same name1 value.
If you want a tidyverse-based solution:
library(dplyr)
library(tidyr)
df <- data.frame(name1 = c("a" , "b", "c", "a"),
name2 = c("x", NA, NA, NA),
name3 = c(NA, "b1", "c1", NA))
result <- df %>%
  mutate(name4 = case_when(
    #!is.na(name4) ~ name4, # when name4 is not missing, use it? If you like...
    !is.na(name2) ~ name2,  # when name2 is not missing, use it
    !is.na(name3) ~ name3,  # when name3 is not missing, use it
    TRUE ~ NA_character_    # otherwise leave NA for now
  )) %>%
  group_by(name1) %>%
  fill(name4, .direction = "down") %>% # fill each group from the previous non-missing row
  ungroup()
Returns:
# A tibble: 4 × 4
name1 name2 name3 name4
<chr> <chr> <chr> <chr>
1 a x NA x
2 b NA b1 b1
3 c NA c1 c1
4 a NA NA x
Note that fill() can fill in several directions; you could use "downup" to fill first from top to bottom and then from bottom to top.
You can group by name1 and, where name1 equals name4, replace the name4 value with the first non-NA value available in the group.
library(dplyr)
df %>%
  group_by(name1) %>%
  mutate(name4 = ifelse(name1 == name4, na.omit(unlist(cur_data()))[1], name4)) %>%
  ungroup()
# name1 name2 name3 name4
# <chr> <chr> <chr> <chr>
#1 a x NA x
#2 b NA b1 b1
#3 c NA c1 c1
#4 a NA NA x
You can do it like this:
df[which(df$name1==df$name4), "name4"] <- "x"
Basically this subsets the data frame to the rows where name1 == name4 and the name4 column, then overwrites those values with "x".
Base R ifelse solution:
df$name4 <- ifelse(df$name1 == df$name4, "x", df$name4)
Based on your update, using dplyr's first:
library(dplyr)
df$name4 <- ifelse(df$name1 == df$name4, first(df$name4), df$name4)
This does the following:
Checks whether name1 is equal to name4.
If so, it replaces the value of name4 with the first value occurring in name4.
Result:
name1 name2 name3 name4
1 a x <NA> x
2 b <NA> b1 b1
3 c <NA> c1 c1
4 a <NA> <NA> x
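If you want to avoid hard-coding "x" in base R as well, you can build name4 from name2/name3 and then fill the remaining NAs from the first non-missing value within each name1 group (a sketch, assuming such a value exists somewhere in the group):
df$name4 <- ifelse(!is.na(df$name2), df$name2, df$name3)
df$name4 <- ave(df$name4, df$name1, FUN = function(x) {
  x[is.na(x)] <- x[!is.na(x)][1] # first non-NA value in the group, NA if none
  x
})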
