Match rows with other row with smallest difference on a column - r

I want to perform matching between two groups in a data frame, where all rows belonging to one group (binary) are matched with observations from the other group (with replacement) if their difference on another column is smaller than a pre-set threshold. Let's use the toy-dataset below:
set.seed(123)
df <- data.frame(id = c(1:10),
group = rbinom(10,1, 0.3),
value = round(runif(10),2))
threshold <- round(sd(df$value),2)
Which looks like this
> df
id group value
1 1 0 0.96
2 2 1 0.45
3 3 0 0.68
4 4 1 0.57
5 5 1 0.10
6 6 0 0.90
7 7 0 0.25
8 8 1 0.04
9 9 0 0.33
10 10 0 0.95
> threshold
[1] 0.35
In this case, I want to match rows with group==1 with rows with group==2 where the difference between value is smaller than threshold(0.35). This should lead to a data frame looking like this (apologizes for potential error, did it manually).
id matched_id
1 2 3
2 2 7
3 2 9
4 4 3
5 4 6
6 4 7
7 4 9
8 5 7
9 5 9
10 8 7
11 8 9
Thank you!

You can use df |> left_join(df, by = character()) which is the tidyverse way of performing a cartesian product. Then filter according to threshold.
library(dplyr)
df |>
left_join(df, by = character()) |>
filter(group.x != group.y,
id.x < id.y,
abs(value.x - value.y) < threshold)
#>+ id.x group.x value.x id.y group.y value.y
#>1 2 1 0.45 3 0 0.68
#>2 2 1 0.45 7 0 0.25
#>3 2 1 0.45 9 0 0.33
#>4 3 0 0.68 4 1 0.57
#>5 4 1 0.57 6 0 0.90
#>6 4 1 0.57 7 0 0.25
#>7 4 1 0.57 9 0 0.33
#>8 5 1 0.10 7 0 0.25
#>9 5 1 0.10 9 0 0.33
#>10 7 0 0.25 8 1 0.04
#>11 8 1 0.04 9 0 0.33

UPDATED ANSWER: Was going slow on a larger dataset, so I tried to make the code a bit more efficient.
Came up with a solution that seems to do what I want. Not sure how efficient this code is on larger data but seems to work.
library(tidyverse)
library(data.table)
# All values
dist_mat <- df$value
# Adding identifier
names(dist_mat) <- df$id
# Dropping combinations that are not of interest
dist_mat_col <-dist_mat[df$group == 0]
dist_mat_row <- dist_mat[df$group == 1]
# Difference between each value
dist_mat <- abs(outer(dist_mat_row, dist_mat_col, "-"))
# Identifying matches that fulfills the criteria
dist_mat <- dist_mat <= threshold
# From matrix to a long dataframe
dist_mat <- melt(dist_mat)
# Tidying up the dataframe and dropping unneccecary columns and rows.
dist_mat <- dist_mat %>%
rename(id = Var1,
matched_id = Var2,
cond = value) %>%
filter(cond == TRUE) %>%
left_join(df, by = "id") %>%
select(id, matched_id)
This leads to the following dataframe:
> arrange(dist_mat, id)
id matched_id
1 2 3
2 2 7
3 2 9
4 4 3
5 4 6
6 4 7
7 4 9
8 5 7
9 5 9
10 8 7
11 8 9

Related

Calculating proportions based on conditions

please see sample data below:
data <- data.frame(q1=c(3,4,5,2,1,2,4),
q2=c(3,4,4,5,4,3,2),
q3=c(2,3,2,3,1,2,3),
q4=c(3,4,4,4,4,5,5))
I would like to create a another column which shows the percent of 4/5 responses. The output I am hoping to get looks something like this. Any help is appreciated, thank you!
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50
Using rowMeans
library(dplyr)
data %>%
mutate(percent = rowMeans(across(everything(), ~ .x %in% 4:5)))
-output
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50
One possible solution:
data$percent = rowMeans(data>3)
Or
data$percent = apply(data, 1, \(x) mean(x %in% 4:5))
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50
library(dplyr)
data <- data.frame(q1=c(3,4,5,2,1,2,4),
q2=c(3,4,4,5,4,3,2),
q3=c(2,3,2,3,1,2,3),
q4=c(3,4,4,4,4,5,5))
percent_4_5 <- function(x) {
(sum(x == 4) + sum(x == 5)) / length(x)
}
data %>% rowwise() %>% mutate(percent = percent_4_5(c_across(starts_with("q")))) %>% ungroup()
Another possible solution without using dplyr
library(magrittr)
data$percent <- (data > 3) %>% as.data.frame() %>% apply(., 1, mean)
True is considered as 1 and False is considered as 0 when counting.
Output:
q1 q2 q3 q4 percent
1 3 3 2 3 0.00
2 4 4 3 4 0.75
3 5 4 2 4 0.75
4 2 5 3 4 0.50
5 1 4 1 4 0.50
6 2 3 2 5 0.25
7 4 2 3 5 0.50

how to pipe all these commands dplyr aggregate groupby mixedorder in R

assume this is my dataset
library(gtools)
library(dplyr)
df <- data.frame(grp=c(0.5,0.6,1,2,2,2,4.5,10,22,"kids","Parents","Teachers"),
f1= c(1,0,3,2,4,0,3,0,1,6,8,4),
f2= c(1,0,3,1,4,0,1,0,1,5,8,4),
f3= c(1,0,3,2,4,6,1,2,1,6,8,4))
df
grp f1 f2 f3
1 0.5 1 1 1
2 0.6 0 0 0
3 1 3 3 3
4 2 2 1 2
5 2 4 4 4
6 2 0 0 6
7 4.5 3 1 1
8 10 0 0 2
9 22 1 1 1
10 kids 6 5 6
11 Parents 8 8 8
12 Teachers 4 4 4
and this is my desired output
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
This is what I did + commenting my questions:
############ how NOT to splot set into two subsets of data
df_1 <- df %>%
filter(grepl('kids|Parents|Teachers', grp))
df_1
grp f1 f2 f3
1 kids 6 5 6
2 Parents 8 8 8
3 Teachers 4 4 4
df_2 <- df %>%
filter(!grepl('kids|Parents|Teachers', grp)) %>%
mutate(across(.cols = grp, .fns = as.numeric)) %>%
mutate(grp= cut(grp, breaks=c(-999,2,10,21,999) , labels=c("<=1", "2-9","10-19",">20"), right=F))
df_2
grp f1 f2 f3
1 <=1 1 1 1
2 <=1 0 0 0
3 <=1 3 3 3
4 2-9 2 1 2
5 2-9 4 4 4
6 2-9 0 0 6
7 2-9 3 1 1
8 10-19 0 0 2
9 >20 1 1 1
### how to pipe both aggregate and mixedorder/sort instead of separate lined of codes
df_2 <- aggregate(.~grp, data = df_2, FUN=sum)
df2[mixedorder(df2$grp, decreasing = T),]
df_2
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
### how to make sure 10-19 does not come before 2-9 in case of actual dataset
grp a b d
1 <=1 53 48 53
2 10-15 65 63 65
3 2-9 30 40 30
df_final <- rbind(df_2, df_1)
df_final
grp f1 f2 f3
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4
Is there any neat way to get from original df to df_final all in dplyr by just piping commands?
how NOT to splot set into two subsets of data?
how to pipe both aggregate and mixedorder/sort instead of separate lined of codes?
how to make sure 10-19 does not come before 2-9 in case of actual dataset?
Here is one option - create a second column ('grp2') with the cut values on the numeric elements only, then coalesce with the original column, while appending the levels, and then do a group_by summarise with across. In this way, we don't have to use mixedsort, as the cut already had the grouping sorted
library(dplyr)
library(stringr)
df %>%
mutate(grp2 = case_when(str_detect(grp, '^[0-9.]+$')
~ cut(as.numeric(grp), breaks=c(-999,2,10,21,999) ,
labels=c("<=1", "2-9","10-19",">20"), right=FALSE))) %>%
mutate(grp =factor(coalesce(grp2, grp),
levels = c(levels(grp2), unique(grp[is.na(grp2)]))), .keep = "unused") %>%
group_by(grp) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 7 × 4
grp f1 f2 f3
<fct> <dbl> <dbl> <dbl>
1 <=1 4 4 4
2 2-9 9 6 13
3 10-19 0 0 2
4 >20 1 1 1
5 kids 6 5 6
6 Parents 8 8 8
7 Teachers 4 4 4

Sum groupwise but using only individual's first entry

Consider the following sample data. The data has 2 individuals per group and each individual has 2 entries.
rm(list=ls()); set.seed(1234)
G=4 ; # Suppose you have 4 groups
nTot<-8 # We have 2 individuals per group so in total we have 8 individuals
group<-rep(1:G, rep(4,G) )#Group identifier
individualID<-rep(1:nTot, rep(2,nTot) )#We have 2 individuals per group each with 2 entries
n<-2*nTot # We have 16 entries in total
X<-rbinom(n, 1, 0.5)
Y<-runif(n, 0, 1)
Z<-runif(n, 0, 4)
df1<-round(data.frame(group,individualID,X,Y,Z),3)
> df1
group individualID X Y Z
1 1 1 0 0.286 1.219
2 1 1 1 0.267 2.029
3 1 2 1 0.187 0.724
4 1 2 1 0.232 3.039
5 2 3 1 0.317 0.805
6 2 3 1 0.303 1.035
7 2 4 0 0.159 3.969
8 2 4 0 0.040 3.229
9 3 5 1 0.219 2.213
10 3 5 1 0.811 2.586
11 3 6 1 0.526 1.247
12 3 6 1 0.915 2.487
13 4 7 0 0.831 1.319
14 4 7 1 0.046 2.008
15 4 8 0 0.456 2.708
16 4 8 1 0.265 1.940
Func<-X*Y+Z
Func
The code below computes sum of Func per group using split() function.
Func<-X*Y+Z
GroupSum<-as.numeric( sapply( split(Func,group),sum) ) # Group sum of X*Y+Z
I would like a code that will split the data and group sum Func only for the first entry per individual i.e I should end up with a vector of 4 values as we have 4 groups.
We may use a group by approach i.e. grouped by 'group', slice the first row, ungroup, and then summarise to get the sum of X multiplied by 'Y' and added to 'Z'
library(dplyr)
df1 %>%
group_by(group) %>%
slice_head(n = 1) %>%
summarise(out = sum(X * Y + Z, na.rm = TRUE))
-output
# A tibble: 4 × 2
group out
<dbl> <dbl>
1 1 2.19
2 2 1.31
3 3 1.50
4 4 2.52
Or can use duplicated in base R
aggregate(out ~ group, transform(subset(df1, !duplicated(group)),
out = X * Y + Z), FUN = sum)
group out
1 1 2.194
2 2 1.311
3 3 1.501
4 4 2.522

Retaining the last value of a column

I have the following data frame,
Input
For all observations where Month > tenor, the last value of the rate column should be retained for each account for the remaining months. Eg:- Customer 1 has tenor = 5, so for all months greater than 5, the last rate value is retained.
I am using the following code
df$rate <- ifelse(df$Month > df$tenor,tail(df$rate, n=1),df$rate)
But here, the last value is NA so it does not work
Expected output is
Output
this will work, but please have a reproducible example. Others want to help you, not do your homework.
df <- data.frame(
customer = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
Month = c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
tenor = c(5,5,5,5,5,5,5,5,5,5,3,3,3,3,3,3,3,3,3,3),
rate = c(0.2,0.3,0.4,0.5,0.6,NA,NA,NA,NA,NA,0.1,0.2,0.3,NA,NA,NA,NA,NA,NA,NA)
)
fn <- function(cus, mon, ten, rat){
if (mon > ten & is.na(rat)){
return(dplyr::filter(df, customer == cus, Month == ten, tenor == ten)$rate)
}
return(rat)
}
df2 <- mutate(df,
newrate = Vectorize(fn)(customer, Month, tenor, rate)
)
One option is:
library(dplyr)
library(tidyr)
df %>%
group_by(cus_no) %>%
fill(rate, .direction = "down") %>%
ungroup()
# A tibble: 20 x 4
customer Month tenor rate
<dbl> <dbl> <dbl> <dbl>
1 1 1 5 0.2
2 1 2 5 0.3
3 1 3 5 0.4
4 1 4 5 0.5
5 1 5 5 0.6
6 1 6 5 0.6
7 1 7 5 0.6
8 1 8 5 0.6
9 1 9 5 0.6
10 1 10 5 0.6
11 2 1 3 0.1
12 2 2 3 0.2
13 2 3 3 0.3
14 2 4 3 0.3
15 2 5 3 0.3
16 2 6 3 0.3
17 2 7 3 0.3
18 2 8 3 0.3
19 2 9 3 0.3
20 2 10 3 0.3
I can't replicate your data frame so this is a guess right now.
I think dplyr should be the solution:-
library(dplyr)
df%>%
group_by(Month)%>%
replace_na(last(rate))
should work

How to add calculation results to another dataset based on a filter

I have two datasets and I need to add a value that is based on specific levels of the two variables' values.I need to add to dat dataset the value that is calculated on the dat1 dataset so the output should be like in dat2.Any idea how to this ( the values in dat1 are text)?
dat3 <- read.table(text = " lamps vases
7 9
1 6
3 5
7 8
5 4
1 3
0 7
6 6
8 9 ", header = TRUE)
dat2 <- read.table(text = " lamps vases est
3,5 4,5 0.6
8 9 0.4
", header = TRUE)
dat2 <- read.table(text = " lamps vases est
7 9 0
1 6 0
3 5 0.6
7 8 0
5 4 0.6
1 3 0
0 7 0
6 6 0
8 9 0.4", header = TRUE)
We can try with tidyverse. Convert the second dataset 'dat2' (with 2 rows in the OP's post) into 'long' format by splitting the 'lamps', 'vases' column (separate_rows), left_join with the first dataset ('df1' - labelled as 'df3' in OP's post), and with replace_na change the NA in 'est' to 0
library(tidyverse)
separate_rows(dat2, lamps, convert = TRUE) %>%
separate_rows(vases, convert = TRUE) %>%
left_join(dat1, ., by = c("lamps", "vases")) %>%
replace_na(list(est = 0))
# lamps vases est
#1 7 9 0.0
#2 1 6 0.0
#3 3 5 0.6
#4 7 8 0.0
#5 5 4 0.6
#6 1 3 0.0
#7 0 7 0.0
#8 6 6 0.0
#9 8 9 0.4

Resources