break long character strings by "-" and identify the unique components by group - r

Suppose you have a transaction data set with products purchased by customers. You know the total skus (the unique products they buy in each order), but you want to know the unique skus over the lifetime of the user. Say I buy "apple" for the first time ever today; then apple is a NEW sku. But when I later buy "apple" and "banana" again, apple is not a new sku while banana is (if it is a first-time purchase).
data
user_id<-c(1,1,1,2,3,4,4)
order_date<-c("2/9/2016",
"11/19/2015",
"12/30/2016",
"9/27/2016",
"12/10/2016",
"11/5/2016",
"1/1/2017")
sku<-c("262-264-280","280-123","510","6251-16990","9227-14572","9227-14572","280")
dt<-data.frame(user_id,order_date,sku)
Output
Update: I had originally typed "user_id" as "order_id".

A data.table possibility: split the strings on "-", then track the new unique values in each row via a set union with Reduce(..., accumulate=TRUE). The count of new values is then the difference between successive rows:
library(data.table)
setDT(dt)
dt[, sku := as.character(sku)]
dt[,
total := lengths(Reduce(union, strsplit(sku, "\\-"), accumulate=TRUE)),
by=user_id
]
dt[, new := c(total[1], diff(total)), by=user_id]
dt
# user_id order_date sku total new
#1: 1 2/9/2016 262-264-280 3 3
#2: 1 11/19/2015 280-123 4 1
#3: 1 12/30/2016 510 5 1
#4: 2 9/27/2016 6251-16990 2 2
#5: 3 12/10/2016 9227-14572 2 2
#6: 4 11/5/2016 9227-14572 2 2
#7: 4 1/1/2017 280 3 1
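To make the Reduce(..., accumulate=TRUE) step concrete, here is a small illustration (my sketch, using user 1's skus from the example data) of what the accumulated union looks like at each order:
skus <- c("262-264-280", "280-123", "510")
Reduce(union, strsplit(skus, "-"), accumulate = TRUE)
# [[1]] "262" "264" "280"
# [[2]] "262" "264" "280" "123"
# [[3]] "262" "264" "280" "123" "510"
# lengths() of these are 3, 4, 5, and c(total[1], diff(total)) gives the new-sku counts 3, 1, 1.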

A possible tidyverse way:
library(dplyr)
library(tidyr)
dt %>%
separate_rows(sku, sep = "-") %>%
mutate(order_date = as.Date(as.character(order_date), "%m/%d/%Y")) %>%
group_by(user_id, sku) %>%
arrange(user_id, order_date) %>%
mutate(idx = row_number() * (row_number() == 1)) %>%
group_by(user_id, order_date) %>%
summarise(sku = paste(sku, collapse = "-"),
`number of new sku purchase` = sum(idx)) %>%
group_by(user_id) %>%
mutate(`total number of sku` = cumsum(`number of new sku purchase`))

Here's a base solution that is similar to @thelatemail's:
dt$sku <- as.character(dt$sku)
dt$n_skus <- ave(dt$sku
, dt$user_id
, FUN = function (sku_f) {
sapply(
Reduce(union, strsplit(sku_f, '-', fixed = T), accumulate = T)
, length)
})
dt$n_skus <- as.integer(dt$n_skus)
dt$n_new <- ave(dt$n_skus
, dt$user_id
, FUN = function(n) c(n[1], diff(n)))
dt
user_id order_date sku n_skus n_new
1 1 2/9/2016 262-264-280 3 3
2 1 11/19/2015 280-123 4 1
3 1 12/30/2016 510 5 1
4 2 9/27/2016 6251-16990 2 2
5 3 12/10/2016 9227-14572 2 2
6 4 11/5/2016 9227-14572 2 2
7 4 1/1/2017 280 3 1
And here's a crazy, inefficient data.table solution that doesn't include the original sku field:
library(data.table)
setDT(dt)
dt[, strsplit(as.character(sku), '-'), by = .(user_id, order_date)
][, .SD[!duplicated(V1), .(n_new = .N), by = order_date], by = .(user_id)
][, n_total := cumsum(n_new), by = .(user_id)][]
Performance
@thelatemail's should scale up the best.
Unit: microseconds
expr min lq mean median uq max neval
argonaut_dplyr 10020.601 10247.851 10720.0709 10474.451 10770.751 16021.3 100
thelatemail_dt 1954.501 2072.101 2385.8019 2303.001 2436.202 5807.0 100
# base is always fastest with 7 rows; it would be outstripped by @thelatemail's very quickly
cole_base 651.501 751.601 834.0299 772.201 794.752 6309.2 100
cole_bad_dt 7006.400 7355.200 7757.9991 7668.401 7958.651 12708.0 100
purrr_A_sul 14575.501 14862.552 15566.4809 15033.201 15401.601 33263.3 100
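For reference, here is a minimal sketch of how timings like the table above can be produced (my reconstruction, not the original benchmark code), reusing two of the expression labels from the results:
library(data.table)
library(microbenchmark)
microbenchmark(
thelatemail_dt = {
d <- copy(dt)  # work on a copy so the benchmark does not modify dt
d[, total := lengths(Reduce(union, strsplit(sku, "\\-"), accumulate = TRUE)), by = user_id]
d[, new := c(total[1], diff(total)), by = user_id]
},
cole_base = {
d <- as.data.frame(dt)
d$n_skus <- as.integer(ave(d$sku, d$user_id, FUN = function(s)
sapply(Reduce(union, strsplit(s, "-", fixed = TRUE), accumulate = TRUE), length)))
d$n_new <- ave(d$n_skus, d$user_id, FUN = function(n) c(n[1], diff(n)))
},
times = 100
)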

Here is one option: split the current sku with unlist(strsplit(dt$sku,'-')), then compare it with the previously seen skus:
library(dplyr)
library(purrr)
library(tidyr)
dt %>%
nest(-user_id) %>%
mutate(NNSP = map(data, ~map_dbl(1:length(.x$sku), function(y) {
#browser()
ynow <- unlist(strsplit(as.character(.x$sku)[y],'-'))
yprev <- unique(unlist(strsplit(as.character(.x$sku[1:y-1]),'-')))
length(ynow[!ynow %in% yprev])
})),
TotNNSP = map(NNSP, ~cumsum(.x))) %>%
unnest()
# A tibble: 7 x 5
user_id NNSP TotNNSP order_date sku
<dbl> <dbl> <dbl> <fct> <fct>
1 1 3 3 2/9/2016 262-264-280
2 1 1 4 11/19/2015 280-123
3 1 1 5 12/30/2016 510
4 2 2 2 9/27/2016 6251-16990
5 3 2 2 12/10/2016 9227-14572
6 4 2 2 11/5/2016 9227-14572
7 4 1 3 1/1/2017 280
Using @thelatemail's Reduce and lengths combination, we can do:
library(dplyr)
setdiff2 <- function(x, y) y[!y %in% x]
dt %>%
group_by(user_id) %>%
mutate(sku = as.character(sku),
NNSP = lengths(Reduce(setdiff2, strsplit(sku,'-'), accumulate = TRUE)),
Tot_NNSP = cumsum(NNSP))

Related

Sum of elements in a forward looking rolling window by month

I have the following data.frame with columns Id, Month, have and want:
library(dplyr)
dt <- read.table(header = TRUE, text = '
Id Month have want
1 01-Jan-2018 1.000000000000000 1.234567901220000
1 01-Feb-2018 0.200000000000000 0.234567901233000
1 01-Mar-2018 0.030000000000000 0.034567901234400
1 01-Apr-2018 0.004000000000000 0.004567901234550
1 01-May-2018 0.000500000000000 0.000567901234566
1 01-Jun-2018 0.000060000000000 0.000067901234566
1 01-Jul-2018 0.000007000000000 0.000007901234566
1 01-Aug-2018 0.000000800000000 0.000000901234566
1 01-Sep-2018 0.000000090000000 0.000000101234566
1 01-Oct-2018 0.000000010000000 0.000000011234566
1 01-Nov-2018 0.000000001100000 0.000000001234566
1 01-Dec-2018 0.000000000120000 0.000000000134566
1 01-Jan-2019 0.000000000013000 0.000000000014566
1 01-Feb-2019 0.000000000001400 0.000000000001566
1 01-Mar-2019 0.000000000000150 0.000000000000166
1 01-Apr-2019 0.000000000000016 0.000000000000016
2 01-Jan-2018 1337.00 1338.00
2 01-Feb-2018 1.00 1.00
3 01-Jan-2018 5.000000000000000000 5.000000000000000
') %>% mutate(Month=as.Date(Month, format='%d-%b-%Y'))
I would like to programmatically calculate sum of elements in a 12 month forward looking rolling window by Month and grouped by Id as demonstrated in column want. If the rolling observation window is less than 12 months, the missing elements should be ignored.
For bonus points, the solution would ideally also allow for missing months, such as in:
dt <- read.table(header = TRUE, text = '
Id Month have want
1 01-Jan-18 1.000000000000000 1.200000000000000
1 01-Dec-18 0.200000000000000 0.230000000000000
1 01-Jan-19 0.030000000000000 0.030000000000000
') %>% mutate(Month=as.Date(Month, format='%d-%b-%y'))
I have tried different solutions, e.g. rollapplyr() of the zoo package and some functions in the runner package, but it doesn't seem to give me what I need.
You can use zoo's rollapply with partial = TRUE:
library(dplyr)
dt %>%
group_by(Id) %>%
tidyr::complete(Month = seq(min(Month), max(Month), "month")) %>%
mutate(result = zoo::rollapply(have, 12, sum, na.rm = TRUE,
align = 'left', partial = TRUE)) -> result
result
If you have data for every month for each Id like in the example shared you can remove the complete step.
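To see what align = 'left' and partial = TRUE do here, a small standalone sketch on a plain numeric vector (illustration only, not from the original answer):
library(zoo)
x <- 1:5
# align = "left": each window starts at the current element and looks forward;
# partial = TRUE: shorter windows near the end are summed instead of dropped.
rollapply(x, width = 3, FUN = sum, align = "left", partial = TRUE)
# [1]  6  9 12  9  5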
I suggest using the runner package in this case. The runner function lets you calculate a rolling window with full control over time: k is the window length, lag is the lag of the window, and idx specifies the index column that the window depends on.
library(runner)
dt %>%
group_by(Id) %>%
mutate(want2 = runner(
.,
f = function(x) sum(x$have),
k = 12, # or "12 months"
lag = -11, # or "-11 months"
idx = Month)
)
# # A tibble: 19 x 5
# # Groups: Id [3]
# Id Month have want want2
# <int> <date> <dbl> <dbl> <dbl>
# 1 1 2018-01-01 1.00e+ 0 1.23e+ 0 1.00e+ 0
# 2 1 2018-02-01 2.00e- 1 2.35e- 1 2.00e- 1
# 3 1 2018-03-01 3.00e- 2 3.46e- 2 3.00e- 2
# 4 1 2018-04-01 4.00e- 3 4.57e- 3 4.00e- 3
# 5 1 2018-05-01 5.00e- 4 5.68e- 4 5.00e- 4
# 6 1 2018-06-01 6.00e- 5 6.79e- 5 6.00e- 5

manipulate data with multiple values in the cell in r

I have a data set that looks like
universityies <- c("UNI.1;UNI.1;UNI.2;UNI.3","UNI.5", "UNI.3;UNI.4" )
papers <- c(1,1,1)
cited <- c(10,5,20)
df <- data.frame(universityies, papers, cited )
df
and I want to get something like
#total papers total cited
#UNI.1 1 10
#UNI.2 1 10
#UNI.3 2 30
#UNI.4 1 20
#UNI.5 1 5
many thanks in advance,
We can split the data on ";", keep unique rows, group_by universityies, count distinct papers and sum the total number of citations.
library(dplyr)
df %>%
mutate(row = row_number()) %>%
tidyr::separate_rows(universityies, sep = ";") %>%
distinct() %>%
group_by(universityies) %>%
summarise(total_papers = n_distinct(row),
total_cited = sum(cited))
# universityies total_papers total_cited
# <chr> <int> <dbl>
#1 UNI.1 1 10
#2 UNI.2 1 10
#3 UNI.3 2 30
#4 UNI.4 1 20
#5 UNI.5 1 5
We can use cSplit from splitstackshape and data.table methods
library(data.table)
library(splitstackshape)
unique(cSplit(setDT(df, keep.rownames = TRUE), "universityies", ";",
"long"))[, .(total_papers = uniqueN(rn), total_cited = sum(cited)),.(universityies)]
# universityies total_papers total_cited
#1: UNI.1 1 10
#2: UNI.2 1 10
#3: UNI.3 2 30
#4: UNI.5 1 5
#5: UNI.4 1 20
You may use strsplit as a first step, then aggregate:
tmp <- do.call(rbind, apply(df, 1, function(x)
setNames(data.frame(strsplit(x[1], ";"), as.numeric(x[2]), as.numeric(x[3]),
row.names=NULL, stringsAsFactors=FALSE), names(df))))
res <- aggregate(cbind(total.papers=papers, total.cited=cited) ~ universityies,
unique(tmp), sum)
res[order(res$universityies), ]
# universityies total.papers total.cited
# 1 UNI.1 1 10
# 2 UNI.2 1 10
# 3 UNI.3 2 30
# 4 UNI.4 1 20
# 5 UNI.5 1 5

Age groups into monthly buckets

I'm struggling to find a solution for the following problem. I have a df with ids/dobs and another monthbucket df as follows:
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10),
id = seq(1:10) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
I want to get an output which gives me the count of members within age groups (<19, 19-64, >64) for each of my monthly buckets. The count obviously switches over the year when people have birthdays.
I got the age calculation with something like:
library(lubridate) # interval() and as.period() come from lubridate
age.fct <- function(dob, bucketdate) {
period <- as.period(interval(dob, bucketdate), unit = "year")
period$year}
I guess the general approach would be to calculate the age for each monthbucket, assign into one of the 3 age groups and count it up by month. Any suggestions?
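For instance, the bucketing step could look like this (a minimal illustration of the idea, assuming lubridate is loaded and age.fct is defined as above; not a full solution):
# age of the first person at the start of each bucket, cut into the three groups
ages <- age.fct(df$dob[1], monthbucket$startmonth)
cut(ages, breaks = c(-Inf, 18, 64, Inf), labels = c("<19", "19-64", ">64"))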
EDIT 1.
Thanks for all the different approaches. I just ran a brief benchmark on the solutions to determine which answer to accept. Somehow the data.table solution didn't work on my test data set, but I will check as soon as I have a few minutes in the next few days.
set.seed(33)
df <- data.frame(dob = sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10000),
id = seq(1:10000) )
monthbucket <- data.frame(month = format(seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),'%Y-%m'),
startmonth = seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months"),
endmonth = seq(as.Date("2010-02-01"),as.Date("2011-02-01"),by="months")-1)
birth_days <- df$dob
month_bucket <- monthbucket$startmonth
and the benchmark
microbenchmark::microbenchmark(
MM= monthbucket %>% group_by_all %>% expand(id=df$id) %>% left_join(.,{df %>% mutate(birth_month =cut(dob, "month"))},by="id") %>% mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19", age>64 ~ ">64",TRUE ~ "19-64")) %>% group_by(month) %>% count(age_cat) %>% gather(variable, count, n) %>%
unite(variable, age_cat) %>% spread(variable, count)
,
AkselA = {ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame, lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
},
Cole1 ={t(table(apply(X = outer(month_bucket, birth_days, `-`) / 365.25, MARGIN = 2, FUN = cut, c(0,19,65, Inf)), rep(format(month_bucket,'%Y-%m'), length(birth_days))))
},
# cole2={ cast(CJ(month_bucket, birth_days)[, .N, by = .(month_bucket , cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))], month_bucket ~ cut, value.var = 'N')
# },
#
Cole3={crossing(month_bucket, birth_days)%>%count(month_bucket, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf)))%>%spread(age_range, n)
},
Cole4={all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(data = aggregate( all_combos$month_bucket, by = list(bucket = all_combos$month_bucket,age_group = all_combos$cut_r), FUN = length), timevar = 'age_group' , idvar = 'bucket', direction = 'wide' )
},
times = 1L)
Unit: milliseconds
expr min lq mean median uq max neval
MM 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 4249.02810 1
AkselA 17.12697 17.12697 17.12697 17.12697 17.12697 17.12697 1
Cole1 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 3237.94534 1
Cole3 23.63945 23.63945 23.63945 23.63945 23.63945 23.63945 1
Cole4 877.92782 877.92782 877.92782 877.92782 877.92782 877.92782 1
Based on speed, AkselA's approach seems to be the fastest, but I get a different result for M-M's approach compared to all the others (once the 64 in AkselA's cut breaks, cut(..., c(0, 19, 64, Inf)), is changed to 65). I will accept an answer based on speed but will look into the differences in the results!
Not very sophisticated, but I joined the two tables (after first expanding monthbucket on df$id) and then calculated the age (as you have whole months, I just calculated difftime between the first day of the birth month and startmonth). Then, for each month (bucket), I counted the number in each age group and at the end converted long format to wide for better illustration.
library(lubridate)
library(tidyverse)
monthbucket %>%
group_by_all %>%
expand(id=df$id) %>%
left_join(.,{df %>%
mutate(birth_month =cut(dob, "month"))},
by="id") %>%
mutate(age=time_length(difftime(startmonth, birth_month),"years")) %>%
mutate(age_cat=case_when(age<19 ~ "<19",
age>64 ~ ">64",
TRUE ~ "19-64")) %>%
group_by(month) %>%
count(age_cat) %>%
gather(variable, count, n) %>%
unite(variable, age_cat) %>%
spread(variable, count)
#> # A tibble: 13 x 4
#> # Groups: month [13]
#> month `<19` `>64` `19-64`
#> <fct> <int> <int> <int>
#> 1 2010-01 3 2 5
#> 2 2010-02 3 2 5
#> 3 2010-03 3 2 5
#> 4 2010-04 3 2 5
#> 5 2010-05 3 2 5
#> 6 2010-06 3 2 5
#> 7 2010-07 3 2 5
#> 8 2010-08 3 2 5
#> 9 2010-09 3 2 5
#> 10 2010-10 3 2 5
#> 11 2010-11 3 2 5
#> 12 2010-12 3 2 5
#> 13 2011-01 3 2 5
Created on 2019-07-03 by the reprex package (v0.3.0)
Assuming I understand your request.
ages <- as.data.frame(t(unclass(outer(monthbucket$startmonth, df$dob, "-")/365.25)))
ages <- do.call(data.frame,
lapply(ages, cut, c(0, 19, 64, Inf), c("0-19", "19-64", "64+")))
ages <- sapply(ages, table)
colnames(ages) <- monthbucket$month
ages
# 2010-01 2010-02 2010-03 2010-04 2010-05 2010-06 2010-07 2010-08 2010-09 2010-10 2010-11 2010-12 2011-01
# 0-19 2 2 2 2 2 2 2 2 2 2 2 2 2
# 19-64 7 7 7 7 7 7 7 7 7 7 7 7 7
# 64+ 1 1 1 1 1 1 1 1 1 1 1 1 1
#
There are some similarities to @AkselA's answer as it depends on outer(), cut(), and table().
set.seed(33)
birth_days <- sample(seq(as.Date('1940/01/01'), as.Date('2010/01/01'), by="day"), 10)
month_bucket <- seq(as.Date("2010-01-01"),as.Date("2011-01-01"),by="months")
t(
table(
apply(
X = outer(month_bucket, birth_days, `-`) / 365.25
, MARGIN = 2
, FUN = cut, c(0,19,65, Inf)
)
, rep(format(month_bucket,'%Y-%m'), length(birth_days))
)
)
(0,19] (19,65] (65,Inf]
2010-01 2 7 1
2010-02 2 7 1
2010-03 2 7 1
2010-04 2 7 1
2010-05 2 7 1
2010-06 2 7 1
2010-07 2 7 1
2010-08 2 7 1
2010-09 2 7 1
2010-10 2 7 1
2010-11 2 7 1
2010-12 2 7 1
2011-01 2 7 1
I felt weird having such a similar solution so here is data.table:
library(data.table)
dcast(
CJ(month_bucket, birth_days
)[, .N
, by = .(month_bucket
, cut(as.numeric(month_bucket - birth_days)/365.25, c(0,19,65,Inf)))
]
, month_bucket ~ cut
, value.var = 'N')
dplyr and tidyr:
library(dplyr)
library(tidyr)
crossing(month_bucket, birth_days)%>%
count(month_bucket
, age_range = cut(as.numeric(month_bucket - birth_days) / 365.25, c(0,19,65,Inf))
)%>%
spread(age_range, n)
And a similar approach in base that I'm not completely happy with.
all_combos <- expand.grid(month_bucket = month_bucket, birth_days = birth_days)
all_combos$age <- as.numeric(all_combos$month_bucket - all_combos$birth_days) / 365.25
all_combos$cut_r <- cut(all_combos$age, c(0,19,65,Inf))
reshape(
data = aggregate(
all_combos$month_bucket
, by = list(bucket = all_combos$month_bucket
,age_group = all_combos$cut_r)
, FUN = length)
, timevar = 'age_group'
, idvar = 'bucket'
, direction = 'wide'
)

data.table equivalent of tidyr::complete()

tidyr::complete() adds rows to a data.frame for combinations of column values that are missing from the data. Example:
library(dplyr)
library(tidyr)
df <- data.frame(person = c(1,2,2),
observation_id = c(1,1,2),
value = c(1,1,1))
df %>%
tidyr::complete(person,
observation_id,
fill = list(value=0))
yields
# A tibble: 4 × 3
person observation_id value
<dbl> <dbl> <dbl>
1 1 1 1
2 1 2 0
3 2 1 1
4 2 2 1
where the value of the combination person == 1 and observation_id == 2 that is missing in df has been filled in with a value of 0.
What would be the equivalent of this in data.table?
I reckon that the philosophy of data.table entails fewer specially-named functions for tasks than you'll find in the tidyverse, so some extra coding is required, like:
res = setDT(df)[
CJ(person = person, observation_id = observation_id, unique=TRUE),
on=.(person, observation_id)
]
After this, you still have to manually handle the filling of values for missing levels. We can use setnafill to handle this efficiently & by-reference in recent versions of data.table:
setnafill(res, fill = 0, cols = 'value')
See @Jealie's answer regarding a feature that will sidestep this.
Certainly, it's crazy that the column names have to be entered three times here. But on the other hand, one can write a wrapper:
completeDT <- function(DT, cols, defs = NULL){
mDT = do.call(CJ, c(DT[, ..cols], list(unique=TRUE)))
res = DT[mDT, on=names(mDT)]
if (length(defs))
res[, names(defs) := Map(replace, .SD, lapply(.SD, is.na), defs), .SDcols=names(defs)]
res[]
}
completeDT(setDT(df), cols = c("person", "observation_id"), defs = c(value = 0))
person observation_id value
1: 1 1 1
2: 1 2 0
3: 2 1 1
4: 2 2 1
As a quick way of avoiding typing the names three times for the first step, here's @thelatemail's idea:
vars <- c("person","observation_id")
df[do.call(CJ, c(mget(vars), unique=TRUE)), on=vars]
# or with magrittr...
c("person","observation_id") %>% df[do.call(CJ, c(mget(.), unique=TRUE)), on=.]
Update: now you don't need to enter names twice in CJ, thanks to @MichaelChirico & @MattDowle for the improvement.
There might be a better answer out there, but this works:
dt[CJ(person=unique(dt$person),
observation_id=unique(dt$observation_id)),
on=c('person','observation_id')]
Which gives:
person observation_id value
1: 1 1 1
2: 2 1 1
3: 1 2 NA
4: 2 2 1
Now, if you would like to be able to fill with any value (and not NA), I would suggest waiting for the corresponding feature to be finished, or contributing to it :)
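In the meantime, a quick way to fill the NAs after the join by reference (a sketch assuming dt is the question's df converted to a data.table; not part of the original answer):
library(data.table)
dt <- as.data.table(df)  # assumption: dt is the question's df as a data.table
res <- dt[CJ(person = unique(dt$person),
observation_id = unique(dt$observation_id)),
on = c('person', 'observation_id')]
res[is.na(value), value := 0][]  # replace the NA fills with 0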
It is worth noting that the completeDT function above doesn't carry many of the features that tidyr::complete does. In particular, empty factor levels are dropped, unlike tidyr::complete which keeps them. If you do want to keep empty factor levels, the function can be edited as below. The make_vals function below could be made more sophisticated to handle other variable classes, e.g. a full sequence for integers.
library(magrittr)
library(data.table)
dat <- data.frame(
person = c(1,2,2),
observation_id = factor(c(1,1,2), 1:3),
value = c(1,1,1))
dat %>%
tidyr::complete(
person, observation_id, fill = list(value=0))
#> # A tibble: 6 x 3
#> person observation_id value
#> <dbl> <fct> <dbl>
#> 1 1 1 1
#> 2 1 2 0
#> 3 1 3 0
#> 4 2 1 1
#> 5 2 2 1
#> 6 2 3 0
completeDT <- function(DT, cols, defs = NULL){
make_vals <- function(col) {
if(is.factor(col)) factor(levels(col))
else unique(col)
}
mDT = do.call(CJ, c(lapply(DT[, ..cols], make_vals), list(unique=TRUE)))
res = DT[mDT, on=names(mDT)]
if (length(defs))
res[, names(defs) := Map(replace, .SD, lapply(.SD, is.na), defs), .SDcols=names(defs)]
res[]
}
completeDT(DT = setDT(dat), cols = c("person", "observation_id"), defs = c(value = 0))
#> person observation_id value
#> 1: 1 1 1
#> 2: 1 2 0
#> 3: 1 3 0
#> 4: 2 1 1
#> 5: 2 2 1
#> 6: 2 3 0
Created on 2021-03-08 by the reprex package (v0.3.0)
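As a sketch of the "full sequence for integers" idea mentioned above (one possible extension, my assumption rather than tested code):
make_vals <- function(col) {
# factors: keep all declared levels; integer columns: expand to the full range;
# everything else: unique observed values
if (is.factor(col)) factor(levels(col))
else if (is.integer(col)) seq(min(col, na.rm = TRUE), max(col, na.rm = TRUE))
else unique(col)
}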

n objects, a value for each combination of two objects, find minimum value for each object in R

I want to find the minimum value associated with an object out of a dataframe. The dataframe contains two columns representing all combinations of the objects and a value-column for each combination. It looks like this:
id_A id_B dist
206 208 2385.5096
207 208 467.8890
207 209 576.4631
...
208 209 1081.539
208 210 8214.439
...
I tried the following recommended dplyr functions:
df %>%
group_by(id_A) %>%
slice(which.min(dist))
But it does not create the desired output:
id_A id_B dist
...
207 208 467.8890
208 209 1081.5393
...
Note that for id 208 the combination with id 207 has the lowest value, but it is not associated with id 208 (when 208 is in the grouped-by column).
I wrote a function that does this correctly, but since I have many entries it is way too slow. It's a loop that subsets the data to all entries containing a specific id, then finds the minimum within that subset and associates that value with the id.
Do you have an idea how to make this fast, e.g. using dplyr?
The issue boils down to needing a long (rather than wide) data format. First, here are some reproducible data (using the pipe from dplyr):
df <-
LETTERS[1:4] %>%
combn(2) %>%
t %>%
data.frame() %>%
mutate(val = 1:n()) %>%
setNames( c("id_A", "id_B", "dist") )
gives:
id_A id_B dist
1 A B 1
2 A C 2
3 A D 3
4 B C 4
5 B D 5
6 C D 6
What we want is a pair of columns matching each category with the distance from its row. For this, I am using gather from tidyr. It creates new columns telling us which column the data came from and what value it held. Here, we are telling it to pull from columns id_A and id_B to give us the category for each ID entry (it then duplicates the dist column as necessary):
df %>%
gather(whichID, Category, id_A, id_B)
Gives
dist whichID Category
1 1 id_A A
2 2 id_A A
3 3 id_A A
4 4 id_A B
5 5 id_A B
6 6 id_A C
7 1 id_B B
8 2 id_B C
9 3 id_B D
10 4 id_B C
11 5 id_B D
12 6 id_B D
We can then pass that data.frame to group_by and then use summarise to give us whatever information we wanted. I know that you didn't ask for the max, but I am including it just to show the general syntax you can use to get whatever type of result you want:
df %>%
gather(whichID, Category, id_A, id_B) %>%
group_by(Category) %>%
summarise(minDist = min(dist)
, maxDist = max(dist))
Returns:
Category minDist maxDist
<chr> <int> <int>
1 A 1 3
2 B 1 5
3 C 2 6
4 D 3 6
I just looked at the question and realized that you wanted to also display which comparison had the minimum value. Here is an approach that does that by tracking an index of the match (so that it is replicated when gathering) and then pulls the correct row from the original df and pastes together the two comparison values:
df %>%
mutate(whichComparison = 1:n()) %>%
gather(whichID, Category, id_A, id_B) %>%
group_by(Category) %>%
summarise(minDist = min(dist)
, whichMin = whichComparison[which.min(dist)]
, maxDist = max(dist)
, whichMax = whichComparison[which.max(dist)]) %>%
mutate(
minComp = sapply(whichMin, function(x){
paste(df[x, "id_A"], df[x, "id_B"], sep = " vs " )})
, maxComp = sapply(whichMax, function(x){
paste(df[x, "id_A"], df[x, "id_B"], sep = " vs " )})
)
returns
Category minDist whichMin maxDist whichMax minComp maxComp
<chr> <int> <int> <int> <int> <chr> <chr>
1 A 1 1 3 3 A vs B A vs D
2 B 1 1 5 5 A vs B B vs D
3 C 2 2 6 6 A vs C C vs D
4 D 3 3 6 6 A vs D C vs D
If you really want a single column giving which comparison gave the min value (and the max, in my output), you can instead use the index to pull both the id_A and id_B from the original df, knock out the one that matches the Category of interest, then use use_first_valid_of from the package janitor to grab just the one you are interested in. Because this generated a large number of intermediate columns, I am using select to clean things back up:
df %>%
mutate(whichComparison = 1:n()) %>%
gather(whichID, Category, id_A, id_B) %>%
group_by(Category) %>%
summarise(minDist = min(dist)
, maxDist = max(dist)
, whichMin = whichComparison[which.min(dist)]
, whichMax = whichComparison[which.max(dist)]) %>%
mutate(
minA = df$id_A[whichMin]
, minB = df$id_B[whichMin]
, maxA = df$id_A[whichMax]
, maxB = df$id_B[whichMax]
) %>%
mutate_each(funs(ifelse(. == Category, NA, as.character(.)) )
, minA:maxB) %>%
mutate(minComp = use_first_valid_of(minA, minB)
, maxComp = use_first_valid_of(maxA, maxB)) %>%
select(-(whichMin:maxB))
returns:
Category minDist maxDist minComp maxComp
<chr> <int> <int> <chr> <chr>
1 A 1 3 B D
2 B 1 5 A D
3 C 2 6 A D
4 D 3 6 A C
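One caveat worth flagging (my note, not part of the original answer): if I remember correctly, janitor later deprecated use_first_valid_of in favour of dplyr::coalesce, so it may be unavailable in recent janitor versions. coalesce does the same job here, picking the first non-NA value element-wise:
library(dplyr)
coalesce(c(NA, "B", NA), c("A", "x", NA))
# [1] "A" "B" NA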
An alternative approach is to first convert the distance pairs to a matrix. Here, I first duplicate the comparisons in the reverse order to ensure that the matrix is complete (using tidyr to spread):
bind_rows(
df
, rename(df, id_A = id_B, id_B = id_A)
) %>%
spread(id_B, dist)
returns:
id_A A B C D
1 A NA 1 2 3
2 B 1 NA 4 5
3 C 2 4 NA 6
4 D 3 5 6 NA
Then, we just apply across rows much like we would if we were working from a distance matrix (which may be where your data actually started):
bind_rows(
df
, rename(df, id_A = id_B, id_B = id_A)
) %>%
spread(id_B, dist) %>%
mutate(
minDist = apply(as.matrix(.[, -1]), 1, min, na.rm = TRUE)
, minComp = names(.)[apply(as.matrix(.[, -1]), 1, which.min) + 1]
, maxDist = apply(as.matrix(.[, -1]), 1, max, na.rm = TRUE)
, maxComp = names(.)[apply(as.matrix(.[, -1]), 1, which.max) + 1]
) %>%
select(Category = `id_A`
, minDist:maxComp)
returns:
Category minDist minComp maxDist maxComp
1 A 1 B 3 D
2 B 1 A 5 D
3 C 2 A 6 D
4 D 3 A 6 C
