I want to join two tibbles by a range or a virtual column, but it seems the by parameter only accepts a character vector of existing column names.
In my example I have a tibble d with a column value, and a tibble r with a from and a to column.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
Now I want to join the value column in d within the range of from and to in r (pseudocode):
d %>% inner_join(r, by = "value between from and to") # >= and <
I can't find a way to do this, so I tried to join the floor of value in d with the from column in r:
d %>% inner_join(r, by = c("floor(value)" = "from"))
Of course I can create a second column to solve that:
d %>%
  mutate(join_value = floor(value)) %>%
  inner_join(r, by = c("join_value" = "from")) %>%
  select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
But isn't there a more convenient way?
Thanks
I don't think inequality joins are implemented in dplyr yet, or ever will be (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)
as_tibble(sqldf("select d.value, r.class from d
                 join r on d.value >= r.'from' and
                           d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)
d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"),
             match_fun = list(`>=`, `<`)) %>%
  select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from and to, since those are reserved words in SQL.
OK, thanks for the advice; this was pretty interesting. I finally wrote a range_join function (inspired by @ycw's code) and compared all the described solutions with respect to runtime.
I like fuzzy_join, but with only 50k rows in d it needs more than 40 seconds. That's too slow.
Here is the comparison with 5k rows in d:
library(dplyr)
library(fuzzyjoin)
library(sqldf)
# join by range by #WiWeber
range_join <- function(x, y, value, left, right){
  x_result <- tibble()
  # for each row of y, bind the rows of x whose value falls in [left, right)
  for (y_ in split(y, 1:nrow(y))) {
    x_result <- x_result %>%
      bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]], ] %>% cbind(y_))
  }
  return(x_result)
}
# dynamic join by #ycw
dynamic_join <- function(d, r){
  d$type <- NA_character_
  # assign each type in turn: values in [from, to) get that row's type
  for (r_ in split(r, r$type)) {
    d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
  }
  return(d)
}
d <- tibble(value = seq(1, 6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1, 6), to = c(seq(2, 6), Inf), type = LETTERS[seq(1, 6)], join = TRUE)
# join = TRUE serves as a dummy key for the cross-join approach below
# #useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- as_tibble(sqldf("select d.value, r.type from d
                       join r on d.value >= r.'from' and
                                 d.value < r.'to'"))
Sys.time() - start

# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
  select(value, type)
Sys.time() - start

# #jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start

# #WiWeber floor
start <- Sys.time()
d2 <- d %>%
  mutate(join_value = floor(value)) %>%
  inner_join(r, by = c("join_value" = "from")) %>%
  select(value, type)
Sys.time() - start

# #WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
  inner_join(r, by = "join") %>%
  filter(value >= from, value < to) %>%
  select(value, type)
Sys.time() - start

# #hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
  mutate(
    type = unlist(sapply(value, function(x) r[which(x >= r$from & x < r$to), "type"]))
  ) %>%
  select(value, type)
Sys.time() - start

# #ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start

# #WiWeber range_join
start <- Sys.time()
d2 <- d %>%
  range_join(r, "value", "from", "to") %>%
  select(value, type)
Sys.time() - start
Results:
# #useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# #jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# #WiWeber floor
Time difference of 0.02223396 secs
# #WiWeber cross join - filter
Time difference of 0.0201931 secs
# #hardik-gupta sapply
Time difference of 5.166633 secs
# #ycw re-dynamic join
Time difference of 0.03124094 secs
# #WiWeber range_join
Time difference of 0.02691698 secs
Greetings, WiWeber
You can use the cut function to create a class column in d and then use a left join.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to adjust the breaks and labels passed to cut.
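A sketch that derives the breaks and labels from r instead of hardcoding them (assuming, as here, that r is sorted by from and its intervals are contiguous):
d[["class"]] <- cut(d[["value"]],
                    breaks = unique(c(r$from, r$to)),  # 1, 2, 3, 4, 5, 6, Inf
                    labels = r$class,
                    right = FALSE)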
We can use sapply for this
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = unlist(sapply(d$value, function(x)
                r[which(x >= r$from & x < r$to), "class"]))))
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
We can use mutate and case_when from dplyr.
library(dplyr)
d2 <- d %>%
  mutate(class = case_when(
    value >= 1 & value < 2 ~ "A",
    value >= 2 & value < 3 ~ "B",
    value >= 3 & value < 4 ~ "C",
    value >= 4 & value < 5 ~ "D",
    value >= 5 & value < 6 ~ "E",
    value >= 6 ~ "F"
  ))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
  if (!("class" %in% colnames(d))){
    d[["class"]] <- NA_character_
  }
  d <- d %>%
    mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
  return(d)
}

re_dynamic_join <- function(d, r){
  r_list <- split(r, r$class)
  for (i in 1:length(r_list)){
    d <- dynamic_join(d, r_list[[i]])
  }
  return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
I really liked @WiWeber's range_join function, but it gives an error if a record is not within any range. Here's a modification:
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
  all_matches <- tibble()
  x <- as.data.frame(x)
  y <- as.data.frame(y)
  x$index <- x[, value]
  for (i in 1:nrow(y)){
    matches <- x %>% filter(index >= y[i, left] & index <= y[i, right])
    if (nrow(matches) > 0){
      all_matches <- all_matches %>% bind_rows(matches %>% cbind(y[i, ]))
    }
  }
  all_matches <- all_matches %>% select(-index)
  return(all_matches)
}
data <- d %>%
  range_join(r, "value", "from", "to")
data
Related
library(dplyr)
a = tibble(x = runif(1000, 0, 10),
           t = rpois(1000, 4)) %>% arrange(t)
I want a column l that, for each row of a, averages the subset of x values whose t is less than that row's t.
Expected result:
for x[t=0], l = NaN
for x[t=1], l = mean(x[t<1])
for x[t=2], l = mean(x[t<2])
etc.
Code that does not work:
a %>%
  mutate(
    l = mean(x[a$t < .$t])
  ) -> a
Now this code would work:
for (i in c(1:1000)) {
  a$l[i] = mean(a$x[a$t < a$t[i]])
}
But it is not a mutate. I'd like a mutate so I can apply it to groups, etc.
To understand the issue better: imagine that you have to average all the x values before a given date. Now: this, dynamically, in a mutate.
I think that purrr may be necessary but I hate it.
You can use map with mutate:
library(tidyverse)
f <- function(lim) mean(a$x[a$t < lim])
a %>% mutate(l = map_dbl(t, f))
Testing against OP solution:
res <- a %>% mutate(l = map_dbl(t, f))
l <- vector(mode = "numeric", length = 1000)
for (i in c(1:1000)) l[i] = mean(a$x[a$t < a$t[i]])
assertthat::are_equal(res$l, l) # TRUE
For each t value you can calculate the average value of x, and then take the lagged value of the cumulative mean.
library(dplyr)
a %>%
  group_by(t) %>%
  summarise(l = mean(x)) %>%
  mutate(l = lag(cummean(l)))
# t l
# <int> <dbl>
# 1 0 NA
# 2 1 5.33
# 3 2 5.45
# 4 3 5.36
# 5 4 5.26
# 6 5 5.16
# 7 6 5.10
# 8 7 5.07
# 9 8 5.12
#10 9 4.96
#11 10 4.98
#12 11 5.15
#13 12 4.93
If you want to maintain the number of rows in the dataframe, add %>% left_join(a, by = 't') to the above answer.
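For completeness, a sketch of the full pipeline with that join appended (all 1000 rows of a are kept, each getting the lagged mean for its t):
library(dplyr)
a %>%
  group_by(t) %>%
  summarise(l = mean(x)) %>%
  mutate(l = lag(cummean(l))) %>%
  left_join(a, by = 't')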
data
set.seed(123)
a = tibble(x = runif(1000, 0, 10),
           t = rpois(1000, 4)) %>% arrange(t)
I have this reproducible dataframe:
df <- data.frame(ID = c("A", "A", "B", "B", "B", "C", "C", "D"),
                 cost = c("0.5", "0.4", "0.7", "0.8", "0.5", "1.3", "1.3", "2.6"))
I'm trying to group by ID, to test whether there are differences in the cost column, and to record the result in a new column called Testdiff.
Intermediate Output
ID cost Testdiff
1 A 0.5 Y
2 A 0.4 Y
3 B 0.7 Y
4 B 0.8 Y
5 B 0.5 Y
6 C 1.3 N
7 C 1.3 N
8 D 2.6 N
I'm looking at using dplyr to do this, but I'm unsure if match is the correct function.
df %>% group_by(ID) %>% mutate(Testdiff = ifelse(match(cost) == T, "Y", "N"))
Once that is completed, I want to keep the first row of each unique ID, giving me this output:
ID cost Testdiff
1 A 0.5 Y
2 B 0.7 Y
3 C 1.3 N
4 D 2.6 N
We could use n_distinct and then slice
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(Testdiff = n_distinct(cost) > 1) %>%
  slice(1)
# ID cost Testdiff
# <fct> <fct> <lgl>
#1 A 0.5 TRUE
#2 B 0.7 TRUE
#3 C 1.3 FALSE
#4 D 2.6 FALSE
If you want output to be "Y"/"N" instead of TRUE/FALSE
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(n_distinct(cost) > 1, "Y", "N")) %>%
  slice(1)
We could use ave and aggregate to solve it using base R
df$Testdiff <- ifelse(with(df, ave(cost, ID, FUN = function(x)
  length(unique(x)))) > 1, "Y", "N")
aggregate(. ~ ID, df, head, n = 1)
# ID cost Testdiff
#1 A 0.5 Y
#2 B 0.7 Y
#3 C 1.3 N
#4 D 2.6 N
Since we have dplyr and base R already, why not add data.table:
library(data.table)
setDT(df)[, .(cost = cost[1], testdiff = uniqueN(cost) > 1), by = ID]
ID cost testdiff
1: A 0.5 TRUE
2: B 0.7 TRUE
3: C 1.3 FALSE
4: D 2.6 FALSE
A different tidyverse possibility could be:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(all(cost == first(cost)), "N", "Y")) %>%
  filter(row_number() == 1)
ID cost Testdiff
<fct> <fct> <chr>
1 A 0.5 Y
2 B 0.7 Y
3 C 1.3 N
4 D 2.6 N
Or:
df %>%
  group_by(ID) %>%
  mutate(Testdiff = ifelse(all(cost == first(cost)), "N", "Y")) %>%
  top_n(1, wt = desc(row_number()))
I have a dataframe whose row values are ordered from smallest to largest. I compute the differences between adjacent rows, combine rows with similar differences (e.g., smaller than 1), and return the averaged values of the combined rows. I could check the row differences with a for loop, but that seems very inefficient. Any better ideas? Thanks.
library(dplyr)
DF <- data.frame(ID = letters[1:12],
                 Values = c(1, 2.2, 3, 5, 6.2, 6.8, 7, 8.5, 10, 12.2, 13, 14))
DF <- DF %>%
  mutate(Diff = c(0, diff(Values)))
The expected output of DF would be
ID Values
a 1.0
b/c 2.6 # (2.2+3.0)/2
d 5.0
e/f/g 6.67 # (6.2+6.8+7.0)/3
h 8.5
i 10.0
j/k 12.6 # (12.2+13.0)/2
l 14.0
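For reference, a minimal sketch of the row-by-row loop alluded to above (assuming DF is already sorted by Values):
grp <- integer(nrow(DF))
grp[1] <- 1L
for (i in 2:nrow(DF)) {
  # start a new group whenever the gap to the previous row is at least 1
  grp[i] <- if (DF$Values[i] - DF$Values[i - 1] < 1) grp[i - 1] else grp[i - 1] + 1L
}
data.frame(ID = tapply(as.character(DF$ID), grp, paste, collapse = "/"),
           Values = tapply(DF$Values, grp, mean))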
Here is an option with data.table
library(data.table)
setDT(DF)[, .(ID = toString(ID), Values = round(mean(Values), 2)),
          by = .(Diff = cumsum(c(TRUE, diff(Values) >= 1)))][, -1, with = FALSE]
# ID Values
#1: a 1.00
#2: b, c 2.60
#3: d 5.00
#4: e, f, g 6.67
#5: h 8.50
#6: i 10.00
#7: j, k 12.60
#8: l 14.00
Calculate the difference between the Values of adjacent rows and check whether those differences are >= 1. The cumulative sum of that logical vector gives distinct groups, over which you can summarise to get the desired result.
library(dplyr)
DF %>% arrange(Values) %>%
  group_by(Diff = cumsum(c(1, diff(Values)) >= 1)) %>%
  summarise(ID = paste0(ID, collapse = "/"), Values = mean(Values)) %>%
  ungroup() %>% select(-Diff)
# # A tibble: 8 x 2
# ID Values
# <chr> <dbl>
# 1 a 1.00
# 2 b/c 2.60
# 3 d 5.00
# 4 e/f/g 6.67
# 5 h 8.50
# 6 i 10.0
# 7 j/k 12.6
# 8 l 14.0
library(magrittr)
df <- DF[order(DF$Values), ]
df$Values %>%
  # find the group for each row
  outer(., ., function(x, y) x >= y & x < y + 1) %>%
  # remove sub-groups
  `[<-`(apply(., 1, cumsum) > 1, F) %>%
  # remove sub-group columns
  .[, colSums(.) > 0] %>%
  # select these groups from the data
  apply(2, function(x) data.frame(ID = paste(df$ID[x], collapse = '/'),
                                  Values = mean(df$Values[x]))) %>%
  # bind results by row
  do.call(what = rbind)
# ID Values
# 1 a 1.000000
# 2 b/c 2.600000
# 4 d 5.000000
# 5 e/f/g 6.666667
# 8 h 8.500000
# 9 i 10.000000
# 10 j/k 12.600000
# 12 l 14.000000
Note:
This method differs from those using diff because it groups rows together only if all Values are within < 1 of each other.
Example:
Change the dataset so that the Value is 7.3 at ID g.
Above method: IDs e, f, and g are no longer grouped together, because the value at ID e is 6.2 and 7.3 - 6.2 > 1.
Diff method: IDs e, f, and g are still grouped together, because the diff between e and f is < 1 and the diff between f and g is < 1.
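A quick check of that divergence (hypothetical, reusing DF from the question with g changed to 7.3):
DF2 <- DF[order(DF$Values), ]
DF2$Values[DF2$ID == "g"] <- 7.3
# diff-based grouping keeps e, f, g together: the adjacent gaps (0.6 and 0.5) are both < 1
split(as.character(DF2$ID), cumsum(c(TRUE, diff(DF2$Values) >= 1)))
# the outer() method splits g off, since 7.3 does not fall within [6.2, 7.2)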
I have a data frame that's of this structure:
df <- data.frame(var1 = c(1, 1, 1, 2, 2, 3, 3, 3, 3),
                 cat1 = c("A", "B", "D", "B", "C", "D", "E", "B", "A"))
> df
var1 cat1
1 1 A
2 1 B
3 1 D
4 2 B
5 2 C
6 3 D
7 3 E
8 3 B
9 3 A
And I am looking to create both nodes and edges data frames from it, so that I can draw a network graph, using VisNetwork. This network will show the number/strength of connections between the different cat1 values, as grouped by the var1 value.
I have the nodes data frame sorted:
nodes <- data.frame(id = unique(df$cat1))
> nodes
id
1 A
2 B
3 D
4 C
5 E
What I'd like help with is how to process df in the following manner:
For each distinct value of var1 in df, tally up the pairs of nodes that co-occur under that value of var1, to give an edges dataframe that ultimately looks like the one below. Note that I'm not bothered about the direction of flow along the edges; just that they are connected is all I need.
> edges
from to value
1 A B 2
2 A D 2
3 A E 1
4 B C 1
5 B D 2
6 B E 1
7 D E 1
With thanks in anticipation,
Nevil
Update: I found a similar problem here, and have adapted that code to give the following, which is getting close to what I want, but not quite there...
> df %>% group_by(var1) %>%
    filter(n() >= 2) %>%
    do(data.frame(t(combn(.$cat1, 2, function(x) sort(x))),
                  stringsAsFactors = FALSE))
# A tibble: 10 x 3
# Groups: var1 [3]
var1 X1 X2
<dbl> <chr> <chr>
1 1. A B
2 1. A D
3 1. B D
4 2. B C
5 3. D E
6 3. B D
7 3. A D
8 3. B E
9 3. A E
10 3. A B
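The step still missing above is to collapse the duplicate pairs into counts. A sketch, assuming the result above is stored in a data frame called pairs (a hypothetical name):
pairs %>%
  ungroup() %>%                       # drop the var1 grouping
  group_by(from = X1, to = X2) %>%
  summarise(value = n())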
I don't know if there is already a suitable function to achieve this task. Here is a detailed procedure to do it. With this, you should be able to define your own function. Hope it helps!
# create an adjacency matrix
mat <- table(df)
mat <- t(mat) %*% mat
as.table(mat) # look at your adjacency matrix
# since the network is not directed, we can consider only the (strictly) upper triangular matrix
mat[lower.tri(mat, diag = TRUE)] <- 0
as.table(mat) # look at the new adjacency matrix
library(dplyr)
edges <- as.data.frame(as.table(mat))
edges <- filter(edges, Freq != 0)
colnames(edges) <- c("from", "to", "value")
edges <- arrange(edges, from)
edges # output
# from to value
#1 A B 2
#2 A D 2
#3 A E 1
#4 B C 1
#5 B D 2
#6 B E 1
#7 D E 1
Here are a couple of other ways...
In base R...
values <- unique(df$var1[duplicated(df$var1)])
do.call(rbind,
        lapply(values, function(i) {
          nodes <- as.character(df$cat1[df$var1 == i])
          edges <- combn(nodes, 2)
          data.frame(from = edges[1, ],
                     to = edges[2, ],
                     value = i,
                     stringsAsFactors = F)
        })
)
in tidyverse...
library(dplyr)
library(tidyr)
df %>%
  group_by(var1) %>%
  filter(n() >= 2) %>%
  mutate(cat1 = as.character(cat1)) %>%
  summarise(edges = list(data.frame(t(combn(cat1, 2)), stringsAsFactors = F))) %>%
  unnest(edges) %>%
  select(from = X1, to = X2, value = var1)
in tidyverse using tidyr::complete...
library(dplyr)
library(tidyr)
df %>%
  group_by(var1) %>%
  mutate(cat1 = as.character(cat1)) %>%
  mutate(i.cat1 = cat1) %>%
  complete(cat1, i.cat1) %>%
  filter(cat1 < i.cat1) %>%
  select(from = cat1, to = i.cat1, value = var1)
in tidyverse using tidyr::expand...
library(dplyr)
library(tidyr)
df %>%
  group_by(var1) %>%
  mutate(cat1 = as.character(cat1)) %>%
  expand(cat1, to = cat1) %>%
  filter(cat1 < to) %>%
  select(from = cat1, to, value = var1)
I have a function that checks the number of zeros in each column of a large dataframe. Now I want to check the number of zeros in each column after grouping by category.
Here is the example:
zero_rate <- function(df) {
  z_rate_list <- sapply(df, function(x) {
    data.frame(
      n_zero = length(which(x == 0)),
      n = length(x),
      z_rate = length(which(x == 0)) / length(x))
  })
  d <- data.frame(z_rate_list)
  d <- sapply(d, unlist)
  d <- as.data.frame(d)
  return(d)
}
df <- data.frame(var1 = c(1, 0, NA, 4, NA, 6, 7, 0, 0, 10),
                 var2 = c(11, NA, NA, 0, NA, 16, 0, NA, 19, NA))
df1 <- data.frame(cat = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), df)
zero_rate_df <- df1 %>% group_by(cat) %>% do(zero_rate(.))
Here zero_rate(df) works just as I expected. But when I group the data by cat and calculate the zero rate per column within each category, the result is not what I expected.
I expect something like this:
cat           var1  var2
1   n_zero    1     1
    n         5     5
    z_rate    0.2   0.2
2   n_zero    2     1
    n         5     5
    z_rate    0.4   0.2
Any suggestion? Thank you.
I came up with the following code; .[-1] is used to remove the grouping column:
zero_rate <- function(df){
  res <- lapply(df, function(x){
    y <- c(sum(x == 0, na.rm = T), length(x))
    c(y, y[1]/y[2])
  })
  res <- do.call(cbind.data.frame, res)
  res$vars <- c('n_zero', 'n', 'z_rate')
  res
}

df1 %>% group_by(cat) %>% do(zero_rate(.[-1]))
# cat var1 var2 vars
# <dbl> <dbl> <dbl> <chr>
# 1 1 1.0 1.0 n_zero
# 2 1 5.0 5.0 n
# 3 1 0.2 0.2 z_rate
# 4 2 2.0 1.0 n_zero
# 5 2 5.0 5.0 n
# 6 2 0.4 0.2 z_rate