unique pairwise distances between any points in the dataframe - r

I have a list of ten points with X and Y coordinates. I would like to calculate the distances between all pairs of points, keeping only one distance per pair: precisely, only one of the two distances 1-2 and 2-1 should be present. I have managed to remove the distance of each point from itself, but couldn't get rid of the duplicated pairs.
# Data Generation
library(dplyr)
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = 1:10)
# Temporary key Creation
df <- df %>% mutate(key = 1)
# Calculating pairwise distances
df %>%
  full_join(df, by = "key") %>%
  mutate(dist = sqrt((X.x - X.y)^2 + (Y.x - Y.y)^2)) %>%
  select(ID.x, ID.y, dist) %>%
  filter(dist != 0) %>%
  head(11)
# Output
# ID.x ID.y dist
# 1 1 2 0.90858911
# 2 1 3 0.71154587
# 3 1 4 0.05687495
# 4 1 5 1.03885510
# 5 1 6 0.93747717
# 6 1 7 0.62070415
# 7 1 8 0.88351690
# 8 1 9 0.89651911
# 9 1 10 0.05079906
# 10 2 1 0.90858911
# 11 2 3 0.27530175
How can I achieve the expected output shown below?
# Expected Output
# ID.x ID.y dist
# 1 1 2 0.90858911
# 2 1 3 0.71154587
# 3 1 4 0.05687495
# 4 1 5 1.03885510
# 5 1 6 0.93747717
# 6 1 7 0.62070415
# 7 1 8 0.88351690
# 8 1 9 0.89651911
# 9 1 10 0.05079906
# 10 2 3 0.27530175
# 11 2 4 0.5415415
But this approach is computationally slow compared to dist(). I'd be happy to hear about faster approaches.

I would use dist on the data and then process the output into the required format. You can replace dist with any other distance function. Here I've used letters rather than numbers as the ID to better show what is happening:
library(dplyr)
library(tibble)
library(tidyr)
set.seed(42)
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = letters[1:10])
df %>%
  column_to_rownames("ID") %>% # make the ID the rownames; dist() will use these. NB: will not work on a tibble
  dist() %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column(var = "ID.x") %>% # capture the row IDs
  gather(key = ID.y, value = dist, -ID.x) %>%
  filter(ID.x < ID.y) %>%
  as_tibble()
# A tibble: 45 x 3
ID.x ID.y dist
<chr> <chr> <dbl>
1 a b 0.2623175
2 a c 0.7891034
3 b c 0.6856994
4 a d 0.2191960
5 b d 0.4757855
6 c d 0.8704269
7 a e 0.2730984
8 b e 0.3913770
9 c e 0.5912681
10 d e 0.2800021
# ... with 35 more rows
dist is very fast compared with looping through the distance calculations.
The code could probably be made more efficient still by working directly on the dist object rather than converting it into a matrix.
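For example, here is a sketch of that idea (my sketch, not part of the original answer; it assumes Euclidean distance and the df defined above). A dist object stores the lower triangle column by column, which is exactly the order in which combn() generates the pairs, so the matrix round-trip can be skipped entirely:
# build the pair table straight from the dist vector
d <- dist(df[, c("X", "Y")])
pairs <- t(combn(as.character(df$ID), 2)) # unordered ID pairs, in dist's storage order
data.frame(ID.x = pairs[, 1],
           ID.y = pairs[, 2],
           dist = as.vector(d))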

Perhaps this is a slightly simpler approach:
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = 1:10)
df2 <- data.frame(ID1 = rep(1:10, each = 10),
                  ID2 = 1:10,
                  # select only the coordinate columns; dist(df) would
                  # otherwise include ID in the distance calculation
                  distance = as.vector(as.matrix(dist(df[, c("X", "Y")]))))
Then get rid of diagonal:
df2 <- df2[df2$ID1 != df2$ID2,]
Get rid of upper triangle:
df2 <- df2[df2$ID1 < df2$ID2,]
df2
   ID1 ID2 distance
2    1   2      ...
3    1   3      ...
13   2   3      ...
... (45 pairs in total; the exact distances depend on the unseeded runif() draw)

How to make bins for each row of a dataframe using only the non-NA values?

I have a dataframe which contains values and NAs. Some rows have the NAs at the start, some have the NAs at the end.
# like this
df <- data.frame(A = c(1, 5, 6, 1, NA, NA),
                 B = c(1, 2, 3, 2, NA, NA),
                 C = c(1, 3, NA, 4, 3, NA),
                 D = c(1, 1, NA, 6, 10, NA),
                 E = c(1, NA, NA, 1, 1, 1),
                 F = c(1, NA, NA, 1, 1, 1))
Now I would like to build two bins from the non-NA values of each row and sum them up.
# expected output
Sum Bin
  3   1
  3   2
  7   1
  4   2
  6   1
  3   2
...
What I did was first separate the dataframe into two, based on whether the rows start or end with NAs, and then use a loop for the calculation.
df_bin <- data.frame(Sum = numeric(0), Bin = integer(0))
bin <- 2 # set the number of bins for the calculation
for (i in 1:nrow(df)) {
  l <- sum(!is.na(df[i, ]))  # number of non-NA values in the row
  ll <- as.integer(l / bin)  # size of each full bin
  s <- c()
  j <- 1
  while (j <= (bin - 1)) {
    s <- c(s, sum(df[i, (j * ll - ll + 1):(j * ll)]))
    j <- j + 1
  }
  s <- c(s, sum(df[i, ((bin - 1) * ll + 1):l])) # the last bin takes the remainder
  df_bin <- rbind(df_bin, data.frame(Sum = s, Bin = 1:bin))
}
But it runs very slowly; I was wondering if there is a more elegant way to do it. Thank you in advance :)
A pure tidyverse solution using pivoting:
library(tidyverse)
df %>%
  mutate(orig_row = 1:n()) %>%
  pivot_longer(-orig_row) %>%
  filter(!is.na(value)) %>%
  group_by(orig_row) %>%
  # first half of each row's non-NA values -> Bin 1, second half -> Bin 2
  mutate(Bin = round(1 + seq(0, n() - 1) / n())) %>%
  group_by(orig_row, Bin) %>%
  summarise(Sum = sum(value)) %>%
  ungroup() %>%
  select(-orig_row)
Result:
# A tibble: 12 x 2
Bin Sum
<dbl> <dbl>
1 1 3
2 2 3
3 1 7
4 2 4
5 1 6
6 2 3
7 1 7
8 2 8
9 1 13
10 2 2
11 1 1
12 2 1
You can try using apply:
do.call(rbind, apply(df, 1, function(x) {
  # Remove NA values
  x1 <- na.omit(x)
  # Calculate length of non-NA values
  n <- length(x1)
  # Calculate mid point
  half_len <- round(n / 2)
  # Create dataframe with sum of the two bin values
  data.frame(Sum = c(sum(x1[1:half_len]), sum(x1[(half_len + 1):n])),
             Bin = 1:2)
}))
# Sum Bin
#1 3 1
#2 3 2
#3 7 1
#4 4 2
#5 6 1
#6 3 2
#7 7 1
#8 8 2
#9 13 1
#10 2 2
#11 1 1
#12 1 2
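If you ever need more than two bins, the same apply() pattern generalises. A sketch (the bin_row_sums helper is mine, not from the answer above; it assumes each row has at least as many non-NA values as bins):
bin_row_sums <- function(x, bins = 2) {
  x1 <- na.omit(x)
  # split the non-NA positions into roughly equal consecutive groups
  grp <- cut(seq_along(x1), breaks = bins, labels = FALSE)
  data.frame(Sum = as.vector(tapply(x1, grp, sum)), Bin = seq_len(bins))
}
do.call(rbind, apply(df, 1, bin_row_sums, bins = 2))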

programming R loop

I need help with programming in R. I have a data.frame B with one column
x <- c("300","300","300","400","400","400","500","500","500", ...etc.) **2 million rows**
and I need to create a second column with a rank that restarts whenever the value changes. The new column should look like
y <- c(1,2,3,1,2,3,1,2,3, ...etc.)
I used a for loop:
B$y[1] <- 1
for (i in 2:length(B$x)) {
  B$y[i] <- ifelse(B$x[i] == B$x[i-1], B$y[i-1] + 1, 1)
}
The process ran for 4 hours, so I need help speeding it up or any other approach.
Thanks in advance for your answers.
Here is a solution with base R:
B <- data.frame(x = rep(c(300, 400, 400), sample(c(5:10), 3)))
B
B$y <- ave(B$x, B$x, FUN=seq_along)
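For 2 million rows, a data.table sketch is also worth trying (my addition, not part of the answer above). rleid() numbers consecutive runs of x, so the counter resets whenever the value changes from the previous row, exactly like the original loop:
library(data.table)
setDT(B)
# seq_len(.N) restarts at every change of value
B[, y := seq_len(.N), by = rleid(x)]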
Here's an approach with dplyr that takes about 0.2 seconds on 2 million rows.
First I make sample data:
n = 2E6 # number of rows in test
library(dplyr)
sample_data <- data.frame(
  x = round(runif(n = n, min = 1, max = 100000), digits = 0)
) %>%
  arrange(x) # Optional; added to make the output clearer, so that each x is adjacent to the others that match.
Then I group by x and make y show which occurrence number of x each row is within that group.
sample_data_with_rank <- sample_data %>%
  group_by(x) %>%
  mutate(y = row_number()) %>%
  ungroup()
head(sample_data_with_rank, 20)
# A tibble: 20 x 2
x y
<dbl> <int>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 5
6 1 6
7 1 7
8 1 8
9 1 9
10 1 10
11 1 11
12 1 12
13 1 13
14 1 14
15 1 15
16 2 1
17 2 2
18 2 3
19 2 4
20 2 5
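One caveat: group_by(x) counts occurrences by value across the whole table, whereas the original loop restarts the counter at every change of x. The two agree only when equal values are adjacent, which is why the arrange(x) step matters if your data are not already sorted.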

bind_rows to each group of tibble

Consider the following two tibbles:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
So a and b have the same columns and b has an additional column called id.
I want to do the following: group b by id and then add tibble a on top of each group.
So the output should look like this:
# A tibble: 10 x 3
id time value
<chr> <int> <int>
1 a -1 100
2 a 0 200
3 a 1 1
4 a 2 2
5 a 3 3
6 b -1 100
7 b 0 200
8 b 1 4
9 b 2 5
10 b 3 6
Of course there are multiple workarounds to achieve this (like loops for example). But in my case I have a large number of IDs and a very large number of columns.
I would be thankful if anyone could point me in the direction of a solution within the tidyverse.
Thank you
We can expand the data frame a with id from b and then bind_rows them together.
library(tidyverse)
a2 <- expand(a, id = b$id, nesting(time, value))
b2 <- bind_rows(a2, b) %>% arrange(id, time)
b2
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
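Here nesting(time, value) keeps a's existing (time, value) row combinations together rather than crossing time with value, so each distinct id simply gets a full copy of a's two rows.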
split from base R will divide a data frame into a list of subsets based on an index.
b %>%
  split(b[["id"]]) %>%
  lapply(bind_rows, a) %>%
  lapply(select, -"id") %>%
  bind_rows(.id = "id")
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a 1 1
# 2 a 2 2
# 3 a 3 3
# 4 a -1 100
# 5 a 0 200
# 6 b 1 4
# 7 b 2 5
# 8 b 3 6
# 9 b -1 100
# 10 b 0 200
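Note that lapply(bind_rows, a) appends a below each subset's own rows, which is why a's rows come last within each id here; adding arrange(id, time) at the end restores the order from the expected output.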
An idea (via base R) is to split your data frame, prepend a (with the id attached) to each subset, and rbind everything back together, i.e.
df = do.call(rbind, lapply(split(b, b$id), function(i)
  rbind(data.frame(id = i$id[1], a), i)))
which gives
id time value
a.1 a -1 100
a.2 a 0 200
a.3 a 1 1
a.4 a 2 2
a.5 a 3 3
b.1 b -1 100
b.2 b 0 200
b.3 b 1 4
b.4 b 2 5
b.5 b 3 6
NOTE: You can remove the rownames by simply calling rownames(df) <- NULL
We can nest and add the relevant rows to each nested item:
library(tidyverse)
b %>%
  nest(-id) %>%
  mutate(data = map(data, ~ bind_rows(a, .x))) %>%
  unnest()
# # A tibble: 10 x 3
# id time value
# <chr> <dbl> <dbl>
# 1 a -1 100
# 2 a 0 200
# 3 a 1 1
# 4 a 2 2
# 5 a 3 3
# 6 b -1 100
# 7 b 0 200
# 8 b 1 4
# 9 b 2 5
# 10 b 3 6
Maybe not the most efficient way, but easy to follow:
library(tidyverse)
a <- tibble(time = c(-1, 0), value = c(100, 200))
b <- tibble(id = rep(letters[1:2], each = 3), time = rep(1:3, 2), value = 1:6)
a.a <- a %>% add_column(id = rep("a", nrow(a)))
a.b <- a %>% add_column(id = rep("b", nrow(a)))
joint <- bind_rows(b, a.a, a.b)
(joint <- arrange(joint, id))
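With many ids, writing one add_column() call per id does not scale. A sketch of the same idea using tidyr::crossing() to stamp every distinct id onto a in one step (my sketch; it assumes every id in b should receive a copy of a):
library(tidyverse)
# cross every distinct id with all rows of a, then append b and sort
bind_rows(crossing(id = unique(b$id), a), b) %>%
  arrange(id, time)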

Repeating rows of data.frame in dplyr [duplicate]

This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Closed 2 years ago.
I am having trouble repeating rows of my real data using dplyr. There is already another post here, repeat-rows-of-a-data-frame, but with no solution for dplyr.
Here I just wonder what the solution would be for dplyr,
but my attempt failed with this error:
Error: wrong result size (16), expected 4 or 1
library(dplyr)
df <- data.frame(column = letters[1:4])
df_rep <- df %>%
  mutate(column = rep(column, each = 4))
Expected output
>df_rep
column
#a
#a
#a
#a
#b
#b
#b
#b
#*
#*
#*
Using the uncount function will solve this problem as well. The count column indicates how often a row should be repeated.
library(tidyverse)
df <- tibble(letters = letters[1:4])
df
# A tibble: 4 x 1
letters
<chr>
1 a
2 b
3 c
4 d
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  uncount(count)
# A tibble: 11 x 1
letters
<chr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
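(If you need to keep the count column, uncount() takes a .remove = FALSE argument, and .id = "n" adds a counter for the copies made of each original row.)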
I was looking for a similar (but slightly different) solution. Posting here in case it's useful to anyone else.
In my case, I needed a more general solution that allows each letter to be repeated an arbitrary number of times. Here's what I came up with:
library(tidyverse)
df <- data.frame(letters = letters[1:4])
df
> df
letters
1 a
2 b
3 c
4 d
Let's say I want 2 A's, 3 B's, 2 C's and 4 D's:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq(1:count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <int>
1 a 1
2 a 2
3 b 1
4 b 2
5 b 3
6 c 1
7 c 2
8 d 1
9 d 2
10 d 3
11 d 4
If you don't want to keep the count column:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq(1:count)) %>%
  select(letters)
# A tibble: 11 x 1
# Groups: letters [4]
letters
<fctr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
If you want the count to reflect the number of times each letter is repeated:
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  expand(count = seq(1:count)) %>%
  mutate(count = max(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <dbl>
1 a 2
2 a 2
3 b 3
4 b 3
5 b 3
6 c 2
7 c 2
8 d 4
9 d 4
10 d 4
11 d 4
This is rife with peril if the data.frame has other columns (there, I said it!), but the do block will allow you to generate a derived data.frame within a dplyr pipe (though, ceci n'est pas un pipe):
library(dplyr)
df <- data.frame(column = letters[1:4], stringsAsFactors = FALSE)
df %>%
  do(data.frame(column = rep(.$column, each = 4), stringsAsFactors = FALSE))
# column
# 1 a
# 2 a
# 3 a
# 4 a
# 5 b
# 6 b
# 7 b
# 8 b
# 9 c
# 10 c
# 11 c
# 12 c
# 13 d
# 14 d
# 15 d
# 16 d
As #Frank suggested, a much better alternative could be
df %>% slice(rep(1:n(), each=4))
I did a quick benchmark to show that uncount() is a lot faster than expand():
# for the pipe
library(magrittr)
# create some test data
df_test <-
  tibble::tibble(
    letter = letters,
    row_count = sample(1:10, size = 26, replace = TRUE)
  )
# benchmark
bench <- microbenchmark::microbenchmark(
  expand = df_test %>%
    dplyr::group_by(letter) %>%
    tidyr::expand(row_count = seq(1:row_count)),
  uncount = df_test %>%
    tidyr::uncount(row_count)
)
# plot the benchmark
ggplot2::autoplot(bench)
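The gap is not surprising: uncount() essentially boils down to one vectorised rep() over the row indices, while the expand() version has to build a sequence group by group.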

R, dplyr: cumulative version of n_distinct

I have a dataframe as follows. It is ordered by column time.
Input -
df = data.frame(time = 1:20,
                grp = sort(rep(1:5, 4)),
                var1 = rep(c('A','B'), 10))
head(df,10)
time grp var1
1 1 1 A
2 2 1 B
3 3 1 A
4 4 1 B
5 5 2 A
6 6 2 B
7 7 2 A
8 8 2 B
9 9 3 A
10 10 3 B
I want to create another variable var2 which counts the number of distinct var1 values seen so far, i.e. up to that point in time, within each group grp. This is a little different from what I'd get if I were to use n_distinct.
Expected output -
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
I want to create a function, say cum_n_distinct, for this and use it as follows:
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))
A dplyr solution inspired by #akrun's answer.
The logic is basically to set the first occurrence of each unique value of var1 to 1 and the rest to 0 within each group grp, and then apply cumsum:
df = df %>%
  arrange(time) %>%
  group_by(grp, var1) %>%
  mutate(var_temp = ifelse(row_number() == 1, 1, 0)) %>%
  group_by(grp) %>%
  mutate(var2 = cumsum(var_temp)) %>%
  select(-var_temp)
head(df,10)
Source: local data frame [10 x 4]
Groups: grp
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
Assuming stuff is ordered by time already, first define a cumulative distinct function:
dist_cum <- function(var)
  sapply(seq_along(var), function(x) length(unique(head(var, x))))
Then a base solution that uses ave to create groups (note: this assumes var1 is a factor), and then applies our function to each group:
transform(df, var2=ave(as.integer(var1), grp, FUN=dist_cum))
A data.table solution, basically doing the same thing:
library(data.table)
(data.table(df)[, var2:=dist_cum(var1), by=grp])
And dplyr, again, same thing:
library(dplyr)
df %>% group_by(grp) %>% mutate(var2=dist_cum(var1))
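As an aside (my sketch, not from the answer above): dist_cum rescans the head of the vector at every position, which is quadratic in the group size. duplicated() flags every element after a value's first occurrence, so the cumulative sum of its negation gives the same running distinct count in linear time:
library(dplyr)
df %>%
  arrange(time) %>%
  group_by(grp) %>%
  # a first occurrence adds 1 to the running count, a repeat adds 0
  mutate(var2 = cumsum(!duplicated(var1)))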
Update: with your new dataset, an approach in base R:
df$var2 <- unlist(lapply(split(df, df$grp), function(x) {
  x$var2 <- 0
  indx <- match(unique(x$var1), x$var1)
  x$var2[indx] <- 1
  cumsum(x$var2)
}))
head(df,7)
# time grp var1 var2
# 1 1 1 A 1
# 2 2 1 B 2
# 3 3 1 A 2
# 4 4 1 B 2
# 5 5 2 A 1
# 6 6 2 B 2
# 7 7 2 A 2
Here's another solution using data.table that's pretty quick.
Generic Function
cum_n_distinct <- function(x, na.include = TRUE){
# Given a vector x, returns a corresponding vector y
# where the ith element of y gives the number of unique
# elements observed up to and including index i
# if na.include = TRUE (default) NA is counted as an
# additional unique element, otherwise it's essentially ignored
temp <- data.table(x, idx = seq_along(x))
firsts <- temp[temp[, .I[1L], by = x]$V1]
if(na.include == FALSE) firsts <- firsts[!is.na(x)]
y <- rep(0, times = length(x))
y[firsts$idx] <- 1
y <- cumsum(y)
return(y)
}
Example Use
cum_n_distinct(c(5,10,10,15,5)) # 1 2 2 3 3
cum_n_distinct(c(5,NA,10,15,5)) # 1 2 3 4 4
cum_n_distinct(c(5,NA,10,15,5), na.include = FALSE) # 1 1 2 3 3
Solution To Your Question
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))
