Calculating distance between 2 elements of a data frame - r

I have a data frame that looks like this:
library(dplyr)
size_df <- tibble(size_chr = c("XS", "S", "M", "L", "XL", "1XL", "2XL", "3XL", "4XL", "5XL", "6XL"),
size_min = c(0,36,39,42,45,48,52,56,60,64,66),
size_max = c(36,39,42,45,48,52,56,60,64,66,70))
For any given number less than 70, I want to find the two sizes that it lies between, and its distance to each of them (normalised to between 0 and 1).
For example:
input <- 37.2
# S 0.6
# M 0.4
input <- 48
# XL 1
input <- 68
# 5XL 0.5
# 6XL 0.5

This is the perfect case for findInterval(). We'll create a vector of the breaks between categories and use those to calculate scaling factors.
size_breaks <- c(size_df[["size_min"]], max(size_df[["size_max"]]))
size_breaks
# [1] 0 36 39 42 45 48 52 56 60 64 66 70
size_spans <- diff(size_breaks)
size_scales <- 1 / size_spans
size_scales
# [1] 0.02777778 0.33333333 0.33333333 0.33333333 0.33333333 0.25000000 0.25000000
# [8] 0.25000000 0.25000000 0.50000000 0.25000000
findInterval() will give us the index of the lower bound. The upper bound is just that index + 1.
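A quick check of what findInterval() returns with these breaks:
findInterval(37.2, size_breaks)
# [1] 2  (37.2 falls in [36, 39), i.e. size "S")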
neighbor_distances <- function(x) {
  lower <- findInterval(x, size_breaks)
  neighbors <- c(lower, lower + 1)
  distances <- abs(x - size_breaks[neighbors]) * size_scales[lower]
  tibble(
    size_chr = size_df[["size_chr"]][neighbors],
    distance = distances
  )
}
It works well for your first example.
neighbor_distances(37.2)
# # A tibble: 2 x 2
# size_chr distance
# <chr> <dbl>
# 1 S 0.4
# 2 M 0.600
The second example gives two rows instead of just one, but that can be handled with extra logic in the function (a sketch of one option follows the output below). I left that logic out to keep things simple.
neighbor_distances(48)
# # A tibble: 2 x 2
# size_chr distance
# <chr> <dbl>
# 1 1XL 0
# 2 2XL 1
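Here is one possible version of that extra logic, sketched under the assumption that an input landing exactly on a break should return just the single size whose range ends there (matching the expected output for 48):
neighbor_distances2 <- function(x) {
  lower <- findInterval(x, size_breaks)
  if (lower > 1 && x == size_breaks[lower]) {
    # x sits exactly on a boundary: report only the size that ends there
    return(tibble(size_chr = size_df[["size_chr"]][lower - 1], distance = 1))
  }
  neighbors <- c(lower, lower + 1)
  distances <- abs(x - size_breaks[neighbors]) * size_scales[lower]
  tibble(size_chr = size_df[["size_chr"]][neighbors], distance = distances)
}
neighbor_distances2(48)
# # A tibble: 1 x 2
# # size_chr distance
# # 1 XL 1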
It gives a different answer for your third example, but I don't know why you expect a number to be compared to a size category smaller than the lower bound.
neighbor_distances(68)
# # A tibble: 2 x 2
# size_chr distance
# <chr> <dbl>
# 1 6XL 0.5
# 2 NA 0.5

A base R alternative that indexes the two bounding rows directly (shown here with input <- 37.2 from the first example):
input <- 37.2
INDS = c(max(1, tail(which(size_df$size_min < input), 1)),
         min(NROW(size_df), 1 + head(which(size_df$size_max > input), 1)))
size_df$size_chr[INDS]
# [1] "S" "M"
DIST = c(abs(size_df$size_min[INDS[1]] - input),
         abs(size_df$size_max[INDS[2]] - input))
DIST / sum(DIST)
# [1] 0.2 0.8

Related

Cumsum but with maximum number of datapoints

I'm looking to create a hybrid of cumsum() and TTR::runSum(), where cumsum() runs up until a pre-specified number of datapoints, at which point it acts more like runSum().
For example:
library(TTR)
data <- rep(1:3,2)
cumsum <- cumsum(data)
runSum <- runSum(data, n = 3)
DesiredResult <- ifelse(is.na(runSum),cumsum,runSum)
Is there a way to get to DesiredResult that doesn't require finagling with NAs?
That is what the partial=TRUE argument to rollapplyr does. Here we show this with sum and also with sd and IQR. (Note that the sd of one value is NA and we chose IQR since it is a measure of spread that can be calculated for scalars although it is always 0 in that case.)
library(zoo)
rollapplyr(data, 3, sum, partial = TRUE)
## [1] 1 3 6 6 6 6
rollapplyr(data, 3, sd, partial = TRUE)
## [1] NA 0.7071068 1.0000000 1.0000000 1.0000000 1.0000000
rollapplyr(data, 3, IQR, partial = TRUE)
## [1] 0.0 0.5 1.0 1.0 1.0 1.0
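As a quick sanity check, the partial rolling sum reproduces the DesiredResult constructed in the question:
library(zoo)
library(TTR)
data <- rep(1:3, 2)
desired <- ifelse(is.na(runSum(data, n = 3)), cumsum(data), runSum(data, n = 3))
# both are 1 3 6 6 6 6, so this should be TRUE
all.equal(as.numeric(rollapplyr(data, 3, sum, partial = TRUE)), as.numeric(desired))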
Here are three alternatives.
n <- 3
rowSums(embed(c(rep(0, n - 1), data), n)) # base R
# [1] 1 3 6 6 6 6
library(TTR)
runSum(c(rep(0, n - 1), data), n = n)
# [1] NA NA 1 3 6 6 6 6 # na.omit fixes the beginning
library(zoo)
rollsum(c(rep(0, n - 1), data), k = 3, align = "right")
# [1] 1 3 6 6 6 6

How do you write a matrix using a "for" loop in R?

So for data evaluation that I am doing at the moment I want to write a matrix using a "for" loop.
Let's say I have random numbers between 0 and 100:
E <- runif(100, 0, 100)
t <- 0  # start
for (t in 0:90) {
  D <- length(E[E >= t, E < (t + 10)])
  t = t + 10
}
So what I want to do is write "D" into a matrix at each iteration with "t" in one column and "D" in the other.
I've heard that you should avoid loops in R, but I don't know an alternative.
Rather than using a loop, you can do this with sapply, which operates on each item in a sequence and stores the result in a vector, and then cbind to create the matrix:
E <- runif(100, 0, 100)
t <- seq(0, 90, 10)
D <- sapply(t, function(ti) {
  sum(E >= ti & E < (ti + 10))
})
cbind(t, D)
#> t D
#> [1,] 0 11
#> [2,] 10 12
#> [3,] 20 14
#> [4,] 30 11
#> [5,] 40 9
#> [6,] 50 12
#> [7,] 60 7
#> [8,] 70 7
#> [9,] 80 6
#> [10,] 90 11
Note that I also used sum(E >= ti & E < (ti + 10)) rather than length(E[E >= ti & E < (ti + 10)]), as a slightly shorter way of counting the items in E that are at least ti but less than ti + 10.
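(The reason sum() works as a counter here: a logical vector is coerced to 0s and 1s, so summing it counts the TRUEs.)
sum(c(5, 15, 25) >= 10 & c(5, 15, 25) < 20)
#> [1] 1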
It seems that you want to bin your variable into categories - this is exactly what cut does:
E <- runif(100, 0, 100)
table(cut(E, breaks = seq(0,100,10), right=FALSE))
#> [0,10) [10,20) [20,30) [30,40) [40,50) [50,60) [60,70) [70,80) [80,90)
#> 10 10 7 10 8 10 12 11 10
#> [90,100)
#> 12
If you don't want to see the category labels, remove the table() call; if you want the result in a "tabular" format, wrap it in as.matrix(), as shown below.
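For example, wrapping the same call in as.matrix() gives the counts as a one-column matrix:
as.matrix(table(cut(E, breaks = seq(0, 100, 10), right = FALSE)))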
Please note that if you are doing it for plotting purposes, then both hist and ggplot will do it automatically for you:
hist(E, breaks = seq(0,100,10))
library("ggplot2")
ggplot(data.frame(var=E), aes(x=var)) + geom_histogram(binwidth = 10)

Counting values within levels

I have a set of levels in R that I generate with cut, e.g. say fractional values between 0 and 1, broken down into 0.1 bins:
> frac <- cut(c(0, 1), breaks=10)
> levels(frac)
[1] "(-0.001,0.1]" "(0.1,0.2]" "(0.2,0.3]" "(0.3,0.4]" "(0.4,0.5]"
[6] "(0.5,0.6]" "(0.6,0.7]" "(0.7,0.8]" "(0.8,0.9]" "(0.9,1]"
Given a vector v containing continuous values between [0.0, 1.0], how do I count the frequency of elements in v that fall within each level in levels(frac)?
I could customize the number of breaks and/or the interval from which I am making levels, so I'm looking for a way to do this with standard R commands, so that I can build a two-column data frame: one column for the levels as factors, and the second column for a fractional or percentage value of total elements in v over the level.
Note: The following does not work:
> table(frac)
frac
(-0.001,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6]
1 0 0 0 0 0
(0.6,0.7] (0.7,0.8] (0.8,0.9] (0.9,1]
0 0 0 1
If I use cut on v directly, then I do not get the same levels when I run cut on different vectors: the range of values (their minimum and maximum) differs between arbitrary vectors, so even with the same number of breaks, the level intervals will not be the same.
My goal is to take different vectors and bin them to the same set of levels. Hopefully this helps clarify my question. Thanks for any assistance.
Amend frac to actually represent your desired intervals, and then use the table function:
x = runif(100) # For example.
frac = cut(x, breaks = seq(0, 1, 0.1))
table(frac)
Result:
frac
(0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
14 9 8 10 8 12 7 7
(0.8,0.9] (0.9,1]
16 9
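From there, the two-column data frame described in the question (each level and the fraction of elements falling in it) is one step away:
data.frame(level = levels(frac), fraction = as.vector(table(frac)) / length(x))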
Introduce the extremes c(0, 1) into v, then use the same cut:
library(dplyr)
#dummy data
set.seed(1)
v <- round(runif(7), 2)
#result
data.frame(v,
           vFrac = cut(c(0, 1, v), breaks = 10)[-c(1, 2)]) %>%
  group_by(vFrac) %>%
  mutate(vFreq = n())
# Source: local data frame [7 x 3]
# Groups: vFrac [6]
#
# v vFrac vFreq
# <dbl> <fctr> <int>
# 1 0.27 (0.2,0.3] 1
# 2 0.37 (0.3,0.4] 1
# 3 0.57 (0.5,0.6] 1
# 4 0.91 (0.9,1] 2
# 5 0.20 (0.1,0.2] 1
# 6 0.90 (0.8,0.9] 1
# 7 0.94 (0.9,1] 2
Another option is hist() with plot = FALSE, building the range labels by hand:
frac = seq(0, 1, by = 0.1)
ranges = paste(head(frac,-1), frac[-1], sep=" - ")
freq = hist(v, breaks=frac, include.lowest=TRUE, plot=FALSE)
data.frame(range = ranges, frequency = freq$counts)
Use findInterval instead of cut:
v <- data.frame(v = runif(100, 0, 1))
library(plyr)
v$x <- findInterval(v$v, seq(0, 1, by = 0.1)) * 0.1
ddply(v, .(x), summarize, n = length(x))
A manual alternative with sapply(), counting the values that fall in each interval:
frac = seq(0, 1, 0.1)
set.seed(42); v = rnorm(10, 0.5, 0.2)
sapply(1:(length(frac) - 1), function(i) sum(frac[i] < v & frac[i + 1] >= v))
#[1] 0 0 0 1 3 2 1 1 1 1

Split a list every nth element [duplicate]

I need to split a sorted vector of unknown length in R into "top 10%, ..., bottom 10%" groups.
So, for example if I have vector <- order(c(1:98928)), I want to split it into 10 different vectors, each one representing approximately 10% of the total length.
I've tried using split <- split(vector, 1:10), but since I don't know the length of the vector, I get this error when the length is not a multiple of 10:
data length is not a multiple of split variable
And even when the length is a multiple and the function works, split() does not keep the order of my original vector. This is what split gives:
split(c(1:10) , 1:2)
$`1`
[1] 1 3 5 7 9
$`2`
[1] 2 4 6 8 10
And this is what I want:
$`1`
[1] 1 2 3 4 5
$`2`
[1] 6 7 8 9 10
I'm a newbie in R and I've been trying lots of things without success. Does anyone know how to do this?
Problem statement
Break a sorted vector x every 10% into 10 chunks.
Note there are two interpretations of this:
Cutting by vector index:
split(x, floor(10 * seq.int(0, length(x) - 1) / length(x)))
Cutting by vector values (say, quantiles):
split(x, cut(x, quantile(x, probs = 0:10 / 10, names = FALSE), include.lowest = TRUE))
In the following, I will make demonstration using data:
set.seed(0); x <- sort(round(rnorm(23),1))
In particular, our example data are normally distributed rather than uniformly distributed, so cutting by index and cutting by value give substantially different results.
Result
cutting by index
#$`0`
#[1] -1.5 -1.2 -1.1
#
#$`1`
#[1] -0.9 -0.9
#
#$`2`
#[1] -0.8 -0.4
#
#$`3`
#[1] -0.3 -0.3 -0.3
#
#$`4`
#[1] -0.3 -0.2
#
#$`5`
#[1] 0.0 0.1
#
#$`6`
#[1] 0.3 0.4 0.4
#
#$`7`
#[1] 0.4 0.8
#
#$`8`
#[1] 1.3 1.3
#
#$`9`
#[1] 1.3 2.4
cutting by quantile
#$`[-1.5,-1.06]`
#[1] -1.5 -1.2 -1.1
#
#$`(-1.06,-0.86]`
#[1] -0.9 -0.9
#
#$`(-0.86,-0.34]`
#[1] -0.8 -0.4
#
#$`(-0.34,-0.3]`
#[1] -0.3 -0.3 -0.3 -0.3
#
#$`(-0.3,-0.2]`
#[1] -0.2
#
#$`(-0.2,0.14]`
#[1] 0.0 0.1
#
#$`(0.14,0.4]`
#[1] 0.3 0.4 0.4 0.4
#
#$`(0.4,0.64]`
#numeric(0)
#
#$`(0.64,1.3]`
#[1] 0.8 1.3 1.3 1.3
#
#$`(1.3,2.4]`
#[1] 2.4
If you have your vector as a column (named vec) in a data frame, you can simply do something like this:
df$new_vec <- cut(df$vec, breaks = quantile(df$vec, seq(0, 1, 0.1)),
                  labels = 1:10, include.lowest = TRUE)
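For instance, with a hypothetical df (made-up data, purely for illustration):
set.seed(1)
df <- data.frame(vec = runif(50))
df$new_vec <- cut(df$vec, breaks = quantile(df$vec, seq(0, 1, 0.1)),
                  labels = 1:10, include.lowest = TRUE)
table(df$new_vec)  # roughly 5 values per decile bin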
x <- 1:98
y <- split(x, ((seq(length(x)) - 1) * 10) %/% length(x) + 1)
Explanation:
seq(length(x))                                   # 1..98
seq(length(x)) - 1                               # 0..97
(seq(length(x)) - 1) * 10                        # 0, 10, ..., 970
((seq(length(x)) - 1) * 10) %/% length(x)        # (0, ..., 0, 1, ..., 1, ..., 9, ..., 9), each value for about 10% of the indices
((seq(length(x)) - 1) * 10) %/% length(x) + 1    # (1, ..., 1, 2, ..., 2, ..., 10, ..., 10)
split(x, ((seq(length(x)) - 1) * 10) %/% length(x) + 1)  # splits the first ~10% of numbers into group 1, the next ~10% into group 2, etc.
If the vector is sorted, you can just create a grouping variable of the same length as the vector and split on it. In a real case it takes a little more effort, since the length of the vector may not be a multiple of 10, but for your toy example you can do:
x = 1:10
n = 2
split(x, rep(1:n, each = length(x)/n))
# $`1`
# [1] 1 2 3 4 5
# $`2`
# [1] 6 7 8 9 10
A real case example, where the vector's length is not a multiple of the number of groups:
vec = 1:13
n = 3
split(vec, sort(seq_along(vec)%%n))
# $`0`
# [1] 1 2 3 4
# $`1`
# [1] 5 6 7 8 9
# $`2`
# [1] 10 11 12 13
You can use the sum() function to determine the positions at which to extract a section of the vector: compare the values against a percentile value with a logical operator (greater than, >, or less than, <), then sum the result, since sum() counts TRUE as 1 and FALSE as 0. It is important to order the elements of the vector first.
# A vector with numbers from 1 to 100
data <- seq(1,100)
# 25th percentile value and 75th percentile value
ps1 <- quantile(data,probs=c(0.25))
ps2 <- quantile(data,probs=c(0.75))
# Positions to split
position1 <- sum(data<=ps1)
position2 <- sum(data<=ps2)
# Split with positions in a sorted data
sort(data)[position1:position2]
The result is
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
In the same way, you can divide an ordered vector into 10 parts by specifying the percentiles:
# A vector with numbers from 1 to 100
data <- seq(1,100)
# sub vectors based on percentiles
subvectors <- quantile(data,probs=c(0.10,0.20,0.30,0.40,0.50,0.60,0.70,0.80,0.90,1))
for (i in 1:(length(subvectors) - 1)) {
  # Percentile values
  ps1 <- subvectors[i]
  ps2 <- subvectors[i + 1]
  # Positions to split
  position1 <- sum(data <= ps1)
  position2 <- sum(data <= ps2)
  # Split with positions in the sorted data
  print(sort(data)[position1:position2])
}
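The same decile slicing can also be written without the explicit loop, reusing the quantile()/cut() idea from an earlier answer:
data <- seq(1, 100)
split(sort(data), cut(sort(data), quantile(data, seq(0, 1, 0.1)),
                      include.lowest = TRUE))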

How to select/find coordinates within a distance from a list (X/Y) using R

I have a data frame with list of X/Y locations (>2000 rows). What I want is to select or find all the rows/locations based on a max distance. For example, from the data frame select all the locations that are between 1-100 km from each other. Any suggestions on how to do this?
You need to somehow determine the distance between each pair of rows.
The simplest way is with a corresponding distance matrix:
# assuming `thresh` is your threshold
thresh <- 10
# create some sample data
library(data.table)
set.seed(123)
DT <- data.table(X = sample(-10:10, 5, TRUE), Y = sample(-10:10, 5, TRUE))
# create the distance matrix
distTable <- matrix(apply(createTable(DT), 1, distance), nrow = nrow(DT))
# remove the lower triangle since we have symmetry (we don't want duplicates)
distTable[lower.tri(distTable)] <- NA
# show which rows are above the threshold
pairedRows <- which(distTable >= thresh, arr.ind = TRUE)
colnames(pairedRows) <- c("RowA", "RowB")  # clean up the names
Starting with:
> DT
X Y
1: -4 -10
2: 6 1
3: -2 8
4: 8 1
5: 9 -1
We get:
> pairedRows
RowA RowB
[1,] 1 2
[2,] 1 3
[3,] 2 3
[4,] 1 4
[5,] 3 4
[6,] 1 5
[7,] 3 5
These are the two functions used for creating the distance matrix
# pair up all of the rows
createTable <- function(DT)
  expand.grid(apply(DT, 1, list), apply(DT, 1, list))
# simple Cartesian/Pythagorean distance
distance <- function(CoordPair)
  sqrt(sum((CoordPair[[2]][[1]] - CoordPair[[1]][[1]])^2, na.rm = FALSE))
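As a side note, if plain Euclidean distance is all that's needed (an assumption), base R's dist() builds the same matrix more compactly:
distTable2 <- as.matrix(dist(DT))                     # full pairwise Euclidean distances
distTable2[lower.tri(distTable2, diag = TRUE)] <- NA  # keep each pair once
which(distTable2 >= thresh, arr.ind = TRUE)           # same row pairs as pairedRows above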
I'm not entirely clear from your question, but assuming you mean you want to take each row of coordinates and find all the other rows whose coordinates fall within a certain distance:
# Create data set for example
set.seed(42)
x <- sample(-100:100, 10)
set.seed(456)
y <- sample(-100:100, 10)
coords <- data.frame(
  "x" = x,
  "y" = y)
# Loop through all rows
lapply(1:nrow(coords), function(i) {
  dis <- sqrt(
    (coords[i, "x"] - coords[, "x"])^2 +  # insert your preferred
    (coords[i, "y"] - coords[, "y"])^2    # distance calculation here
  )
  names(dis) <- 1:nrow(coords)  # replace this with an index or row names if you have them
  dis[dis > 0 & dis <= 100]     # change numbers to preferred threshold
})
[[1]]
2 6 7 9 10
25.31798 95.01579 40.01250 30.87070 73.75636
[[2]]
1 6 7 9 10
25.317978 89.022469 51.107729 9.486833 60.539243
[[3]]
5 6 8
70.71068 91.78780 94.86833
[[4]]
5 10
40.16217 99.32774
[[5]]
3 4 6 10
70.71068 40.16217 93.40771 82.49242
[[6]]
1 2 3 5 7 8 9 10
95.01579 89.02247 91.78780 93.40771 64.53681 75.66373 97.08244 34.92850
[[7]]
1 2 6 9 10
40.01250 51.10773 64.53681 60.41523 57.55867
[[8]]
3 6
94.86833 75.66373
[[9]]
1 2 6 7 10
30.870698 9.486833 97.082439 60.415230 67.119297
[[10]]
1 2 4 5 6 7 9
73.75636 60.53924 99.32774 82.49242 34.92850 57.55867 67.11930
