Grouping locations based on a 50 mile radius in R

Hi, I have a dataset and I am trying to assign a cluster id to locations based on a 50-mile radius. Here is the structure of the dataset:
g_lat<- c(45.52306, 40.26719, 34.05223, 37.38605, 37.77493)
g_long<- c(-122.67648,-86.13490, -118.24368, -122.08385, -122.41942)
df<- data.frame(g_lat, g_long)
I want to create a cluster id that groups locations that are within a 50-mile radius of each other. How can I achieve this? Thanks so much. Below is the expected output.
g_lat     g_long      clusterid
45.52306  -122.67648  1
40.26719  -86.13490   2
34.05223  -118.24368  3
37.38605  -122.08385  4
37.77493  -122.41942  4

g_lat<- c(45.52306, 40.26719, 34.05223, 37.38605, 37.77493)
g_long<- c(-122.67648,-86.13490, -118.24368, -122.08385, -122.41942)
df<- data.frame(point = c(1:5), longitude = g_long, latitude = g_lat)
library(sf)
my.sf.point <- st_as_sf(x = df,
                        coords = c("longitude", "latitude"),
                        crs = "+proj=longlat +datum=WGS84")
# distance matrix in meters (st_distance returns meters for geographic coordinates)
st_distance(my.sf.point)
# which points are within 50 miles (~80467.2 meters)
l <- st_is_within_distance(my.sf.point, dist = 80467.2 )
l
# Sparse geometry binary predicate list of length 5, where the predicate was `is_within_distance'
# 1: 1
# 2: 2
# 3: 3
# 4: 4, 5
# 5: 4, 5
df$within_50 <- rowSums(as.matrix(l)) - 1 # count of other points within 50 miles (subtract self)
df
# point longitude latitude within_50
# 1 1 -122.6765 45.52306 0
# 2 2 -86.1349 40.26719 0
# 3 3 -118.2437 34.05223 0
# 4 4 -122.0838 37.38605 1
# 5 5 -122.4194 37.77493 1
m <- as.matrix(l)
colnames(m) <- c(1:nrow(df))
rownames(m) <- c(1:nrow(df))
df$points_within_50 <- apply( m, 1, function(u) paste( names(which(u)), collapse="," ) )
df$clusterid <- dplyr::group_indices(df, points_within_50)
# point longitude latitude within_50 points_within_50 clusterid
# 1 1 -122.6765 45.52306 0 1 1
# 2 2 -86.1349 40.26719 0 2 2
# 3 3 -118.2437 34.05223 0 3 3
# 4 4 -122.0838 37.38605 1 4,5 4
# 5 5 -122.4194 37.77493 1 4,5 4
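Note that grouping on the points_within_50 string only works when neighbour sets are identical; if point A is within 50 miles of B, and B of C, but A is not within 50 miles of C, they land in different groups. A more robust sketch (my addition, not part of the original answer; assumes the igraph package is available) treats the sparse predicate list as an adjacency matrix and takes connected components:
library(igraph)
adj <- as.matrix(l) * 1 # logical neighbour matrix from st_is_within_distance, as 0/1
g <- graph_from_adjacency_matrix(adj, mode = "undirected")
df$clusterid2 <- components(g)$membership
df$clusterid2
# [1] 1 2 3 4 4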

You can create a 2D matrix with the distances between the locations. The geosphere package has a function that does the heavy lifting for you.
library(geosphere)
library(magrittr)
g_lat <- c(45.52306, 40.26719, 34.05223, 37.38605, 37.77493)
g_long <- c(-122.67648,-86.13490, -118.24368, -122.08385, -122.41942)
m <- cbind(g_long, g_lat)
(matrix <- distm(m) / 1609.34) # distm() returns meters; divide by 1609.34 to get miles
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.0000 1872.882 825.4595 562.3847 534.8927
#> [2,] 1872.8818 0.000 1812.5862 1936.5786 1946.4373
#> [3,] 825.4595 1812.586 0.0000 315.2862 347.3751
#> [4,] 562.3847 1936.579 315.2862 0.0000 32.5345
#> [5,] 534.8927 1946.437 347.3751 32.5345 0.0000
matrix < 50
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] TRUE FALSE FALSE FALSE FALSE
#> [2,] FALSE TRUE FALSE FALSE FALSE
#> [3,] FALSE FALSE TRUE FALSE FALSE
#> [4,] FALSE FALSE FALSE TRUE TRUE
#> [5,] FALSE FALSE FALSE TRUE TRUE
colSums(matrix < 50)
#> [1] 1 1 1 2 2
Created on 2018-09-16 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
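To turn the mile-based distance matrix into the cluster id the question asked for, one option (a sketch of mine, not part of the original answer) is single-linkage hierarchical clustering cut at a height of 50 miles:
# single linkage merges points whose chain of pairwise distances stays under the cutoff
hc <- hclust(as.dist(matrix), method = "single")
cutree(hc, h = 50)
#> [1] 1 2 3 4 4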

Related

R: Recreating the Travelling Salesman Problem

I am working with the R programming language.
I am trying to recreate the travelling salesman problem: a salesman has to visit n cities exactly once, in such a way that the total distance travelled is minimized.
For this problem, I first created a dataset made of n = 6 cities (longitude, latitude):
set.seed(123)
data_1 = data.frame(id = c(1,2,3), long = rnorm(3, -74, 1 ), lat = rnorm(3, 40, 1 ))
data_2 = data.frame(id = c(4,5,6), long = rnorm(3, -78, 1 ), lat = rnorm(3, 42, 1 ))
final_data = rbind(data_1, data_2)
final_data
id long lat
1 1 -74.56048 40.07051
2 2 -74.23018 40.12929
3 3 -72.44129 41.71506
4 4 -77.53908 41.55434
5 5 -79.26506 43.22408
6 6 -78.68685 42.35981
For a given order of cities (e.g. 1,2,3,4,5,6), I created a function which determines the distance (based on the Euclidean Distance) between each successive pair of cities, and then calculates the total distance travelled:
distance <- function(long1, lat1, long2, lat2, long3, lat3, long4, lat4, long5, lat5, long6, lat6) {
  d1_2 = sqrt( (long1 - lat1)^2 + (long2 - lat2)^2 )
  d2_3 = sqrt( (long2 - lat2)^2 + (lat3 - long3)^2 )
  d3_4 = sqrt( (long3 - lat3)^2 + (long4 - lat4)^2 )
  d4_5 = sqrt( (long4 - lat4)^2 + (long5 - lat5)^2 )
  d5_6 = sqrt( (long5 - lat5)^2 + (long6 - lat6)^2 )
  return( d1_2 + d2_3 + d3_4 + d4_5 + d5_6 )
}
distance(final_data[1,2], final_data[1,3], final_data[2,2], final_data[2,3], final_data[3,2], final_data[3,3], final_data[4,2], final_data[4,3], final_data[5,2], final_data[5,3], final_data[6,2], final_data[6,3])
Then, I can randomize the order of the rows to obtain different routes and calculate the distance for each route:
#first route
rows <- sample(nrow(final_data))
route_1 <- final_data[rows, ]
> route_1
id long lat
1 1 -74.56048 40.07051
3 3 -72.44129 41.71506
4 4 -77.53908 41.55434
2 2 -74.23018 40.12929
6 6 -78.68685 42.35981
5 5 -79.26506 43.22408
distance(route_1[1,2], route_1[1,3], route_1[2,2], route_1[2,3], route_1[3,2], route_1[3,3], route_1[4,2], route_1[4,3], route_1[5,2], route_1[5,3], route_1[6,2], route_1[6,3])
[1] 830.5902
Next Route:
#second route
rows <- sample(nrow(final_data))
route_2 <- final_data[rows, ]
> route_2
id long lat
5 5 -79.26506 43.22408
4 4 -77.53908 41.55434
3 3 -72.44129 41.71506
2 2 -74.23018 40.12929
1 1 -74.56048 40.07051
6 6 -78.68685 42.35981
distance(route_2[1,2], route_2[1,3], route_2[2,2], route_2[2,3], route_2[3,2], route_2[3,3], route_2[4,2], route_2[4,3], route_2[5,2], route_2[5,3], route_2[6,2], route_2[6,3])
[1] 826.028
#etc
My Question: In the spirit of the Travelling Salesman Problem, I am trying to (ironically) show that what I am doing is extremely inefficient and will not work for more than 10 cities (i.e. take too long to run). In the case of 6 cities, can someone please show me how to calculate the distance for every possible route (6! = 720 routes) and calculate the time required to compute all these distances?
Here is what I know how to do so far:
Part 1: Generate All Possible Routes
library(combinat)
all_routes = permn(c(1,2,3,4,5,6))
> head(all_routes)
[[1]]
[1] 1 2 3 4 5 6
[[2]]
[1] 1 2 3 4 6 5
[[3]]
[1] 1 2 3 6 4 5
[[4]]
[1] 1 2 6 3 4 5
[[5]]
[1] 1 6 2 3 4 5
[[6]]
[1] 6 1 2 3 4 5
Part 2: Record the Time Required to Calculate a Single Route
start.time <- Sys.time()
distance(route_1[1,2], route_1[1,3], route_1[2,2], route_1[2,3], route_1[3,2], route_1[3,3], route_1[4,2], route_1[4,3], route_1[5,2], route_1[5,3], route_1[6,2], route_1[6,3])
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
Time difference of 0.003665924 secs
Can someone please show me how to put this all together?
Thanks!
The cumulative distance for all 6! routes for the given final_data can be calculated like this:
set.seed(123)
data_1 = data.frame(id = c(1,2,3), long = rnorm(3, -74, 1 ), lat = rnorm(3, 40, 1 ))
data_2 = data.frame(id = c(4,5,6), long = rnorm(3, -78, 1 ), lat = rnorm(3, 42, 1 ))
final_data = rbind(data_1, data_2)
N <- nrow(final_data) # just for repeated convenience
final_data
# id long lat
# 1 1 -74.56048 40.07051
# 2 2 -74.23018 40.12929
# 3 3 -72.44129 41.71506
# 4 4 -77.53908 41.55434
# 5 5 -79.26506 43.22408
# 6 6 -78.68685 42.35981
Calculate the distances between each pair of cities. I'm using distHaversine because you listed lat/lon, and part of me cringes seeing Cartesian distance calcs applied to that :-)
dists <- outer(seq_len(N), seq_len(N), function(a, b) {
  geosphere::distHaversine(final_data[a, 2:3], final_data[b, 2:3]) # Notes 1, 2
})
dists
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 0.00 28876.24 255554.4 300408.5 525566.9 429264.3
# [2,] 28876.24 0.00 231942.7 320616.0 541980.9 448013.6
# [3,] 255554.43 231942.67 0.0 424449.9 584761.5 521210.7
# [4,] 300408.47 320616.03 424449.9 0.0 233840.9 130640.9
# [5,] 525566.87 541980.93 584761.5 233840.9 0.0 107178.2
# [6,] 429264.34 448013.57 521210.7 130640.9 107178.2 0.0
(Units are in meters.)
Calculate the cumulative distance along each of the routes:
perms <- gtools::permutations(N, N)
nrow(perms)
# [1] 720
perms[c(1:4, 719:720),]
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 1 2 3 4 5 6
# [2,] 1 2 3 4 6 5
# [3,] 1 2 3 5 4 6
# [4,] 1 2 3 5 6 4
# [5,] 6 5 4 3 1 2
# [6,] 6 5 4 3 2 1
allroutes5 <- t(apply(perms, 1, function(route) {
  dists[cbind(route[-N], route[-1])]
}))
head(allroutes5)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 28876.24 231942.7 424449.9 233840.9 107178.2
# [2,] 28876.24 231942.7 424449.9 130640.9 107178.2
# [3,] 28876.24 231942.7 584761.5 233840.9 130640.9
# [4,] 28876.24 231942.7 584761.5 107178.2 130640.9
# [5,] 28876.24 231942.7 521210.7 130640.9 233840.9
# [6,] 28876.24 231942.7 521210.7 107178.2 233840.9
allroutes_total <- rowSums(allroutes5)
head(allroutes_total)
# [1] 1026287.9 923087.9 1210062.2 1083399.4 1146511.4 1123048.7
As confirmation of this, the first row of allroutes5 is the sequence of cities 1, 2, 3, 4, 5, and 6. Recalling dists above, from 1-2 is 28876; 2-3 is 231942; 3-4 is 424449; etc. Sum these up, and we have the total distance traveled over all cities in that route. allroutes_total holds the distances for all 720 possible routings (permutations).
min(allroutes_total)
# [1] 799046.4
which.min(allroutes_total)
# [1] 266
perms[which.min(allroutes_total),]
# [1] 3 2 1 4 6 5
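The question also asked for the time required to compute all of the distances. A minimal sketch using system.time (the output below is only indicative; timings are machine-dependent) wraps the full enumeration. Note the factorial growth of brute force: factorial(10) is already 3,628,800 routes.
system.time({
  perms <- gtools::permutations(N, N)
  allroutes_total <- rowSums(t(apply(perms, 1, function(route) {
    dists[cbind(route[-N], route[-1])]
  })))
})
#    user  system elapsed
#   (fractions of a second for N = 6; machine-dependent)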
Notes:
Using your formula, I was able to duplicate your distances:
dists <- outer(seq_len(N), seq_len(N), function(a, b) {
  sqrt((final_data[a, "long"] - final_data[a, "lat"])^2 + (final_data[b, "long"] - final_data[b, "lat"])^2)
})
dists
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 162.1127 161.9208 161.7774 165.2982 167.7613 166.7110
# [2,] 161.9208 161.7287 161.5852 165.1101 167.5759 166.5244
# [3,] 161.7774 161.5852 161.4415 164.9694 167.4373 166.3850
# [4,] 165.2982 165.1101 164.9694 168.4235 170.8415 169.8103
# [5,] 167.7613 167.5759 167.4373 170.8415 173.2258 172.2088
# [6,] 166.7110 166.5244 166.3850 169.8103 172.2088 171.1858
### first route
which(apply(perms, 1, identical, c(1L, 3L, 4L, 2L, 6L, 5L)))
# [1] 32
allroutes_total[32]
# [1] 830.5902
### second route
which(apply(perms, 1, identical, c(5L, 4L, 3L, 2L, 1L, 6L)))
# [1] 567
allroutes_total[567]
# [1] 826.028
And if you're curious, your second route was tied for fifth-shortest:
min(allroutes_total)
# [1] 826.0252
which.min(allroutes_total)
# [1] 561
perms[which.min(allroutes_total),]
# [1] 5 4 2 3 1 6
rank(allroutes_total)[567]
# [1] 5.5
I'm not sure that's the right distance calculation, though. I'd think the Euclidean distance should be:
dists <- outer(seq_len(N), seq_len(N), function(a, b) {
  sqrt((final_data[a, "long"] - final_data[b, "long"])^2 + (final_data[a, "lat"] - final_data[b, "lat"])^2)
})
dists
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 0.0000000 0.3354875 2.682444 3.327741 5.663758 4.718888
# [2,] 0.3354875 0.0000000 2.390565 3.602725 5.909975 4.983694
# [3,] 2.6824442 2.3905652 0.000000 5.100325 6.988631 6.278753
# [4,] 3.3277405 3.6027253 5.100325 0.000000 2.401467 1.402200
# [5,] 5.6637577 5.9099750 6.988631 2.401467 0.000000 1.039848
# [6,] 4.7188885 4.9836936 6.278753 1.402200 1.039848 0.000000
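As a sanity check (my addition), this corrected matrix should agree with base R's dist(), which computes pairwise Euclidean distances between rows:
dists2 <- as.matrix(dist(final_data[, c("long", "lat")]))
all.equal(dists, dists2, check.attributes = FALSE)
# [1] TRUE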

How to vectorize a function

I have a 5x4 matrix. I have created a function called fun1; fun1 uses a double for loop to loop through the matrix and uses the distance function to work out the distance between two rows. The final results matrix will be a 5x5 matrix.
I am struggling to convert fun1 into a vectorized function (no loops, only apply-family functions).
x =
     [,1] [,2] [,3] [,4]
[1,]    1    6   11   16
[2,]    2    7   12   17
[3,]    3    8   13   18
[4,]    4    9   14   19
[5,]    5   10   15   20
distance = function(a, b) {
  sqrt(sum((a - b)^2))
}
fun1 = function(x) {
  n = nrow(x)
  results = matrix(0, nrow = n, ncol = n)
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      results[i, j] = distance(x[i, ], x[j, ]) # was distance(m[i,], m[j,]); m is undefined inside fun1
    }
  }
  results
}
You can do it with just a matrix multiplication, some additions and a transpose.
x <- matrix(1:20, nrow = 5)
z <- x %*% t(x)                  # Gram matrix: z[i, j] is the dot product of rows i and j
sqrt(diag(z) + t(diag(z) - 2*z)) # d[i, j]^2 = z[i, i] + z[j, j] - 2 z[i, j]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 2 4 6 8
#> [2,] 2 0 2 4 6
#> [3,] 4 2 0 2 4
#> [4,] 6 4 2 0 2
#> [5,] 8 6 4 2 0
Interestingly this is faster than the built-in method mentioned in the comments above!
mdist <- function(x) {
  z <- x %*% t(x)
  sqrt(diag(z) + t(diag(z) - 2*z))
}
n <- 1000
l <- 100
x <- matrix(runif(n*l), ncol = l)
microbenchmark::microbenchmark(
  z1 = as.matrix(dist(x)),
  z2 = dist(x, diag = TRUE, upper = TRUE),
  z3 = mdist(x),
  times = 100
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> z1 82.98502 90.20049 98.54552 94.85027 101.78114 140.1809 100
#> z2 72.54279 76.22054 82.75410 79.31865 83.47765 231.3008 100
#> z3 54.58258 59.73461 65.62313 63.14435 67.49865 115.0379 100
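One caveat worth flagging (my note, not from the original answer): with floating-point inputs, diag(z) + t(diag(z) - 2*z) can produce a tiny negative value where the true squared distance is zero, and sqrt then returns NaN. A defensive sketch clamps at zero first:
mdist_safe <- function(x) {
  z <- x %*% t(x)
  d2 <- diag(z) + t(diag(z) - 2 * z) # squared distances, possibly -1e-16 etc. from rounding
  sqrt(pmax(d2, 0))                  # clamp rounding noise before taking the root
}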
In a pinch, Vectorize can do what you need:
m <- matrix(1:20, nrow = 5) # the example matrix from the question
outer(seq_len(nrow(m)), seq_len(nrow(m)),
      Vectorize(function(i, j) distance(m[i, ], m[j, ]), vectorize.args = c("i", "j")))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0
Vectorize takes a function as an argument and returns a function that accepts vectors, iterating internally. The function passed to it is called once for each element of the vectors passed. By default, Vectorize vectorizes all of the function's arguments; vectorize.args= restricts it to specific arguments, which it "zips" along in parallel, assuming they are all the same length.
This might be a little easier to visualize by redefining distance:
distance_ind = function(i, j, data) {
  sqrt(sum((data[i, ] - data[j, ])^2))
}
distance_ind(1, 2, m)
# [1] 2
distance_ind(c(1,3), c(2,3), m)
# [1] 2 ### wrong
distance_ind_vec <- Vectorize(distance_ind, vectorize.args = c("i", "j"))
distance_ind_vec(c(1,3), c(2,3), m)
# [1] 2 0
And the outer call:
outer(seq_len(nrow(m)), seq_len(nrow(m)), distance_ind_vec, data = m)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0

Find similar groups of numbers across rows R

I'm trying to find similar patterns of numbers across a dataframe. I have a dataframe with 5 columns, where the cells hold random numbers between 3 and 50. However, in some rows 2 or 3 columns don't have a number.
A  B  C  D  E
5  23 6  NA NA
9  33 7  8  12
33 7  14 NA NA
6  18 23 48 NA
8  44 33 7  9
I want to know what the recurring numbers are, so I'm interested in:
Rows 1 and 4, which share the numbers 23 and 6;
Rows 2 and 5, which share the numbers 9, 33 and 8;
Rows 2, 3 and 5, which share the numbers 33 and 7.
Basically I'm trying to get the number of different combinations.
I'm a bit stuck about how to do this. I've tried to join the numbers in a list.
for (i in 1:dim(knots_all)[1]) {
  knots_all$list_knots <- list(sort(knots_all[i, 1:5]))
}
I've also tried intersect but it doesn't seem very efficient, as R also considers the NAs, which I want to disregard.
I would like to hear some ideas about the best way to achieve this. I've been thinking about this problem but I'm not able to understand how to get to the answer. My mind is stuck so any idea is much appreciated!
Thank you!
There's no specific/target pattern you want to capture. It seems like you need a process to identify the numbers that appear more often in your dataset and then see in which rows they appear.
I'll modify your example dataset to have number 23 appearing twice in the same row in order to illustrate some useful differences in counts.
df = read.table(text = "
A B C D E
5 23 6 23 NA
9 33 7 8 12
33 7 14 NA NA
6 18 23 48 NA
8 44 33 7 9
", header=T)
library(dplyr)
library(tidyr)
df %>%
  mutate(row_id = row_number()) %>%    # add a row flag
  gather(col_name, value, -row_id) %>% # reshape to long format
  filter(!is.na(value)) %>%            # exclude NAs
  group_by(value) %>%                  # for each number value
  summarise(NumOccurrences = n(),      # count occurrences
            rows = paste(sort(row_id), collapse = "_"),                # capture rows
            NumRowOccurrences = n_distinct(row_id),                    # count occurrences in unique rows
            unique_rows = paste(sort(unique(row_id)), collapse = "_")) %>% # capture unique rows
  arrange(desc(NumOccurrences))        # order by number popularity (occurrences)
# # A tibble: 12 x 5
# value NumOccurrences rows NumRowOccurrences unique_rows
# <int> <int> <chr> <int> <chr>
# 1 7 3 2_3_5 3 2_3_5
# 2 23 3 1_1_4 2 1_4
# 3 33 3 2_3_5 3 2_3_5
# 4 6 2 1_4 2 1_4
# 5 8 2 2_5 2 2_5
# 6 9 2 2_5 2 2_5
# 7 5 1 1 1 1
# 8 12 1 2 1 2
# 9 14 1 3 1 3
# 10 18 1 4 1 4
# 11 44 1 5 1 5
# 12 48 1 4 1 4
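If the goal is the actual row combinations rather than per-number counts, a sketch of mine (assuming the same df as above) intersects the non-NA values of every pair of rows:
pairs <- combn(nrow(df), 2) # all 10 row pairs
shared <- apply(pairs, 2, function(p) {
  intersect(na.omit(unlist(df[p[1], ])), na.omit(unlist(df[p[2], ])))
})
names(shared) <- apply(pairs, 2, paste, collapse = "_")
shared[lengths(shared) >= 2] # pairs sharing at least two numbers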
Make a list of lists:
List = [1[], 2[], ..., n[]]
Loop through your data frame and, for your example, add A to the list at index 5, giving List = [1[], 2[], ... 5[A] ..., n[]]. Do the same for every column.
After this loop, check which of the inner lists are filled and span multiple columns.
This should get you started.
Good luck!
This is an algorithm which can detect numbers present in two columns.
df <- data.frame(A = c(5, 23, 6, NA, NA),
                 B = c(9, 33, 7, 8, 12),
                 C = c(33, 7, 14, NA, NA),
                 D = c(6, 18, 23, 48, NA),
                 E = c(8, 44, 33, 7, 9))
L <- as.list(df)
LL <- rep(list(rep(list(NA), length(L))), length(L))
for (i in 1:length(L)) {
  for (j in 1:length(L))
    LL[[i]][[j]] <- intersect(L[[i]], L[[j]])
}
To see the overlapping numbers in columns 1 and 4:
LL[[1]][[4]]
[1] 23 6 NA
To see all overlapping numbers:
unique(unlist(LL))
[1] 5 23 6 NA 9 33 7 8 12 14 18 48 44
It could be changed a little bit (by adding a level to the nested loop and an if inside the for loop) to see the presence in 3 different columns, etc.
One way of dealing with the NAs would be to temporarily fill them with randomly generated numbers:
# data
df <- data.frame(A = c(5, 9, 33, 6, 8),
                 B = c(23, 33, 7, 18, 44),
                 C = c(6, 7, 14, 23, 33),
                 D = c(NA, 8, NA, 48, 7),
                 E = c(NA, 12, NA, NA, 9))
# fill NA with random numbers
set.seed(1)
df2 <- as.data.frame(do.call(cbind, lapply(df, function(x) ifelse(is.na(x), rnorm(1), x))))
> df2
A B C D E
1 5 23 6 -0.6264538 0.1836433
2 9 33 7 8.0000000 12.0000000
3 33 7 14 -0.6264538 0.1836433
4 6 18 23 48.0000000 0.1836433
5 8 44 33 7.0000000 9.0000000
# split data by rows (seq_len(nrow(.)) rather than seq_along(.), which counts columns)
df2 <- split(df2, seq_len(nrow(df2)))
# compare rows with each other
temp <- lapply(lapply(df2, function(x) lapply(df2, function(y) x %in% y)),
               function(x) do.call(rbind, x))
# delete self comparisons
output <- lapply(1:5, function(x) temp[[x]] <- temp[[x]][-x,])
Result:
[[1]]
[,1] [,2] [,3] [,4] [,5]
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE TRUE TRUE
4 FALSE TRUE TRUE FALSE TRUE
5 FALSE FALSE FALSE FALSE FALSE
[[2]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE FALSE FALSE
3 FALSE TRUE TRUE FALSE FALSE
4 FALSE FALSE FALSE FALSE FALSE
5 TRUE TRUE TRUE TRUE FALSE
[[3]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE TRUE TRUE
2 TRUE TRUE FALSE FALSE FALSE
4 FALSE FALSE FALSE FALSE TRUE
5 TRUE TRUE FALSE FALSE FALSE
[[4]]
[,1] [,2] [,3] [,4] [,5]
1 TRUE FALSE TRUE FALSE TRUE
2 FALSE FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE TRUE
5 FALSE FALSE FALSE FALSE FALSE
[[5]]
[,1] [,2] [,3] [,4] [,5]
1 FALSE FALSE FALSE FALSE FALSE
2 TRUE FALSE TRUE TRUE TRUE
3 FALSE FALSE TRUE TRUE FALSE
4 FALSE FALSE FALSE FALSE FALSE
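Alternatively (a sketch of my own, not part of the answer above), the NAs can simply be dropped per row before comparing, which avoids the random fill entirely:
rows <- apply(df, 1, function(r) r[!is.na(r)]) # list of rows with NAs removed
sapply(rows, function(a) sapply(rows, function(b) length(intersect(a, b))))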

extract every two elements in matrix row in r in sequence to calculate euclidean distance

How can I extract every two elements in sequence from a matrix and return the result as a matrix, so that I can feed the answer into a formula for calculation?
For example, I have a one row matrix with 6 columns:
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    2    1    5    5   10    1
I want to extract columns 1 and 2 in the first iteration, 3 and 4 in the second iteration, and so on. The result has to be in the form of a matrix.
[1,]  2  1
[2,]  5  5
[3,] 10  1
My original code:
data <- matrix(c(1,1,1,2,2,1,2,2,5,5,5,6,10,1,10,2,11,1,11,2), ncol = 2)
Center Matrix:
      [,1] [,2] [,3] [,4] [,5] [,6]
 [1,]    2    1    5    5   10    1
 [2,]    1    1    2    1   10    1
 [3,]    5    5    5    6   11    2
 [4,]    2    2    5    5   10    1
 [5,]    2    1    5    6    5    5
 [6,]    2    2    5    5   11    1
 [7,]    2    1    5    5   10    1
 [8,]    1    1    5    6   11    1
 [9,]    2    1    5    5   10    1
[10,]    5    6   11    1   10    2
objCentroidDist <- function(data, centers) {
  resultMatrix <- matrix(NA, nrow = dim(data)[1], ncol = dim(centers)[1])
  for (i in 1:nrow(centers)) {
    resultMatrix[, i] <- sqrt(rowSums(t(t(data) - centers[i, ])^2))
  }
  resultMatrix
}
objCentroidDist(data, centers)
I want the result matrix to be as per below:
      [,1] [,2] [,3]
 [1,]
 [2,]
 [3,]
 [4,]
 [5,]
 [6,]
 [7,]
 [8,]
 [9,]
[10,]
My concern is how to calculate the data-to-centers distance when the data matrix has two columns but the centers matrix has six (i.e. how to calculate the distance from the data matrix to every two columns of the centers matrix). Each row of the centers matrix holds three centers.
Something like this maybe?
m <- matrix(c(2,1,5,5,10,1), ncol = 6)
list.seq.pairs <- lapply(seq(1, ncol(m), 2), function(x) {
  m[, c(x, x + 1)]
})
> list.seq.pairs
[[1]]
[1] 2 1
[[2]]
[1] 5 5
[[3]]
[1] 10 1
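For the expected output shown in the question (a 3 x 2 matrix rather than a list), a one-line sketch refills the row two elements at a time:
matrix(m, ncol = 2, byrow = TRUE) # take the 6 values in pairs
#      [,1] [,2]
# [1,]    2    1
# [2,]    5    5
# [3,]   10    1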
And, in case you're wanting to iterate over multiple rows in a matrix,
you can expand on the above like this:
mm <- matrix(1:18, ncol = 6, byrow = TRUE)
apply(mm, 1, function(x) {
  lapply(seq(1, length(x), 2), function(y) {
    x[c(y, y + 1)]
  })
})
EDIT:
I'm really not sure what you're after exactly. I think, if you want each row transformed into a 3 x 2 matrix:
mm <- matrix(1:18, ncol = 6, byrow = TRUE)
list.mats <- lapply(1:nrow(mm), function(x) {
  matrix(mm[x, ], ncol = 2, byrow = TRUE)
})
> list.mats
[[1]]
[,1] [,2]
[1,] 1 2
[2,] 3 4
[3,] 5 6
[[2]]
[,1] [,2]
[1,] 7 8
[2,] 9 10
[3,] 11 12
[[3]]
[,1] [,2]
[1,] 13 14
[2,] 15 16
[3,] 17 18
If, however, you want to get to your results matrix, I think it's probably easiest to do whatever calculations you need while you're dealing with each row:
results <- t(apply(mm, 1, function(x) {
  sapply(seq(1, length(x), 2), function(y) {
    val1 = x[y]     # Get item one
    val2 = x[y+1]   # Get item two
    val1 / val2     # Do your calculation here
  })
}))
> results
[,1] [,2] [,3]
[1,] 0.5000000 0.7500 0.8333333
[2,] 0.8750000 0.9000 0.9166667
[3,] 0.9285714 0.9375 0.9444444
That said, I don't understand what you're trying to do so this may miss the mark. You may have more luck if you ask a new question where you show example input and the actual expected output that you're after, with the actual values you expect.

Creating a 5x5 matrix with 0's lined diagonally

In R, I want to create a 5x5 matrix of 0, 1, 3, 5, 7 such that:
0 1 3 5 7
1 0 3 5 7
1 3 0 5 7
1 3 5 0 7
1 3 5 7 0
So obviously I can generate the starting matrix:
z <- c(0,1,3,5,7)
matrix(z, ncol=5, nrow=5, byrow = TRUE)
but I'm unsure of how to move the 0's position. I'm sure I have to use some sort of for/in loop, but I really don't know what exactly I need to do.
How about this:
m <- 1 - diag(5)
m[m==1] <- rep(c(1,3,5,7), each=5)
m
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
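Why this works (my explanation, not part of the original answer): logical subassignment fills the selected cells in column-major order, and each run of five repeated values covers one column's four off-diagonal cells plus the first off-diagonal cell of the next column, which is exactly where that number belongs. A smaller sketch makes the ordering visible:
m <- 1 - diag(3)
m[m == 1] <- 101:106 # fills (2,1), (3,1), (1,2), (3,2), (1,3), (2,3) in that order
m
#      [,1] [,2] [,3]
# [1,]    0  103  105
# [2,]  101    0  106
# [3,]  102  104    0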
Or we can do:
z <- c(1,3,5,7)
mat <- 1-diag(5)
mat[mat==1] <- z
t(mat)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Yet another solution just to enjoy combn as well:
r <- integer(5)
t(combn(5, 1, function(v) { r[v] <- 0; r[-v] <- z; r }))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Or using sapply:
v <- integer(5)
t(sapply(seq(5), function(x) { v[x] <- 0; v[-x] <- z; v }))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Here's a solution that builds the data vector with a couple of calls to rep(), a couple of calls to c(), a seq(), and an rbind(), and then wraps it in a call to matrix():
N <- 5L;
matrix(rep(c(0,rbind(seq(1,(N-1)*2,2),0)),rep(c(1,N),len=N*2-1)),N);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 3 5 7
## [2,] 1 0 3 5 7
## [3,] 1 3 0 5 7
## [4,] 1 3 5 0 7
## [5,] 1 3 5 7 0
Another idea, using two calls to diag() and a cumsum():
N <- 5L;
(1-diag(N))*(cumsum(diag(N)*2)-1);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 3 5 7
## [2,] 1 0 3 5 7
## [3,] 1 3 0 5 7
## [4,] 1 3 5 0 7
## [5,] 1 3 5 7 0
Benchmarking
Note: For the following benchmarking tests I modified everyone's solutions where necessary to ensure they are parameterized on the matrix size N. For the most part, this just involved replacing some literals with N, and replacing instances of c(1,3,5,7) with seq(1,(N-1)*2,2). I think this is fair.
library(microbenchmark);
josh <- function(N) { m <- 1-diag(N); m[m==1] <- rep(seq(1,(N-1)*2,2),each=N); m; };
marat <- function(N) matrix(rbind(0,col(diag(N))*2-1),nrow=N,ncol=N);
gregor <- function(N) { x = seq(1,(N-1)*2,2); t(mapply(FUN = append, after = c(0, seq_along(x)), MoreArgs = list(x = x, values = 0))); };
barkley <- function(N) { my_vec <- seq(1,(N-1)*2,2); my_val <- 0; my_mat <- matrix(NA, ncol = length(my_vec)+1, nrow = length(my_vec)+1); for (i in 1:nrow(my_mat)) { my_mat[i, i] <- my_val; my_mat[i, -i] <- my_vec; }; my_mat; };
m0h3n <- function(N) { z <- seq(1,(N-1)*2,2); mat=1-diag(N); mat[mat==1]=z; t(mat); };
bgoldst1 <- function(N) matrix(rep(c(0,rbind(seq(1,(N-1)*2,2),0)),rep(c(1,N),len=N*2-1)),N);
bgoldst2 <- function(N) (1-diag(N))*(cumsum(diag(N)*2)-1);
## small-scale: 5x5
N <- 5L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: microseconds
## expr min lq mean median uq max neval
## josh(N) 20.101 21.8110 25.71966 23.0935 24.8045 108.197 100
## marat(N) 5.987 8.1260 9.01131 8.5535 8.9820 24.805 100
## gregor(N) 49.608 51.9605 57.61397 53.8850 61.7965 98.361 100
## barkley(N) 29.081 32.0750 36.33830 33.7855 41.9110 54.740 100
## m0h3n(N) 22.666 24.8040 28.45663 26.0870 28.4400 59.445 100
## bgoldst1(N) 20.528 23.0940 25.49303 23.5220 24.8050 56.879 100
## bgoldst2(N) 3.849 5.1320 5.73551 5.5600 5.9880 16.251 100
## medium-scale: 50x50
N <- 50L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: microseconds
## expr min lq mean median uq max neval
## josh(N) 106.913 110.7630 115.68488 113.1145 116.1080 179.187 100
## marat(N) 62.866 65.4310 78.96237 66.7140 67.9980 1163.215 100
## gregor(N) 195.438 205.2735 233.66129 213.6130 227.9395 1307.334 100
## barkley(N) 184.746 194.5825 227.43905 198.6455 207.1980 1502.771 100
## m0h3n(N) 73.557 76.1230 92.48893 78.6885 81.6820 1176.045 100
## bgoldst1(N) 51.318 54.3125 95.76484 56.4500 60.0855 1732.421 100
## bgoldst2(N) 18.817 21.8110 45.01952 22.6670 23.5220 1118.739 100
## large-scale: 1000x1000
N <- 1e3L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: milliseconds
## expr min lq mean median uq max neval
## josh(N) 40.32035 43.42810 54.46468 45.36386 80.17241 90.69608 100
## marat(N) 41.00074 45.34248 54.74335 47.00904 50.74608 93.85429 100
## gregor(N) 33.65923 37.82393 50.50060 40.24914 75.09810 83.27246 100
## barkley(N) 31.02233 35.42223 43.08745 36.85615 39.81999 85.28585 100
## m0h3n(N) 27.08622 31.00202 38.98395 32.33244 34.33856 90.82652 100
## bgoldst1(N) 12.53962 13.02672 18.31603 14.92314 16.96433 59.87945 100
## bgoldst2(N) 13.23926 16.87965 28.81906 18.92319 54.60009 62.01258 100
## very large scale: 10,000x10,000
N <- 1e4L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: seconds
## expr min lq mean median uq max neval
## josh(N) 3.698714 3.908910 4.067409 4.046770 4.191938 4.608312 100
## marat(N) 6.440882 6.977273 7.272962 7.223293 7.493600 8.471888 100
## gregor(N) 3.546885 3.850812 4.032477 4.022563 4.221085 4.651799 100
## barkley(N) 2.955906 3.162409 3.324033 3.279032 3.446875 4.444848 100
## m0h3n(N) 3.355968 3.667484 3.829618 3.777151 3.973279 4.649226 100
## bgoldst1(N) 1.044510 1.260041 1.363827 1.369945 1.441194 1.819248 100
## bgoldst2(N) 1.144168 1.391711 1.517189 1.519653 1.629994 2.478636 100
Perhaps not the most beautiful solution ever, but maybe elegant in its simplicity:
my_vec <- c(1,3,5,7)
my_val <- 0
my_mat <- matrix(NA, ncol = length(my_vec)+1, nrow = length(my_vec)+1)
for (i in 1:nrow(my_mat)) {
  my_mat[i, i] <- my_val
  my_mat[i, -i] <- my_vec
}
my_mat
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 3 5 7
[2,] 1 0 3 5 7
[3,] 1 3 0 5 7
[4,] 1 3 5 0 7
[5,] 1 3 5 7 0
You could use
n <- 5
matrix(rbind(0, col(diag(n)) * 2 - 1), nrow = n, ncol = n)
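A brief unpacking of that one-liner (my annotation, same result): rbind(0, ...) prepends a 0 to each column of constants, giving a 6 x 5 block, and refilling that block into a 5 x 5 matrix column-major shifts the 0 down one row per column (matrix() keeps just the first 25 of the 30 values, silently, since 30 is a multiple of n):
n <- 5
block <- rbind(0, col(diag(n)) * 2 - 1) # each column: a 0 on top of five copies of 1, 3, 5, 7 or 9
block
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    0    0    0    0    0
# [2,]    1    3    5    7    9
# [3,]    1    3    5    7    9
# [4,]    1    3    5    7    9
# [5,]    1    3    5    7    9
# [6,]    1    3    5    7    9
matrix(block, nrow = n, ncol = n)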
Fun question! In poking around, I saw that append has an after argument.
x = c(1, 3, 5, 7)
t(mapply(FUN = append, after = c(0, seq_along(x)),
         MoreArgs = list(x = x, values = 0)))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Another option, directly constructing each row:
v = c(1, 3, 5, 7)
n = length(v)
t(sapply(0:n, function(i) c(v[0:i], 0, v[seq(to = n, length.out = n - i)])))
# [,1] [,2] [,3] [,4] [,5]
#[1,] 0 1 3 5 7
#[2,] 1 0 3 5 7
#[3,] 1 3 0 5 7
#[4,] 1 3 5 0 7
#[5,] 1 3 5 7 0
