calculate distance between each pair of coordinates in wide dataframe - r

I want to calculate the distance between two linked sets of spatial coordinates (program and admin in my fake dataset). The data are in a wide format, so both pairs of coordinates are in the same row.
library(sp)
set.seed(1)
n <- 100
program.id <- seq(1, n)
c1 <- cbind(runif(n, -90, 90), runif(n, -180, 180))
c2 <- cbind(runif(n, -90, 90), runif(n, -180, 180))
dat <- data.frame(cbind(program.id, c1, c2))
names(dat) <- c("program.id", "program.lat", "program.long", "admin.lat", "admin.long")
head(dat)
#   program.id program.lat program.long  admin.lat  admin.long
# 1          1   -42.20844     55.70061 -41.848523   62.536404
# 2          2   -23.01770    -52.84898 -50.643849 -145.851172
# 3          3    13.11361    -82.70635   3.023431   -2.665397
# 4          4    73.47740    177.36626 -41.588893  -13.841337
# 5          5   -53.69725     48.05758 -57.389701  -44.922049
# 6          6    71.71014   -103.24507   3.343705  176.795719
I know how to create a matrix of distances among the program or admin locations using the sp package:
ll <- c("program.lat", "program.long")
coords <- dat[ll]
dist <- apply(coords, 1,
              function(eachPoint) spDistsN1(as.matrix(coords),
                                            eachPoint, longlat = TRUE))
But what I want to do is create an n x 1 vector of distances (dist.km) between each pair of coordinates and add it to dat.
#   program.id program.lat program.long  admin.lat  admin.long dist.km
# 1          1   -42.20844     55.70061 -41.848523   62.536404  567.35
# 2          2   -23.01770    -52.84898 -50.643849 -145.851172 8267.86
# ...
Any suggestions? I've spent a while going through old SO questions, but nothing seems quite right. Happy to be proven wrong.
Update
@Amit's solution works for my toy dataset:
apply(dat,1,function(x) spDistsN1(matrix(x[2:3],nrow=1),x[3:4],longlat=TRUE))
But I think I need to swap the order of the lat/long columns so long comes before lat. From ?spDistsN1:
pts: A matrix of 2D points, first column x/longitude, second column y/latitude, or a SpatialPoints or SpatialPointsDataFrame object
Also, unless I've misunderstood the logic, I think Amit's solution should grab cols [2:3] and [4:5], not [2:3] and [3:4].
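Putting both fixes together for the toy data (long before lat, and columns [2:3] / [4:5] reordered accordingly), a sketch like this should work:
apply(dat, 1, function(x)
  spDistsN1(matrix(x[c(3, 2)], nrow = 1), x[c(5, 4)], longlat = TRUE))
This works on the toy data because every column is numeric; see the answer below for why the same pattern breaks on the real data.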
My challenge now is applying this to my actual data. I've reproduced a portion below.
library(sp)
dat <- structure(list(ID = 1:4,
subcounty = c("a", "b", "c", "d"),
pro.long = c(33.47627919, 31.73605491, 31.54073482, 31.51748984),
pro.lat = c(2.73996953, 3.26530095, 3.21327597, 3.17784981),
sub.long = c(33.47552, 31.78307, 31.53083, 31.53083),
sub.lat = c(2.740362, 3.391209, 3.208736, 3.208736)),
.Names = c("ID", "subcounty", "pro.long", "pro.lat", "sub.long", "sub.lat"),
row.names = c(NA, 4L), class = "data.frame")
head(dat)
#   ID subcounty pro.long  pro.lat sub.long  sub.lat
# 1  1         a 33.47628 2.739970 33.47552 2.740362
# 2  2         b 31.73605 3.265301 31.78307 3.391209
# 3  3         c 31.54073 3.213276 31.53083 3.208736
# 4  4         d 31.51749 3.177850 31.53083 3.208736
apply(dat, 1, function(x) spDistsN1(matrix(x[3:4], nrow = 1),
                                    x[5:6],
                                    longlat = TRUE))
I get the error: Error in spDistsN1(matrix(x[3:4], nrow = 1), x[5:6], longlat = TRUE) : pts must be numeric
I'm confused because these columns are numeric:
> is.numeric(dat$pro.long)
[1] TRUE
> is.numeric(dat$pro.lat)
[1] TRUE
> is.numeric(dat$sub.long)
[1] TRUE
> is.numeric(dat$sub.lat)
[1] TRUE

The problem you're having is that apply(...) coerces the first argument to a matrix. By definition, a matrix must have all elements of the same data type. Since one of the columns in dat (dat$subcounty) is character, apply(...) coerces everything to character. In your test dataset everything was numeric, so you didn't have this problem.
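You can see the coercion directly:
m <- as.matrix(dat)       # this is what apply(...) builds internally
class(m[1, "pro.long"])   # "character": the whole matrix was coerced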
This should work:
dat$dist.km <- sapply(1:nrow(dat), function(i)
  spDistsN1(as.matrix(dat[i, 3:4]), as.matrix(dat[i, 5:6]), longlat = TRUE))
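An alternative sketch that keeps apply(...) but subsets to the numeric columns first, so the coerced matrix stays numeric (column positions as in dat above):
dat$dist.km <- apply(dat[, 3:6], 1, function(x)
  spDistsN1(matrix(x[1:2], nrow = 1), x[3:4], longlat = TRUE))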

There is a much faster solution using data.table and geosphere.
library(data.table)
library(geosphere)
setDT(dat)[, dist_km := distGeo(matrix(c(pro.long, pro.lat), ncol = 2),
                                matrix(c(sub.long, sub.lat), ncol = 2)) / 1000]
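The speedup comes from distGeo() being vectorized over rows, so there is no per-row loop. If you prefer to stay in base data frames, the same call should work without data.table:
dat$dist_km <- geosphere::distGeo(cbind(dat$pro.long, dat$pro.lat),
                                  cbind(dat$sub.long, dat$sub.lat)) / 1000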
Benchmark:
library(sp)
jlhoward <- function(dat) {
  dat$dist.km <- sapply(1:nrow(dat), function(i)
    spDistsN1(as.matrix(dat[i, 3:4]), as.matrix(dat[i, 5:6]), longlat = TRUE))
}
rafa.pereira <- function(dat2) {
  setDT(dat2)[, dist_km := distGeo(matrix(c(pro.long, pro.lat), ncol = 2),
                                   matrix(c(sub.long, sub.lat), ncol = 2)) / 1000]
}
> system.time( jlhoward(dat) )
   user  system elapsed
   8.94    0.00    8.94
> system.time( rafa.pereira(dat) )
   user  system elapsed
   0.07    0.00    0.08
Data
dat <- structure(list(ID = 1:4,
subcounty = c("a", "b", "c", "d"),
pro.long = c(33.47627919, 31.73605491, 31.54073482, 31.51748984),
pro.lat = c(2.73996953, 3.26530095, 3.21327597, 3.17784981),
sub.long = c(33.47552, 31.78307, 31.53083, 31.53083),
sub.lat = c(2.740362, 3.391209, 3.208736, 3.208736)),
.Names = c("ID", "subcounty", "pro.long", "pro.lat", "sub.long", "sub.lat"),
row.names = c(NA, 4L), class = "data.frame")
# enlarge dataset to 40,000 pairs
dat <- dat[rep(seq_len(nrow(dat)), 10000), ]

Related

Replace integers in a data frame column with other integers in R?

I want to replace the values of a data frame column that contains only 4 distinct numbers with specific numbers, as shown below:
tt <- rep(c(1,2,3,4), each = 10)
df <- data.frame(tt)
I want to replace 1 with 10, 2 with 200, 3 with 458, and 4 with -0.1.
You could use recode from dplyr. Note that the old values are written as characters, and the result is numeric since one of the new values (-0.1) is fractional:
library(tidyverse)
df %>%
  mutate(tt = recode(tt, '1' = 10, '2' = 200, '3' = 458, '4' = -0.1))
     tt
1  10.0
2  10.0
3 200.0
4 200.0
5 458.0
6 458.0
7  -0.1
8  -0.1
To correct the error in the code in the question and to provide a shorter example, we use the input in the Note at the end. Here are several alternatives; nos, defined in (1), is used in some of the others too. No packages are used.
1) indexing: Since the values of tt are in 1:4, we can use them to index directly into nos. This is probably the simplest solution.
nos <- c(10, 200, 458, -0.1)
transform(df, tt = nos[tt])
## tt
## 1 10.0
## 2 10.0
## 3 200.0
## 4 200.0
## 5 458.0
## 6 458.0
## 7 -0.1
## 8 -0.1
1a) If the input is not necessarily in 1:4, then we could use this generalization:
transform(df, tt = nos[match(tt, 1:4)])
2) arithmetic Another approach is to use arithmetic:
transform(df, tt = 10 * (tt == 1) +
                  200 * (tt == 2) +
                  458 * (tt == 3) +
                 -0.1 * (tt == 4))
3) outer/matrix multiplication This would also work:
transform(df, tt = c(outer(tt, 1:4, `==`) %*% nos))
3a) This is the same except we use model.matrix instead of outer.
transform(df, tt = c(model.matrix(~ factor(tt) + 0, df) %*% nos))
4) factor The levels of the factor are 1:4 and the corresponding labels are defined by nos. Extract the labels using format and then convert them to numeric.
transform(df, tt = as.numeric(format(factor(tt, levels = 1:4, labels = nos))))
4a) or as a pipeline
transform(df, tt = tt |>
factor(levels = 1:4, labels = nos) |>
format() |>
as.numeric())
5) loop We can use a simple loop. Nulling out i at the end is so that it is not made into a column.
within(df, { for(i in 1:4) tt[tt == i] <- nos[i]; i <- NULL })
6) Reduce This is somewhat similar to (5) but implements the loop using Reduce.
fun <- function(tt, i) replace(tt, tt == i, nos[i])
transform(df, tt = Reduce(fun, init = tt, 1:4))
Note
df <- data.frame(tt = c(1, 1, 2, 2, 3, 3, 4, 4))

Thousand separator to numeric columns in R

I am trying to format numbers as shown (adding a thousand separator). The function works fine, but after formatting the numbers, the column no longer sorts numerically since the values are now characters.
df <- data.frame(x = c(12345,35666,345,5646575))
format_numbers <- function(df, column_name) {
  df[[column_name]] <- ifelse(nchar(df[[column_name]]) <= 5,
                              paste(format(round(df[[column_name]] / 1e3, 1), trim = TRUE), "K"),
                              paste(format(round(df[[column_name]] / 1e6, 1), trim = TRUE), "M"))
}
df$x <- format_numbers(df,"x")
> df
       x
1 12.3 K
2 35.7 K
3  0.3 K
4  5.6 M
Can we make sure the numbers still sort in ascending/descending order post formatting?
Note: this data frame df is to be incorporated in a DT table.
The problem is the formatting part. If you do it correctly, i.e. while maintaining your data as numeric, then everything else will fall into place. Here I will demonstrate using an S3 class:
# numeric vector with an extra class so we can control printing
my_numbers <- function(x) structure(x, class = c('my_numbers', 'numeric'))

# recursively divide by 1000 until the value is below 1000, then add the suffix
format.my_numbers <- function(x, ..., d = 1, L = c('', 'K', 'M', 'B', 'T')) {
  ifelse(abs(x) >= 1000, Recall(x / 1000, d = d + 1), sprintf('%.1f%s', x, L[d]))
}

print.my_numbers <- function(x, ...) print(format(x), quote = FALSE)

# keep the class when subsetting, so formatting survives `[`
'[.my_numbers' <- function(x, ..., drop = FALSE) my_numbers(NextMethod('['))
Now you can run your code:
df <- data.frame(x = c(12345,35666,345,5646575))
df$x <- my_numbers(df$x)
df
      x
1 12.3K
2 35.7K
3 345.0
4  5.6M
You can use any mathematical operation on column x, as it is still numeric underneath. For example, cbind-ing it with its double and ordering from smallest to largest:
cbind(x = df, y = df*2)[order(df$x),]
      x     x
3 345.0 690.0   # smallest
1 12.3K 24.7K
2 35.7K 71.3K
4  5.6M 11.3M   # largest, i.e. millions
Note that under the hood, x does not change:
unclass(df$x)
[1] 12345 35666 345 5646575 # Same as given

Creating mock data with natural decreasing numbers

I want to create random mock data that looks like this.
ID | Amount
 1 |     20
 1 |     14
 1 |      9
 1 |      3
 2 |     11
 2 |      5
 2 |      2
Start from a random number, but the second number within the same ID should be less than the first one, and the third number less than the second. The maximum starting number should be 20.
You can just create the data first and then sort it as you need, using the tidyverse:
set.seed(0)
df <- data.frame(id = rep(1:3,10), amt = sample(1:20, 30, replace = TRUE))
library(dplyr)
df %>%
  group_by(id) %>%
  arrange(id, desc(amt))
This is a tricky one. If you want the Amount column to be truly random values, you can use a recursive call that samples repeatedly:
## Recursively sampling from a uniform distribution
recursive.sample <- function(start, end, length, results = NA, counter =0) {
## To enter the recursion, counter must be smaller than the length out
## and the last result must be smaller than the starting point (except the firs time)
if(counter < length && ifelse(counter != 0, results[counter] > start, TRUE)){
## Increment the counter
counter <- counter + 1
## Sample between start and the last result or the start and the end of the vector
results[counter] <- ifelse(counter != 1, sample(start:results[counter-1], 1), sample(start:end, 1))
## Recursive call
return(recursive.sample(start = start, end = end, length = length, results = results, counter = counter))
} else {
## Exit the recursion
return(results)
}
}
## Example
set.seed(0)
recursive.sample(start = 1, end = 20, length = 3, results = NA, counter = 0)
#[1] 18 5 2
Alternatively (and way easier) you can use sort(sample()):
set.seed(0)
sort(sample(1:20, 3), decreasing = TRUE)
#[1] 18 7 6
Note that the results differ due to the lower probability of sampling higher values in the recursive function.
You can then easily create your table with your chosen function as follows:
set.seed(123)
## The ID column
ID <- c(rep(1, 4), rep(2,3))
## The Amount column
Amount <- c(recursive.sample(1, 20, 4, NA, 0), recursive.sample(1, 11, 3, NA, 0))
## The table
cbind(ID, Amount)
#      ID Amount
# [1,]  1     18
# [2,]  1      5
# [3,]  1      2
# [4,]  1      2
# [5,]  2     10
# [6,]  2      3
# [7,]  2      3
Or, again, with the simple sort(sample()) function for a higher probability of picking larger numbers.
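For completeness, a sketch of that variant on the same IDs (values will differ from the recursive version, as noted above):
set.seed(123)
ID <- c(rep(1, 4), rep(2, 3))
Amount <- c(sort(sample(1:20, 4), decreasing = TRUE),
            sort(sample(1:11, 3), decreasing = TRUE))
cbind(ID, Amount)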
Two methods, one using dplyr and one using only base R functions. These are slightly different from the two previous solutions.
I used a sorted ID column, but this is not necessary.
Method 1
rm(list = ls())
set.seed(1)
df <- data.frame(ID = rep(1:3, each = 5))
library(dplyr)
df %>% group_by(ID) %>%
  mutate(Amount = sort(sample(1:20, n(), replace = TRUE), decreasing = TRUE))
Method 2
rm(list = ls())
set.seed(1)
df <- data.frame(ID = rep(1:3, each = 5))
df$Amount <- NA
uniq_ID <- unique(df$ID)
index_lst <- lapply(uniq_ID, function(x) which(df$ID == x))
res <- lapply(index_lst, function(x) sort(sample(1 : 20, length(x)),
decreasing = TRUE))
df$Amount[unlist(index_lst)] <- unlist(res)
Method 2.5
This is more convoluted than the 2nd method.
rm(list = ls())
set.seed(1)
df <- data.frame(ID = rep(1:3, each = 5))
df$Amount <- NA
tab <- as.data.frame(table(df$ID))
lapply(1 : nrow(tab), function(x) df$Amount[which(df$ID == tab$Var1[x])] <<-
sort(sample(1 : 20, tab$Freq[x]), decreasing = TRUE))

How to keep only common row names between 4 data frames in R?

I have 4 data frames of genes; each data frame has gene names as row names and about 20 columns of sample data. Each data frame has this many rows (genes):
A: 10,000 genes
B: 15,000 genes
C: 35,000 genes
D: 12,000 genes
Here is what I tried; it didn't select the complete list of 9,000 common rows (genes):
Data_A = read.csv("matrix_A.csv");
Data_B = read.csv("matrix_B.csv");
Data_C = read.csv("matrix_C.csv");
Data_D = read.csv("matrix_D.csv");
Expr_A = as.data.frame(t(Data_A[, -c(1:8)]))
Expr_B = as.data.frame(t(Data_B[, -c(1:8)]))
Expr_C = as.data.frame(t(Data_C[, -c(1:8)]))
Expr_D = as.data.frame(t(Data_D[, -c(1:8)]))
commonGenes1 = intersect (rownames(Data_A),rownames(Data_D))
commonGenes2 = intersect (rownames(Data_B),rownames(Data_D))
commonGenes3 = intersect (rownames(Data_C),rownames(Data_D))
Data_A = Data_A[commonGenes1,]
Data_B = Data_B[commonGenes2,]
Data_C = Data_C[commonGenes3,]
They all have 9,000 genes in common, though the data are so big I can't do this in Excel. I'm using R to treat the data, is there a way to select the common genes between the 4 data frames in R?
An example of the 4 matrices is here:
http://www.filedropper.com/matrixexample
Let's actually put things in a list (as your title suggests), it's good practice.
list_of_data = list(Data_A, Data_B, Data_C, Data_D)
## for demo purposes, you can use
# list_of_data = list(mtcars[1:6, ], mtcars[4:9, ])
# this will get the intersection of the row.names for everything in the list
common_names = Reduce(intersect, lapply(list_of_data, row.names))
list_of_data = lapply(list_of_data, function(x) { x[row.names(x) %in% common_names,] })
Thanks to @eipi10 for a better way to filter rows for each data frame in a list. Check out the revision history for a lame for loop.
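A minimal, self-contained demo of the idea, using the mtcars subsets from the comment above as stand-ins for your four data frames:
list_of_data = list(mtcars[1:6, ], mtcars[4:9, ])
common_names = Reduce(intersect, lapply(list_of_data, row.names))
common_names
# [1] "Hornet 4 Drive"    "Hornet Sportabout" "Valiant"
list_of_data = lapply(list_of_data, function(x) { x[row.names(x) %in% common_names,] })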
What about this?
# Create some fake data:
set.seed(123)
m1 <- cbind(sample(1:5), round(rnorm(5),2))
m2 <- cbind(sample(1:5), round(rnorm(5),2))
m3 <- cbind(sample(1:5), round(rnorm(5),2))
m4 <- cbind(sample(1:5), round(rnorm(5),2))
rownames(m1) <- LETTERS[sample(1:10, 5)]
rownames(m2) <- LETTERS[sample(1:10, 5)]
rownames(m3) <- LETTERS[sample(1:10, 5)]
rownames(m4) <- LETTERS[sample(1:10, 5)]
ind <- sapply(list(m1,m2,m3), function(x) intersect(rownames(x), rownames(m4)))
mapply(function(x, y) x[rownames(x) %in% y,], x = list(m1,m2,m3), y = ind)
[[1]]
[,1] [,2]
A 4 1.24
D 5 -0.11
E 1 0.18
[[2]]
[,1] [,2]
E 5 1.22
C 2 -0.56
[[3]]
[,1] [,2]
A 2 -0.22
C 1 -0.33

extract unique rows with a condition in r

I have this kind of data:
x <- matrix(c(2,2,3,3,3,4,4,20,33,2,3,45,6,9,45,454,7,4,6,7,5), nrow = 7, ncol = 3)
In the real dataset, I have a huge matrix with a lot of columns.
I want to extract unique rows with respect to the first column (Id) and the minimum of the third column. For instance, for this matrix I would expect:
y <- matrix(c(2,3,4,20,3,9,45,4,5), nrow = 3, ncol = 3)
I tried a lot of things but I couldn't figure it out.
Any help is appreciated.
Thanks in advance,
Zeray
Here's a version that is more complicated, but somewhat faster than Chase's ddply solution, some 200x faster :-)
uniqueMin <- function(m, idCol = 1L, minCol = ncol(m)) {
  # split the row indices by id, then keep each group's row with the minimal minCol value
  t(vapply(split(1:nrow(m), m[, idCol]),
           function(i, x, minCol) x[i, , drop = FALSE][which.min(x[i, minCol]), ],
           m[1, ], x = m, minCol = minCol))
}
And the following test code:
library(plyr)  # for the ddply comparison
nRows <- 10000
nCols <- 100
ids <- nRows / 5
m <- cbind(sample(ids, nRows, TRUE), matrix(runif(nRows * nCols), nRows))
system.time( a <- uniqueMin(m, minCol = 3L) )                                  # 0.07
system.time( ddply(as.data.frame(m), "V1", function(x) x[which.min(x$V3), ]) ) # 15.72
You can use package plyr. Convert to a data.frame so you can group on the first column, then use which.min to extract the min row by group:
library(plyr)
ddply(as.data.frame(x), "V1", function(x) x[which.min(x$V3) ,])
  V1 V2 V3
1  2 20 45
2  3  3  4
3  4  9  5
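For readers on current tooling: plyr is retired, so a rough dplyr equivalent of the same group-wise minimum (assuming dplyr >= 1.0 for slice_min) would be:
library(dplyr)
as.data.frame(x) %>%
  group_by(V1) %>%
  slice_min(V3, n = 1, with_ties = FALSE) %>%
  ungroup()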
