Finding closest point by comparing two data frames in R - r

I have a set of two data frames in R
First:
site_no <- c("02110500","02110550", "02110701" , "02110704", "02110760", "02110777", "021108044", "02110815")
lat_coor <- c(33.91267, 33.85083, 33.86100, 33.83295, 33.74073, 33.85156, 33.65017, 33.44461)
long_coor <- c(-78.71502, -78.89722, -79.04115, -79.04365, -78.86669, -78.65585, -79.12310, -79.17393)
AllStations <- data.frame(site_no, lat_coor, long_coor)
Second:
station <- c("USGS-02146110","USGS-02146110","USGS-02146110","USGS-02146110","USGS-02146110","USGS-021473426","USGS-021473426","USGS-021473426")
latitude <- c(34.88928, 34.85651, 34.85651, 34.85651, 34.71679, 34.24320, 34.80012, 34.80012)
longitude <- c(-81.06869, -82.22622, -82.22622, -82.22622, -82.17372, -81.31954, -82.36512, -82.36512)
ContaminantStations <- data.frame(station, latitude, longitude)
My data sets are a lot longer but for the purpose of this question I think this should be enough.
What I would like is to find all the stations from the first data frame (AllStations) that are inside a radius of the points in the second data frame (ContaminantStations) and append them to a new data frame (only the ones from AllStations); I need to extract each station with all its information. I've tried some logical comparisons, but none of them work or make sense. I also tried RANN::nn2, but that only gives me the count.
Any help would be appreciated

I think you just need to iterate over each row of AllStations and return the nearest of the ContaminantStations that is within a radius.
#' Index of the nearest contaminant station within a radius, per station
#'
#' @param stations Two-column frame of coordinates, longitude first.
#' @param constations Two-column frame of contaminant-station coordinates,
#'   longitude first.
#' @param radius Maximum distance in meters. Either a single value applied to
#'   every row of `constations`, or one value per row of `constations`.
#' @return An integer vector with one element per row of `stations`: the row
#'   index in `constations` of the closest station that lies within its
#'   radius, or 0 when none does. `NULL` when either input has no rows.
func <- function(stations, constations, radius = 250000) {
  n_stations <- NROW(stations)
  n_constations <- NROW(constations)
  if (n_stations == 0L || n_constations == 0L) return(NULL)
  if (length(radius) == 1L) {
    radius <- rep(radius, n_constations)
  } else if (length(radius) != n_constations) {
    stop("'radius' must be length 1 or the same as the number of rows in 'constations'")
  }
  # Pre-filled with 0, which doubles as the "nothing within radius" marker.
  out <- integer(n_stations)
  for (i in seq_len(n_stations)) {
    dists <- geosphere::distHaversine(stations[i, ], constations)
    # Only stations inside their own radius are candidates. With per-station
    # radii the overall nearest station may itself be out of range, so the
    # minimum must be taken over the candidates rather than over all distances.
    within <- which(dists <= radius)
    if (length(within) > 0L) {
      out[i] <- within[which.min(dists[within])]
    }
  }
  out
}
This returns an integer vector, indicating the contaminant station that is closest. If none are within the radius, then it returns 0. This is safely used as a row-index on the original frame.
Each argument must include only two columns, with the first column being longitude. (I make no assumptions of the column names in the function.) radius is in meters, consistent with the geosphere package assumptions.
ind <- func(AllStations[,c("long_coor", "lat_coor")], ContaminantStations[,c("longitude", "latitude")],
radius = 230000)
ind
# [1] 0 6 6 6 0 0 6 6
These are indices on the ContaminantStations rows, where non-zero means that that contaminant station is the closest to the specific row of AllStations.
We can identify which contaminant station is closest with this (there are many ways to do this, including tidyverse and other techniques ... this is just a start).
# Start with NA for every station; only rows with a match (ind > 0) are filled.
AllStations$ClosestContaminantStation <- NA_character_
# Zero entries in `ind` are dropped when `ind` is used as a subscript, so the
# right-hand side has exactly one value per TRUE in `ind > 0`.
AllStations$ClosestContaminantStation[ind > 0] <- ContaminantStations$station[ind]
AllStations
# site_no lat_coor long_coor ClosestContaminantStation
# 1 02110500 33.91267 -78.71502 <NA>
# 2 02110550 33.85083 -78.89722 USGS-021473426
# 3 02110701 33.86100 -79.04115 USGS-021473426
# 4 02110704 33.83295 -79.04365 USGS-021473426
# 5 02110760 33.74073 -78.86669 <NA>
# 6 02110777 33.85156 -78.65585 <NA>
# 7 021108044 33.65017 -79.12310 USGS-021473426
# 8 02110815 33.44461 -79.17393 USGS-021473426
A vis of your data for perspective:
An alternative to this approach would be to return the distance and index of the closest contaminant station, regardless of the radius, allowing you to filter later.
#' Nearest contaminant station and its distance, per station
#'
#' @param stations Two-column frame of coordinates, longitude first.
#' @param constations Two-column frame of contaminant-station coordinates,
#'   longitude first.
#' @param radius Only validated (length 1 or one per row of `constations`);
#'   it does not affect the result, so filtering by distance is left to the
#'   caller.
#' @return A data frame with one row per row of `stations` and columns `ind`
#'   (row index of the closest station in `constations`) and `dist` (the
#'   distance to it). `NULL` when either input has no rows.
func2 <- function(stations, constations, radius = 250000) {
  n_stations <- NROW(stations)
  n_contam <- NROW(constations)
  if (n_stations == 0 || n_contam == 0) return()
  if (length(radius) != 1 && length(radius) != n_contam) {
    stop("'radius' must be length 1 or the same as the number of rows in 'constations'")
  }
  nearest_idx <- integer(n_stations)
  nearest_gap <- numeric(n_stations)
  for (row in seq_len(n_stations)) {
    gaps <- geosphere::distHaversine(stations[row, ], constations)
    nearest_idx[row] <- which.min(gaps)
    nearest_gap[row] <- min(gaps)
  }
  data.frame(ind = nearest_idx, dist = nearest_gap)
}
Demonstration, including bringing the contaminant station into the same frame.
AllStations2 <- cbind(
AllStations,
func2(AllStations[,c("long_coor", "lat_coor")], ContaminantStations[,c("longitude", "latitude")])
)
AllStations2
# site_no lat_coor long_coor ind dist
# 1 02110500 33.91267 -78.71502 1 241971.5
# 2 02110550 33.85083 -78.89722 6 227650.6
# 3 02110701 33.86100 -79.04115 6 214397.8
# 4 02110704 33.83295 -79.04365 6 214847.7
# 5 02110760 33.74073 -78.86669 6 233190.8
# 6 02110777 33.85156 -78.65585 6 249519.7
# 7 021108044 33.65017 -79.12310 6 213299.3
# 8 02110815 33.44461 -79.17393 6 217378.9
AllStations3 <- cbind(
AllStations2,
ContaminantStations[AllStations2$ind,]
)
AllStations3
# site_no lat_coor long_coor ind dist station latitude longitude
# 1 02110500 33.91267 -78.71502 1 241971.5 USGS-02146110 34.88928 -81.06869
# 6 02110550 33.85083 -78.89722 6 227650.6 USGS-021473426 34.24320 -81.31954
# 6.1 02110701 33.86100 -79.04115 6 214397.8 USGS-021473426 34.24320 -81.31954
# 6.2 02110704 33.83295 -79.04365 6 214847.7 USGS-021473426 34.24320 -81.31954
# 6.3 02110760 33.74073 -78.86669 6 233190.8 USGS-021473426 34.24320 -81.31954
# 6.4 02110777 33.85156 -78.65585 6 249519.7 USGS-021473426 34.24320 -81.31954
# 6.5 021108044 33.65017 -79.12310 6 213299.3 USGS-021473426 34.24320 -81.31954
# 6.6 02110815 33.44461 -79.17393 6 217378.9 USGS-021473426 34.24320 -81.31954
From here, you can choose your radius at will:
subset(AllStations3, dist < 230000)
# site_no lat_coor long_coor ind dist station latitude longitude
# 6 02110550 33.85083 -78.89722 6 227650.6 USGS-021473426 34.2432 -81.31954
# 6.1 02110701 33.86100 -79.04115 6 214397.8 USGS-021473426 34.2432 -81.31954
# 6.2 02110704 33.83295 -79.04365 6 214847.7 USGS-021473426 34.2432 -81.31954
# 6.5 021108044 33.65017 -79.12310 6 213299.3 USGS-021473426 34.2432 -81.31954
# 6.6 02110815 33.44461 -79.17393 6 217378.9 USGS-021473426 34.2432 -81.31954

Related

R: Making a More "Efficient" Distance Function

I am working with the R programming language.
I generated the following random data set that contains x and y points:
set.seed(123)
x_cor = rnorm(10,100,100)
y_cor = rnorm(10,100,100)
my_data = data.frame(x_cor,y_cor)
x_cor y_cor
1 43.95244 222.40818
2 76.98225 135.98138
3 255.87083 140.07715
4 107.05084 111.06827
5 112.92877 44.41589
6 271.50650 278.69131
7 146.09162 149.78505
8 -26.50612 -96.66172
9 31.31471 170.13559
10 55.43380 52.72086
I am trying to write a "greedy search" algorithm that shows which point is located the "shortest distance" from some starting point.
For example, suppose we start at -26.50612, -96.66172
#' Euclidean distance between (x1, y1) and (x2, y2)
#'
#' Note the argument order: both x values come first, then both y values.
#' Works element-wise on vectors.
distance <- function(x1, x2, y1, y2) {
  dx <- x1 - x2
  dy <- y1 - y2
  sqrt(dx^2 + dy^2)
}
Then I calculated the distance between -26.50612, -96.66172 and each point :
# Distance from the starting point (-26.50612, -96.66172) to every row of
# my_data, one single-row data frame per point. seq_len(nrow(my_data)) replaces
# the hard-coded 1:10 so the code follows the actual size of the data.
results <- lapply(seq_len(nrow(my_data)), function(i) {
  data.frame(
    distance_i = distance(-26.50612, my_data[i, 1], -96.66172, my_data[i, 2]),
    index = i
  )
})
# Bind all rows once at the end instead of growing a frame inside a loop.
results_df <- do.call(rbind, results)
However, I don't think this is working because the distance between the starting point -26.50612, -96.66172 and itself is not 0 (see 8th row):
distance_i index
1 264.6443 1
2 238.7042 2
3 191.3048 3
4 185.0577 4
5 151.7506 5
6 306.4785 6
7 331.2483 7
8 223.3056 8
9 213.3817 9
10 331.6455 10
My Question:
Can someone please show me how to write a function that correctly finds the nearest point from an initial point
(Step 1) Then removes the nearest point and the initial point from "my_data"
(Step 2) And then re-calculates the nearest point from "my_data" using the nearest point identified Step 1 (i.e. with the removed data)
And in the end, shows the path that was taken (e.g. 5,7,1,9,3, etc)
Can someone please show me how to do this?
Thanks!
This could be helpful and I think you can solve the further tasks by yourself
# Starting point of the search.
start <- c(x = -26.50612, y = -96.66172)
library(dplyr)
# distance() is vectorized, so no rowwise() is needed; skipping it also avoids
# returning a row-wise grouped frame. `[[` drops the element names of `start`.
my_data <- data.frame(x_cor, y_cor) %>%
  mutate(dist = distance(start[["x"]], x_cor, start[["y"]], y_cor))
The solution is implemented as a recursive function distmin, which finds the closest point between an input x and a dataframe Y and then calls itself with the closest point and the dataframe without the closest point as arguments.
EDIT: I reimplemented distmin to use dataframes.
my_data = data.frame(x_cor,y_cor) |>
mutate(idx = row_number())
#' Greedy nearest-neighbour path through a set of points
#'
#' Starting from the single-row frame `x`, repeatedly finds the closest
#' remaining row of `Y` (Euclidean distance on `x_cor`/`y_cor`), records it,
#' and continues from that row with it removed from `Y`.
#'
#' @param x One-row data frame with columns `x_cor` and `y_cor`.
#' @param Y Data frame of candidate points with columns `x_cor`, `y_cor`, `idx`.
#' @return A data frame with one row per visited point: the point's own
#'   columns, `dist` to the next point, and `nearest` (the `idx` of that next
#'   point). `NULL` when `Y` has no rows.
distmin <- function(x, Y) {
  steps <- list()
  current <- x
  remaining <- Y
  while (nrow(remaining) != 0) {
    gaps <- sqrt((current$x_cor - remaining$x_cor)^2 + (current$y_cor - remaining$y_cor)^2)
    pick <- which.min(gaps)
    steps[[length(steps) + 1L]] <- data.frame(
      current,
      dist = gaps[pick],
      nearest = remaining[pick, "idx"]
    )
    current <- remaining[pick, ]
    remaining <- remaining[-pick, ]
  }
  if (length(steps) == 0L) {
    NULL
  } else {
    do.call(rbind, steps)
  }
}
N <- 5
distmin(my_data[N,], my_data[-N,])
##> x_cor y_cor idx dist nearest
##> 5 112.92877 44.41589 5 58.09169 10
##> 10 55.43380 52.72086 10 77.90211 4
##> 4 107.05084 111.06827 4 39.04847 2
##> 2 76.98225 135.98138 2 57.02661 9
##> 9 31.31471 170.13559 9 53.77858 1
##> 1 43.95244 222.40818 1 125.32571 7
##> 7 146.09162 149.78505 7 110.20762 3
##> 3 255.87083 140.07715 3 139.49323 6
##> 6 271.50650 278.69131 6 479.27176 8
The following shows the order in which points are called.
distmin(my_data[N,], my_data[-N,]) |>
mutate(ord = row_number()) |>
ggplot(aes(x = x_cor, y_cor)) +
geom_text(aes(label = ord))

Using approx function within tapply or by in R

I have a temperature profiler (tp) data for date, depth and temperature. The depth for each date is not exactly the same so I need to unify it to the same depth and set the temperature for that depth by linear approximation. I was able to do this with a loop using ‘approx’ function (see first part of the enclosed code). But I know that I should do it better without a loop (considering I will have about 600,000 rows). I tried to do it with ‘by’ function but was not successful transforming the result (list) into a data frame or matrix (see second part of the code).
Keep in mind that length of the rounded depth is not always the same as in the example.
Rounded depth is in the Depth2 column; the interpolated temperature is put in Temp2.
What is the ‘right’ way to solve this?
# create df manually
tp <- data.frame(Date=double(31), Depth=double(31), Temperature=double(31))
tp$Date[1:11] <- '2009-12-17' ; tp$Date[12:22] <- '2009-12-18'; tp$Date[23:31] <- '2009-12-19'
tp$Depth <- c(24.92,25.50,25.88,26.33,26.92,27.41,27.93,28.37,28.82,29.38,29.92,25.07,25.56,26.06,26.54,27.04,27.53,28.03,28.52,29.02,29.50,30.01,25.05,25.55,26.04,26.53,27.02,27.52,28.01,28.53,29.01)
tp$Temperature <- c(19.08,19.06,19.06,18.87,18.67,17.27,16.53,16.43,16.30,16.26,16.22,17.62,17.43,17.11,16.72,16.38,16.28,16.20,16.15,16.13,16.11,16.08,17.54,17.43,17.32,17.14,16.89,16.53,16.28,16.20,16.13)
# create rounded depth column
tp$Depth2 <- round(tp$Depth)
# loop on date to calculate linear approximation for rounded depth
dtgrp <- tp[!duplicated(tp[,1]),1]
for (i in dtgrp) {
x1 <- tp[tp$Date == i, "Depth"]
y1 <- tp[tp$Date == i, "Temperature"]
x2 <- tp[tp$Date == i, "Depth2"]
tpa <- approx(x=x1,y=y1,xout=x2, rule=2)
tp[tp$Date == i, "Temp2"] <- tpa$y
}
# reduce result to rounded depth
tp1 <- tp[!duplicated(tp[,-c(2:3)]),-c(2:3)]
# not part of the question, but the end need is for a matrix, so this complete it:
library(reshape2)
tpbydt <- acast(tp1, Date~Depth2, value.var="Temp2")
# second part: I tried to use the by function (instead of loop) but got lost when tring to convert it to data frame or matrix
#' Linearly interpolate y1 (measured at x1) onto the positions x2
#'
#' Thin wrapper around approx() with rule = 2, so positions outside the range
#' of x1 take the value at the nearest data extreme instead of NA.
#'
#' @return The list returned by approx(), with components `x` and `y`.
rdpth <- function(x1, y1, x2) {
  approx(x = x1, y = y1, xout = x2, rule = 2)
}
tp2 <- by(tp, tp$Date,function(tp) rdpth(tp$Depth,tp$Temperature,tp$Depth2), simplify = TRUE)
Very close with the by call, but remember it returns a list of objects. Therefore, consider building a list of data frames to be row-bound at the very end:
# For each date, interpolate temperature onto the rounded depths and keep one
# row per distinct (Date, Depth2, Temp2) combination.
df_list <- by(tp, tp$Date, function(day) {
  interpolated <- approx(
    x = day$Depth,
    y = day$Temperature,
    xout = day$Depth2,
    rule = 2
  )
  per_depth <- data.frame(
    Date = day$Date,
    Depth2 = day$Depth2,
    Temp2 = interpolated$y,
    stringsAsFactors = FALSE
  )
  unique(per_depth)
})
# Stack the per-date frames; unname() keeps the dates out of the row names.
tp2 <- do.call(rbind, unname(df_list))
tp2
# Date Depth2 Temp2
# 1 2009-12-17 25 19.07724
# 2 2009-12-17 26 19.00933
# 5 2009-12-17 27 18.44143
# 7 2009-12-17 28 16.51409
# 9 2009-12-17 29 16.28714
# 11 2009-12-17 30 16.22000
# 12 2009-12-18 25 17.62000
# 21 2009-12-18 26 17.14840
# 4 2009-12-18 27 16.40720
# 6 2009-12-18 28 16.20480
# 8 2009-12-18 29 16.13080
# 10 2009-12-18 30 16.08059
# 13 2009-12-19 25 17.54000
# 22 2009-12-19 26 17.32898
# 41 2009-12-19 27 16.90020
# 61 2009-12-19 28 16.28510
# 81 2009-12-19 29 16.13146
And if you reset row.names, this is exactly identical to your tp1 output:
identical(data.frame(tp1, row.names = NULL),
data.frame(tp2, row.names = NULL))
# [1] TRUE

transform 2d matrix to 3D matrix

After a simulation I have data like that :
capt2[1,1] capt2[2,1] capt2[3,1] capt2[4,1] capt2[5,1] capt2[6,1] capt2[1,2] capt2[2,2] capt2[3,2] capt2[4,2]
1 4.582288e-05 5.115372e-05 6.409558e-05 7.132340e-05 6.927382e-05 5.727399e-05 2.753242e-05 3.106131e-05 3.832073e-05 4.270945e-05
2 4.675470e-05 5.045181e-05 6.467788e-05 7.112534e-05 6.809241e-05 5.885455e-05 2.789134e-05 3.097479e-05 3.790915e-05 4.176663e-05
3 4.586335e-05 5.127838e-05 6.344857e-05 6.934458e-05 6.622970e-05 5.651329e-05 2.795094e-05 3.120102e-05 3.790188e-05 4.172773e-05
4 4.572750e-05 5.150407e-05 6.333068e-05 7.145439e-05 6.624694e-05 5.836059e-05 2.795106e-05 3.055858e-05 3.826570e-05 4.172327e-05
5 4.740812e-05 5.113890e-05 6.397921e-05 7.163161e-05 6.838507e-05 5.620327e-05 2.790780e-05 3.083819e-05 3.821806e-05 4.198080e-05
6 4.583460e-05 5.106634e-05 6.340507e-05 7.030548e-05 6.886533e-05 5.901374e-05 2.792663e-05 3.136544e-05 3.862876e-05 4.177590e-05
with a length of 40000 lines.
However the [1: 6,] refers to months and the [, 1: x] refers to territories. So I would like to have [, 1: x] columns (in my dataset 28) for [1: 6,] rows and have the length (40000) in the third dimension since these are simulations.
Subsequently, with my 3D table of 6 lines and 28 columns, I would like to do simple operations, such as, for example, a histogram of the 3D values of line 1 / column 1, etc.
edit : "capt2[3,1]" it's just the name of the column in character
Just transform it into an array.
I'll simulate some data to show you how to do this.
set.seed(42)
n <- 10 # `n` in your data would be 40,000
# your rownames
v <- c("capt2[1,1]", "capt2[2,1]", "capt2[3,1]", "capt2[4,1]", "capt2[5,1]", "capt2[6,1]",
"capt2[1,2]", "capt2[2,2]", "capt2[3,2]", "capt2[4,2]", "capt2[5,2]", "capt2[6,2]",
"capt2[1,3]", "capt2[2,3]", "capt2[3,3]", "capt2[4,3]", "capt2[5,3]", "capt2[6,3]")
M <- matrix(rnorm(3*6*n), n, dimnames=list(NULL, v)) # shall symbolize your data
M[1:2, 1:6]
# capt2[1,1] capt2[2,1] capt2[3,1] capt2[4,1] capt2[5,1] capt2[6,1]
# [1,] -0.132088 0.5156677 1.3487070 1.01687283 -0.73844075 0.8131950
# [2,] 1.476787 -0.2343653 -0.0227647 -0.02671746 0.04656394 -0.1908165
Now apply array with the right dimensions and dimnames.
A <- array(as.vector(t(M)), dim=c(6, 3, n),
dimnames=list(paste0("month.", 1:6), paste0("territory.", 1:3), NULL))
A
# , , 1
#
# territory.1 territory.2 territory.3
# month.1 -0.1320880 0.4703934 -1.3870266
# month.2 0.5156677 2.4595935 1.1573471
# month.3 1.3487070 -0.1662615 -0.2901453
# month.4 1.0168728 0.4823695 1.8922020
# month.5 -0.7384408 -0.7848878 -0.2764311
# month.6 0.8131950 1.1454705 -0.3047780
#
# , , 2
#
# territory.1 territory.2 territory.3
# month.1 1.47678742 -1.24267027 -1.3066759
# month.2 -0.23436528 -0.81838032 -1.6824809
# month.3 -0.02276470 0.86256338 0.8285461
# month.4 -0.02671746 0.99294364 -1.3859983
# month.5 0.04656394 0.16341632 -1.1094188
# month.6 -0.19081647 0.03157319 0.5978327
#
# , , 3
#
# territory.1 territory.2 territory.3
# month.1 -0.2170302 1.38157546 -0.76839533
# month.2 -0.6585034 -2.11320011 0.08731909
# month.3 0.2442259 0.09734049 -0.29122771
# month.4 0.7036078 -1.24639550 -0.41482430
# month.5 -1.0175961 -1.23671424 0.13386932
# month.6 -2.6999298 -0.83520581 1.39742941
[...]

Extract time series from netCDF in R

I created this file
using TRMM_3B42_Daily product over 1998-01-01 to 1998-12-31. This is the script I used in R:
lon=seq(-91.875,-86.875,by= 0.25)
lat=seq(13.875,16.875,by= 0.25)
x_dim <- ncdim_def( "lon", "degrees_east", lon, create_dimvar=TRUE)
y_dim <- ncdim_def( "lat", "degrees_north", lat, create_dimvar=TRUE)
t_dim <- ncdim_def( "time", "days since 1997-12-31 12:00:00.0 -0:00", 1:365, unlim=FALSE)
mv=9999.900390625
precipitation_var <- ncvar_def("precipitation", "mm", list(y_dim,x_dim,t_dim), mv)
nrow = 13
ncol = 21
NA.matrix=matrix(rep(NA,nrow*ncol))
precip=array(NA.matrix,c(nrow,ncol, 1))
for (i in 1:length(test01)){precip_nc=nc_open(test01[i])
precip_get_nc=ncvar_get(precip_nc,"precipitation")
precip=abind(precip,precip_get_nc)}
precip=precip[,,-1]
PRECIPITATION_nc = nc_create("PRECIPITATION_1998.nc", precipitation_var)
precipitation_nc_put=ncvar_put (PRECIPITATION_nc, precipitation_var, precip)
nc_close(PRECIPITATION_nc)
Following this link I tried extracting the values in order to plot a time series but it seems I am averaging the values of two cells instead of just extracting the values of a single cell. How do I fix this? Is there a way to create a loop so that it extracts the values of different cells? (in this case it would be 13 x 21 = 273)
b <- brick('PRECIPITATION_1998.nc')
be <- crop(b, extent(13.875, 14.125, -91.875,-91.625))
a <- aggregate(be, dim(be)[2:1], na.rm=TRUE)
v <- values(a)
write.csv(v, 'precip.csv', row.names=FALSE)
Also, two other problems I found were that the dates in the excel file have an X in front and that the values are shown horizontally instead of vertically. Any help would be greatly appreciated!! Thanks
extraction of points data can be easily accomplished by creating a SpatialPoints object containing the point from which you want to extract data, followed by an extract operation.
Concerning the other topics: The "X"s are added because column names cannot start with numerals, so a character is added. The horizontal ordering can be easily changed after extraction with some transposing.
This, for example, should work (It solves also the "X"s problem and changes the format to "column like"):
library(raster)
library(stringr)
library(lubridate)
library(tidyverse)
b <- brick('/home/lb/Temp/buttami/PRECIPITATION_1998.nc')
lon <- c(-91.875, -91.625) # Array of x coordinates
lat <- c(13.875, 14.125) # Array of y coordinates
# NOTE(review): the columns are passed as (lat, lon), so lat is used as the
# first coordinate -- presumably to match the dimension order this file was
# written with; confirm against the brick's extent.
points <- SpatialPoints(cbind(lat, lon)) # Build a spPoints object
# Extract and tidy
points_data <- b %>%
  raster::extract(points, df = TRUE) %>%
  gather(date, value, -ID) %>%
  spread(ID, value) %>% # Can be skipped if you want a "long" table
  mutate(date = ymd(str_sub(names(b), 2))) %>% # drop the leading "X" from layer names
  as_tibble()
points_data
# A tibble: 365 × 3
date `1` `2`
<date> <dbl> <dbl>
1 1998-01-01 0 0
2 1998-01-02 0 0
3 1998-01-03 0 0
4 1998-01-04 0 0
5 1998-01-05 0 0
6 1998-01-06 0 0
7 1998-01-07 0 0
8 1998-01-08 0 0
9 1998-01-09 0 0
10 1998-01-10 0 0
# ... with 355 more rows
plot(points_data$date,points_data$`1`)

How can implement a function?

I have two data files as below:
head (RNA)
Gene_ID chr start end
1 ENSG00000000003.1 X 99883667 99884983
2 ENSG00000000003.2 X 99885756 99885863
3 ENSG00000000003.3 X 99887482 99887565
4 ENSG00000000003.4 X 99888402 99888536
5 ENSG00000000003.5 X 99888928 99889026
6 ENSG00000000003.6 X 99890175 99890249
head(snp)
chr start end SNP_No
1 1 58812 58812 SNP_1
2 1 67230 67230 SNP_2
3 1 79529 79529 SNP_3
4 1 79595 79595 SNP_4
5 1 85665 85665 SNP_5
6 1 86064 86064 SNP_6
I would like to find overlap between snp file and RNA file, so I used GenomicRanges R package and I have done below commands:
gr_RNA <- GRanges(seqnames=RNA$chr,IRanges(start=RNA$start,end=RNA$end,names=RNA$Gene_ID))
gr_SNP <- GRanges(seqnames=SNP$chr, IRanges(start=SNP$start,end=SNP$end,names=SNP$SNP_No))
overlaps <- findOverlaps(gr_RNA, gr_SNP)
subsetByOver <- subsetByOverlaps(gr_RNA, gr_SNP)
match_hit <- data.frame(names(gr_RNA)[queryHits(overlaps)],names(gr_SNP)[subjectHits(overlaps)],stringsAsFactors=F)
names(match_hit) <- c('Gene_ID','SNP')
head(match_hit)
Gene_ID SNP
1 ENSG00000000457.1 SNP_307301
2 ENSG00000000457.2 SNP_307307
3 ENSG00000000457.11 SNP_307365
4 ENSG00000000457.12 SNP_307387
5 ENSG00000000460.1 SNP_306845
6 ENSG00000000460.1 SNP_306846
dim(match_hit)
[1] 12287 2
Then I expanded distance for start and end position from RNA file ("start-100" and "end+100")and run scripts again as below:
gr_RNA1 <- GRanges(seqnames=RNA$chr, IRanges(start=(RNA$start)-100, end=(RNA$end)+100, names=RNA$Gene_ID))
overlaps <- findOverlaps(gr_RNA1, gr_SNP)
subsetByOver<-subsetByOverlaps(gr_RNA1, gr_SNP)
match_hit1 <- data.frame(names(gr_RNA1)[queryHits(overlaps)],names(gr_SNP)[subjectHits(overlaps)],stringsAsFactors=F)
dim(match_hit1)
[1] 17976 2
Now, I want to implement a function which takes the RNA table, the SNP table, and the expand distance, then give me final results.
Functions in R are defined like this:
myFunction <- function(parameters) {
#function Code
return(result)
}
see also

Resources