Can't re-project precipitation data from Stereographic to PlateCarree() using Cartopy

I am trying to plot precipitation data from the National Weather Service. However, the data is by default in a stereographic projection. I'd like to plot in a PlateCarree projection, but I am having some difficulties. When I try to use the PlateCarree projection in Cartopy, it plots the map but will not overlay the precipitation data. I'm assuming this means I am not properly re-projecting the data from stereographic to PlateCarree. Is there anything specific I need to do in order to re-project the data correctly?
Here is the code that works with the stereographic projection:
"""
=====================
NWS Precipitation Map
=====================
Plot a 1-day precipitation map using a netCDF file from the National Weather Service.
This opens the data directly in memory using the support in the netCDF library to open
from an existing memory buffer. In addition to CartoPy and Matplotlib, this uses
a custom colortable as well as MetPy's unit support.
"""
###############################
# Imports
from datetime import datetime, timedelta
from urllib.request import urlopen
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from metpy.plots import USCOUNTIES
from metpy.units import masked_array, units
from netCDF4 import Dataset
import pandas as pd
###############################
# Download the data from the National Weather Service.
dt = datetime.utcnow() - timedelta(days=1) # This should always be available
url = 'http://water.weather.gov/precip/downloads/{dt:%Y/%m/%d}/nws_precip_1day_'\
'{dt:%Y%m%d}_conus.nc'.format(dt=dt)
data = urlopen(url).read()
nc = Dataset('data', memory=data)
###############################
# Pull the needed information out of the netCDF file
prcpvar = nc.variables['observation']
data = masked_array(prcpvar[:], units(prcpvar.units.lower())).to('in')
#data = data * 0.0393
x = nc.variables['x'][:]
y = nc.variables['y'][:]
proj_var = nc.variables[prcpvar.grid_mapping]
#%%
###############################
# Set up the projection information within CartoPy
globe = ccrs.Globe(semimajor_axis=proj_var.earth_radius)
proj = ccrs.Stereographic(central_latitude=90.0,
central_longitude=proj_var.straight_vertical_longitude_from_pole,
true_scale_latitude=proj_var.standard_parallel, globe=globe)
###############################
# Create the figure and plot the data
# create figure and axes instances
fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection=proj)
#ax.set_extent([-75,-85,35,39])
#draw coastlines, state and country boundaries, edge of map.
ax.coastlines(resolution='10m')
ax.add_feature(cfeature.BORDERS.with_scale('10m'), linewidth=1.5)
ax.add_feature(cfeature.STATES.with_scale('10m'), linewidth=2.0)
ax.add_feature(USCOUNTIES.with_scale('500k'), edgecolor='black')
# draw filled contours.
clevs = [0.01, 0.1, 0.25, 0.50, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0,
6.0, 8.0, 10., 20.0]
# In future MetPy
# norm, cmap = ctables.registry.get_with_boundaries('precipitation', clevs)
cmap_data = [
"#04e9e7", # 0.01 - 0.10 inches
"#019ff4", # 0.10 - 0.25 inches
"#0300f4", # 0.25 - 0.50 inches
"#02fd02", # 0.50 - 0.75 inches
"#01c501", # 0.75 - 1.00 inches
"#008e00", # 1.00 - 1.50 inches
"#fdf802", # 1.50 - 2.00 inches
"#e5bc00", # 2.00 - 2.50 inches
"#fd9500", # 2.50 - 3.00 inches
"#fd0000", # 3.00 - 4.00 inches
"#d40000", # 4.00 - 5.00 inches
"#bc0000", # 5.00 - 6.00 inches
"#f800fd", # 6.00 - 8.00 inches
"#9854c6", # 8.00 - 10.00 inches
"#fdfdfd" # 10.00+
]
cmap = mcolors.ListedColormap(cmap_data, 'precipitation')
norm = mcolors.BoundaryNorm(clevs, cmap.N)
cs = ax.contourf(x, y, data, clevs, alpha = 0.5, cmap=cmap, norm=norm)
# add colorbar.
cbar = plt.colorbar(cs, orientation='vertical')
cbar.set_label(data.units)
time = nc.creation_time[4:6]+'/'+nc.creation_time[6:8]+'/'+nc.creation_time[0:4]+' '\
+nc.creation_time[9:11] +':'+ nc.creation_time[11:13] + " UTC"
print(time)
ax.set_title('24 hr Precipitation (in)' + '\n for period ending ' + time, fontsize = 16, fontweight = 'bold' )
However, when I change the projection lines to PlateCarree I run into the issue described above. Does anyone have any advice on how to re-project this data?
Thanks

Just add the transform parameter to the ax.contourf call: the axes are created in PlateCarree, and transform=proj tells Cartopy which projection the x/y data are actually in.
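More generally, in Cartopy the projection= of the axes controls how the map is drawn, while transform= on each plotting call declares the CRS the data coordinates are in; Cartopy then re-projects on the fly. A minimal, self-contained sketch of that pattern (dummy stereographic grid, not the NWS data):
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
import numpy as np
# CRS the (fake) x/y grid is defined in
data_crs = ccrs.Stereographic(central_latitude=90.0, central_longitude=-105.0)
# CRS the map is drawn in
ax = plt.axes(projection=ccrs.PlateCarree())
ax.coastlines()
ax.set_extent([-128, -65, 25, 52])
# fake field on a stereographic x/y grid (metres), purely for illustration
x = np.linspace(-4.0e6, 4.0e6, 60)
y = np.linspace(-6.0e6, -2.0e6, 60)
z = np.random.rand(60, 60)
ax.contourf(x, y, z, alpha=0.5, transform=data_crs)  # note transform=, not projection=
plt.show()
The full, modified script: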
# Imports
from datetime import datetime, timedelta
from urllib.request import urlopen
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
from metpy.plots import USCOUNTIES
from metpy.units import masked_array, units
from netCDF4 import Dataset
import pandas as pd
###############################
# Download the data from the National Weather Service.
dt = datetime.utcnow() - timedelta(days=1) # This should always be available
url = 'http://water.weather.gov/precip/downloads/{dt:%Y/%m/%d}/nws_precip_1day_'\
'{dt:%Y%m%d}_conus.nc'.format(dt=dt)
data = urlopen(url).read()
nc = Dataset('data', memory=data)
###############################
# Pull the needed information out of the netCDF file
prcpvar = nc.variables['observation']
data = masked_array(prcpvar[:], units(prcpvar.units.lower())).to('in')
#data = data * 0.0393
x = nc.variables['x'][:]
y = nc.variables['y'][:]
proj_var = nc.variables[prcpvar.grid_mapping]
#%%
###############################
# Set up the projection information within CartoPy
globe = ccrs.Globe(semimajor_axis=proj_var.earth_radius)
proj = ccrs.Stereographic(central_latitude=90.0,
central_longitude=proj_var.straight_vertical_longitude_from_pole,
true_scale_latitude=proj_var.standard_parallel, globe=globe)
###############################
# Create the figure and plot the data
# create figure and axes instances
fig = plt.figure(figsize=(15, 15))
pc_proj = ccrs.PlateCarree()
#ax = fig.add_subplot(111, projection=proj)
ax = fig.add_subplot(111, projection=pc_proj)
ax.set_extent([-128,-65,25,52])
#draw coastlines, state and country boundaries, edge of map.
ax.coastlines(resolution='10m')
#ax.add_feature(cfeature.BORDERS.with_scale('10m'), linewidth=1.5)
#ax.add_feature(cfeature.STATES.with_scale('10m'), linewidth=2.0)
#ax.add_feature(USCOUNTIES.with_scale('500k'), edgecolor='black')
# draw filled contours.
clevs = [0.01, 0.1, 0.25, 0.50, 0.75, 1.0, 1.5, 2.0, 2.5, 3.0, 4.0, 5.0,
6.0, 8.0, 10., 20.0]
# In future MetPy
# norm, cmap = ctables.registry.get_with_boundaries('precipitation', clevs)
cmap_data = [
"#04e9e7", # 0.01 - 0.10 inches
"#019ff4", # 0.10 - 0.25 inches
"#0300f4", # 0.25 - 0.50 inches
"#02fd02", # 0.50 - 0.75 inches
"#01c501", # 0.75 - 1.00 inches
"#008e00", # 1.00 - 1.50 inches
"#fdf802", # 1.50 - 2.00 inches
"#e5bc00", # 2.00 - 2.50 inches
"#fd9500", # 2.50 - 3.00 inches
"#fd0000", # 3.00 - 4.00 inches
"#d40000", # 4.00 - 5.00 inches
"#bc0000", # 5.00 - 6.00 inches
"#f800fd", # 6.00 - 8.00 inches
"#9854c6", # 8.00 - 10.00 inches
"#fdfdfd" # 10.00+
]
cmap = mcolors.ListedColormap(cmap_data, 'precipitation')
norm = mcolors.BoundaryNorm(clevs, cmap.N)
#cs = ax.contourf(x, y, data, clevs, alpha = 0.5, cmap=cmap, norm=norm)
# add transform args
cs = ax.contourf(x, y, data, clevs, alpha = 0.5, cmap=cmap, norm=norm,transform=proj)
# add colorbar.
cbar = plt.colorbar(cs, orientation='vertical')
cbar.set_label(data.units)
time = nc.creation_time[4:6]+'/'+nc.creation_time[6:8]+'/'+nc.creation_time[0:4]+' '\
+nc.creation_time[9:11] +':'+ nc.creation_time[11:13] + " UTC"
print(time)
ax.set_title('24 hr Precipitation (in)' + '\n for period ending ' + time, fontsize = 16, fontweight = 'bold' )
plt.savefig("prec_usa_pc.png")
#plt.show()
Below is the output fig.
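If you would rather re-project the data itself (instead of letting contourf transform it on the fly), a hedged sketch is to convert the stereographic grid to lon/lat once with Cartopy's transform_points, reusing x, y, data, clevs, cmap, norm, proj and ax from the script above:
import numpy as np
# 2-D grid in the stereographic (proj) coordinates
xx, yy = np.meshgrid(x, y)
# transform_points returns an (ny, nx, 3) array of lon, lat, z
pts = ccrs.PlateCarree().transform_points(proj, xx, yy)
lons, lats = pts[..., 0], pts[..., 1]
cs = ax.contourf(lons, lats, data, clevs, alpha=0.5, cmap=cmap, norm=norm,
                 transform=ccrs.PlateCarree())
Either way the data should line up with the PlateCarree map; the transform= route is simply less code.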

Related

Clustering longitudinal data with multiple variables in R

I have a dataset that contains the observations of 30 people and each of them had done 20 experiments. Suppose my data looks like this:
ID trial reaction response prop_1 prop_2
"s1" 1 2.12 0 0.52 0.48
"s1" 2 1.32 1 0.12 0.88
"s1" 3 NA 1 NA NA
"s2" 1 2.33 1 0.65 0.35
"s2" 2 2.56 0 0.43 0.57
"s2" 3 NA 1 NA NA
I want to cluster the participants using these variables. I studied the traj, latrend and kml packages, but all of them use just one variable to cluster the data. How can I use multiple variables to cluster longitudinal data like this?
Any simple help or guidance would be appreciated.
Here is one way to do it.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns; sns.set()
import csv
df = pd.read_csv('C:\\business.csv')
df.dropna(axis=0,how='any',subset=['latitude','longitude'],inplace=True)
K_clusters = range(1,10)
kmeans = [KMeans(n_clusters=i) for i in K_clusters]
Y_axis = df[['latitude']]
X_axis = df[['longitude']]
score = [kmeans[i].fit(Y_axis).score(Y_axis) for i in range(len(kmeans))]# Visualize
plt.plot(K_clusters, score)
plt.xlabel('Number of Clusters')
plt.ylabel('Score')
plt.title('Elbow Curve')
plt.show()
X = df[['longitude', 'latitude']].copy()
kmeans = KMeans(n_clusters = 5, init ='k-means++')
kmeans.fit(X[X.columns[1:2]]) # Compute k-means clustering
X['cluster_label'] = kmeans.fit_predict(X[X.columns[1:3]])
centers = kmeans.cluster_centers_ # Coordinates of cluster centers
labels = kmeans.predict(X[X.columns[1:2]]) # Labels of each point
X.head(10)
X.plot.scatter(x = 'latitude', y = 'longitude', c=labels, s=50, cmap='viridis')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, alpha=0.5)
Here's another idea.
# import necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from collections import Counter
df = pd.read_csv('C:\\properties_2017.csv')
# df.head(10)
df = df.head(10000)
list(df)
df.shape
df = df.sample(frac=0.2, replace=True, random_state=1)
df.shape
df = df.fillna(0)
df.isna().sum()
df['regionidzip']=df['regionidzip'].fillna(97000)
df.dropna(axis=0,how='any',subset=['latitude','longitude'],inplace=True)
X=df.loc[:,['latitude','longitude']]
zp=df.regionidzip
id_n=8
kmeans = KMeans(n_clusters=id_n, random_state=0).fit(X)
id_label=kmeans.labels_
#plot result
ptsymb = np.array(['b.','r.','m.','g.','c.','k.','b*','r*','m*','r^']);
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=12)
plt.xlabel('Latitude', fontsize=12)
for i in range(id_n):
    cluster = np.where(id_label == i)[0]
    plt.plot(X.latitude[cluster].values, X.longitude[cluster].values, ptsymb[i])
plt.show()
#revise the clustering based on zipcode
uniq_zp=np.unique(zp)
for i in uniq_zp:
    a = np.where(zp == i)[0]
    c = Counter(id_label[a])
    id_label[a] = c.most_common(1)[0][0]
#plot result (revised)
plt.figure(figsize=(12,12))
plt.ylabel('Longitude', fontsize=12)
plt.xlabel('Latitude', fontsize=12)
for i in range(id_n):
    cluster = np.where(id_label == i)[0]
    plt.plot(X.latitude[cluster].values, X.longitude[cluster].values, ptsymb[i])
plt.show()
https://www.kaggle.com/xxing9703/kmean-clustering-of-latitude-and-longitude?select=zillow_data_dictionary.xlsx
https://www.kaggle.com/c/zillow-prize-1/data
Also, check this out.
https://towardsdatascience.com/clustering-geospatial-data-f0584f0b04ec
https://raw.githubusercontent.com/mdipietro09/DataScience_ArtificialIntelligence_Utils/master/machine_learning/data_stores.csv
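Coming back to the original question, which has several variables per subject measured across trials rather than lat/lon points, one common approach is to reshape each subject's series into a single feature vector and cluster those vectors. Below is a minimal, hedged Python sketch; the file name is hypothetical, and the column names (ID, trial, reaction, prop_1, prop_2), the mean imputation and k = 3 are illustrative assumptions:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# hypothetical CSV with the columns shown in the question
df = pd.read_csv("trials.csv")
# wide format: one row per subject, one column per (variable, trial) pair
wide = df.pivot_table(index="ID", columns="trial",
                      values=["reaction", "prop_1", "prop_2"])
# simple imputation for missing trials (illustrative only)
wide = wide.fillna(wide.mean())
# put all variables on a common scale, then k-means on the subject vectors
X = StandardScaler().fit_transform(wide)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
clusters = pd.Series(labels, index=wide.index, name="cluster")
print(clusters)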

Using xarray interp to reproject a dataarray?

I have looked a lot at the xarray documentation for the interp function and I cannot really make sense of it. I can see it could be used for reprojection, but the examples don't really cover a real-world case.
Could someone make sense of it for me, for example by reprojecting this dataset onto Web Mercator?
Something like the example:
import numpy as np
import matplotlib.pyplot as plt
import xarray as xr
from pyproj import Transformer
ds = xr.tutorial.open_dataset("air_temperature").isel(time=0)
fig, axes = plt.subplots(ncols=2, figsize=(10, 4))
lon, lat = np.meshgrid(ds.lon, ds.lat)
shp = lon.shape
# reproject the grid
gcs_to_3857 = Transformer.from_crs(4326, 3857, always_xy=True)
x, y = gcs_to_3857.transform(lon.ravel(), lat.ravel())
# future index for a regular raster
X= np.linspace(x.min(), x.max(), shp[1])
Y= np.linspace(y.min(), y.max(), shp[0])
ds["x"] = xr.DataArray(np.reshape(x, shp), dims=("lat", "lon"))
ds["y"] = xr.DataArray(np.reshape(y, shp), dims=("lat", "lon"))
And here, I am stuck
It should be something like ds.interp(x=X, y=Y), but the array is indexed on lat/lon.
It is all a bit confusing to me...
You could also use reproject from rioxarray as suggested.
Here is the code:
import xarray as xr
import numpy as np
import rioxarray  # registers the .rio accessor
from rasterio.enums import Resampling
import matplotlib.pyplot as plt
ds = xr.tutorial.open_dataset('air_temperature').isel(time=0)
ds = ds.rio.write_crs('EPSG:4326')
dst = ds.rio.reproject('EPSG:3857', shape=(250, 250), resampling=Resampling.bilinear, nodata=np.nan)
fig, ax = plt.subplots(1, 2, figsize=(14, 5))
ds.air.plot(ax=ax[0])
dst.air.plot(ax=ax[1])
I think the idea is something like this:
In [1]: import xarray as xr
...: import numpy as np
...: from pyproj import Transformer
...:
...: ds = xr.tutorial.open_dataset("air_temperature").isel(time=0)
design a target grid in transformed space:
In [2]: # find the new bounds in mercator space
...: gcs_to_3857 = Transformer.from_crs(4326, 3857, always_xy=True)
...: x, y = gcs_to_3857.transform(
...: [ds.lon.min(), ds.lon.max()],
...: [ds.lat.min(), ds.lat.max()],
...: )
In [3]: # design a target grid for the re-projected data - can be any dims you want
...: X = np.linspace(x[0], x[1], 500)
...: Y = np.linspace(y[0], y[1], 600)
...: XX, YY = np.meshgrid(X, Y)
Transform this grid back into lat/lon
In [4]: # build a reverse transformer from Mercator back to lat/lons
...: merc_to_latlng = Transformer.from_crs(3857, 4326, always_xy=True)
...: new_lons, new_lats = merc_to_latlng.transform(XX.ravel(), YY.ravel())
...: new_lons = new_lons.reshape(XX.shape)
...: new_lats = new_lats.reshape(YY.shape)
Create new DataArrays to index the lat/lon values corresponding to the grid points on the target grid (indexed by x and y in Mercator space):
In [5]: # create arrays indexed by (x, y); also convert lons back to (0, 360)
...: new_lons_da = xr.DataArray((new_lons % 360), dims=["y", "x"], coords=[Y, X])
...: new_lats_da = xr.DataArray(new_lats, dims=["y", "x"], coords=[Y, X])
Use xarray's advanced indexing to interpolate the data to the new points while re-indexing onto the new grid
In [6]: ds_mercator = ds.interp(lon=new_lons_da, lat=new_lats_da, method="linear")
Now the data is indexed by x and y, with points equally spaced in Mercator space:
In [7]: ds_mercator
Out[7]:
<xarray.Dataset>
Dimensions: (y: 600, x: 500)
Coordinates:
time datetime64[ns] 2013-01-01
lon (y, x) float64 200.0 200.3 200.5 200.8 ... 329.2 329.5 329.7 330.0
lat (y, x) float64 15.0 15.0 15.0 15.0 15.0 ... 75.0 75.0 75.0 75.0
* y (y) float64 1.689e+06 1.708e+06 1.727e+06 ... 1.291e+07 1.293e+07
* x (x) float64 -1.781e+07 -1.778e+07 ... -3.369e+06 -3.34e+06
Data variables:
air (y, x) float64 nan nan nan nan nan ... 237.3 237.6 238.0 238.3 nan
Attributes:
Conventions: COARDS
title: 4x daily NMC reanalysis (1948)
description: Data is from NMC initialized reanalysis\n(4x/day). These a...
platform: Model
references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
The new projection can be seen in the axes (and distortion) of the transformed (right) as compared to the original (left) datasets:
In [8]: fig, axes = plt.subplots(1, 2, figsize=(14, 5))
...: ds.air.plot(ax=axes[0])
...: ds_mercator.air.plot(ax=axes[1])
Out[8]: <matplotlib.collections.QuadMesh at 0x2b3b94be0>

R: interpolate a value from dataframe based on two inputs

I have a data frame that looks like this:
Teff logg M_div_H U B V R I J H K L Lprime M
1 2000 4.0 -0.1 -13.443 -11.390 -7.895 -4.464 -1.831 1.666 3.511 2.701 4.345 4.765 5.680
2 2000 4.5 -0.1 -13.402 -11.416 -7.896 -4.454 -1.794 1.664 3.503 2.728 4.352 4.772 5.687
3 2000 5.0 -0.1 -13.358 -11.428 -7.888 -4.431 -1.738 1.664 3.488 2.753 4.361 4.779 5.685
4 2000 5.5 -0.1 -13.220 -11.079 -7.377 -4.136 -1.483 1.656 3.418 2.759 4.355 4.753 5.638
5 2200 3.5 -0.1 -11.866 -9.557 -6.378 -3.612 -1.185 1.892 3.294 2.608 3.929 4.289 4.842
6 2200 4.5 -0.1 -11.845 -9.643 -6.348 -3.589 -1.132 1.874 3.310 2.648 3.947 4.305 4.939
...
Let's say I have two values:
input_Teff = 4.8529282904170595E+003
input_log_g = 1.9241934741026787E+000
Notice how every V value has a unique Teff, logg combination. From the input values, I would like to interpolate a value for V. Is there a way to do this in R?
Edit 1: Here is the link to the full data frame: https://www.dropbox.com/s/prbceabxmd25etx/lcb98cor.dat?dl=0
Building on Ian Campbell's observation that you can consider your data as points on a two-dimensional plane, you can use spatial interpolation methods. The simplest approach is inverse-distance weighting, which you can implement like this
library(data.table)
d <- fread("https://www.dropbox.com/s/prbceabxmd25etx/lcb98cor.dat?dl=1")
setnames(d,"#Teff","Teff")
First rescale the data as appropriate (not shown here, see Ian's answer)
library(gstat)
# fit model
idw <- gstat(id="V", formula = V~1, locations = ~Teff+logg, data=d, nmax=7, set=list(idp = .5))
# new "points" to predict to
newd <- data.frame(Teff=c(4100, 4852.928), logg=c(1.5, 1.9241934741026787))
p <- predict(idw, newd)
#[inverse distance weighted interpolation]
p$V.pred
#[1] -0.9818571 -0.3602857
For higher dimensions you could use fields::Tps (I think you can force that to be an exact method, that is, exactly honor the observations, by making each observation a node)
We can imagine that Teff and logg exist in a 2-dimensional plane. We can see that your input point exists in that same space:
library(tidyverse)
ggplot(data,aes(x = Teff, y = logg)) +
geom_point() +
geom_point(data = data.frame(Teff = 4.8529282904170595e3, logg = 1.9241934741026787),
color = "orange")
However, we can see the scale of Teff and logg are not the same. Simply taking log(Teff) gets us pretty close, but not quite. So we can rescale between 0 and 1 instead. We can create a custom rescale function. It will become clear why we can't use scales::rescale in a moment.
rescale = function(x,y){(x - min(y))/(max(y)-min(y))}
We can now rescale the data:
data %>%
mutate(Teff.scale = rescale(Teff,Teff),
logg.scale = rescale(logg,logg)) -> data
From here, we might use raster::pointDistance to calculate the distance from the input point to all of the scaled values:
raster::pointDistance(cbind(rescale(input_Teff,data$Teff),rescale(input_log_g,data$logg)),
data[,c("Teff.scale","logg.scale")],
lonlat = FALSE)
We can use which.min to find the row with the minimum distance:
data[which.min(raster::pointDistance(cbind(rescale(input_Teff,data$Teff),rescale(input_log_g,data$logg)),
data[,c("Teff.scale","logg.scale")],
lonlat = FALSE)),]
Teff logg M_div_H U B V R I J H K L Lprime M Teff.scale logg.scale
1: 4750 2 -0.1 -2.447 -1.438 -0.355 0.159 0.589 1.384 1.976 1.881 2.079 2.083 2.489 0.05729167 0.4631902
Here we can visualize the result:
ggplot(data,aes(x = Teff.scale, y = logg.scale)) +
geom_point() +
geom_point(data = data[which.min(raster::pointDistance(cbind(rescale(input_Teff,data$Teff),rescale(input_log_g,data$logg)),data[,c("Teff.scale","logg.scale")], FALSE)),],
color = "blue") +
geom_point(data = data.frame(Teff.scale = rescale(input_Teff,data$Teff),logg.scale = rescale(input_log_g,data$logg)),
color = "orange")
And access the appropriate value for V:
data[which.min(raster::pointDistance(cbind(rescale(input_Teff,data$Teff),rescale(input_log_g,data$logg)),data[,c("Teff.scale","logg.scale")], FALSE)),"V"]
V
1: -0.355
Data:
library(data.table)
data <- fread("https://www.dropbox.com/s/prbceabxmd25etx/lcb98cor.dat?dl=1")
setnames(data,"#Teff","Teff")
input_Teff = 4.8529282904170595E+003
input_log_g = 1.9241934741026787E+000

stat_density2d - What does the legend mean?

I have a map done in R with stat_density2d. This is the code:
ggplot(data, aes(x=Lon, y=Lat)) +
stat_density2d(aes(fill = ..level..), alpha=0.5, geom="polygon",show.legend=FALSE)+
geom_point(colour="red")+
geom_path(data=map.df,aes(x=long, y=lat, group=group), colour="grey50")+
scale_fill_gradientn(colours=rev(brewer.pal(7,"Spectral")))+
xlim(-10,+2.5) +
ylim(+47,+60) +
coord_fixed(1.7) +
theme_void()
And it produces this:
Great. It works. However I do not know what the legend means. I did find this wikipedia page:
https://en.wikipedia.org/wiki/Multivariate_kernel_density_estimation
And the example they used (which contains red, orange and yellow) stated:
The coloured contours correspond to the smallest region which contains
the respective probability mass: red = 25%, orange + red = 50%, yellow
+ orange + red = 75%
However, using stat_density2d, I have 11 contours in my map. Does anyone know how stat_density2d works and what the legend means? Ideally I wanted to be able to state something like the red contour contains 25% of the plots etc.
I have read this: https://ggplot2.tidyverse.org/reference/geom_density_2d.html and I am still none the wiser.
Let's take the faithful example from ggplot2:
ggplot(faithful, aes(x = eruptions, y = waiting)) +
stat_density_2d(aes(fill = factor(stat(level))), geom = "polygon") +
geom_point() +
xlim(0.5, 6) +
ylim(40, 110)
(apologies in advance for not making this prettier)
The level is the height at which the 3D "mountains" were sliced. I don't know of a way (others might) to translate that directly into a percentage, but I do know how to compute the percentages you're after.
If we look at that chart, level 0.002 contains the vast majority of the points (all but 2). Level 0.004 is actually 2 polygons, and together they contain all but about a dozen of the points. If I'm getting the gist of what you're asking, that's what you want to know, except as the percentage (rather than the count) of points encompassed by the polygons at a given level. That's straightforward to compute using the methodology from the various ggplot2 "stats" involved.
Note that while we're importing the tidyverse and sp packages we'll use some other functions fully-qualified. Now, let's reshape the faithful data a bit:
library(tidyverse)
library(sp)
xdf <- select(faithful, x = eruptions, y = waiting)
(easier to type x and y)
Now, we'll compute the two-dimensional kernel density estimation the way ggplot2 does:
h <- c(MASS::bandwidth.nrd(xdf$x), MASS::bandwidth.nrd(xdf$y))
dens <- MASS::kde2d(
xdf$x, xdf$y, h = h, n = 100,
lims = c(0.5, 6, 40, 110)
)
zdf <- data.frame(expand.grid(x = dens$x, y = dens$y), z = as.vector(dens$z))
breaks <- pretty(range(zdf$z), 10)
z <- tapply(zdf$z, zdf[c("x", "y")], identity)
cl <- grDevices::contourLines(
x = sort(unique(dens$x)), y = sort(unique(dens$y)), z = dens$z,
levels = breaks
)
I won't clutter the answer with str() output but it's kinda fun looking at what happens there.
We can use spatial ops to figure out how many points fall within given polygons, then we can group the polygons at the same level to provide counts and percentages per-level:
SpatialPolygons(
lapply(1:length(cl), function(idx) {
Polygons(
srl = list(Polygon(
matrix(c(cl[[idx]]$x, cl[[idx]]$y), nrow=length(cl[[idx]]$x), byrow=FALSE)
)),
ID = idx
)
})
) -> cont
coordinates(xdf) <- ~x+y
data_frame(
ct = sapply(over(cont, geometry(xdf), returnList = TRUE), length),
id = 1:length(ct),
lvl = sapply(cl, function(x) x$level)
) %>%
count(lvl, wt=ct) %>%
mutate(
pct = n/length(xdf),
pct_lab = sprintf("%s of the points fall within this level", scales::percent(pct))
)
## # A tibble: 12 x 4
## lvl n pct pct_lab
## <dbl> <int> <dbl> <chr>
## 1 0.002 270 0.993 99.3% of the points fall within this level
## 2 0.004 259 0.952 95.2% of the points fall within this level
## 3 0.006 249 0.915 91.5% of the points fall within this level
## 4 0.008 232 0.853 85.3% of the points fall within this level
## 5 0.01 206 0.757 75.7% of the points fall within this level
## 6 0.012 175 0.643 64.3% of the points fall within this level
## 7 0.014 145 0.533 53.3% of the points fall within this level
## 8 0.016 94 0.346 34.6% of the points fall within this level
## 9 0.018 81 0.298 29.8% of the points fall within this level
## 10 0.02 60 0.221 22.1% of the points fall within this level
## 11 0.022 43 0.158 15.8% of the points fall within this level
## 12 0.024 13 0.0478 4.8% of the points fall within this level
I only spelled it out to avoid blathering more but the percentages will change depending on how you modify the various parameters to the density computation (same holds true for my ggalt::geom_bkde2d() which uses a different estimator).
If there is a way to tease out the percentages without re-performing the calculations there's no better way to have that pointed out than by letting other SO R folks show how much more clever they are than the person writing this answer (hopefully in more diplomatic ways than seem to be the mode of late).

Is there a quicker alternative to "gIntersection"?

I need to rapidly determine whether a spatial polygon and a spatial line intersect. I am currently converting the polygon to a spatial line and using gIntersection(). Can anyone suggest a potentially quicker method? Perhaps using rasters instead of spatial lines, or something similar. I need to do this many thousands of times.
# .shp file to Spatial Line
polygon1 <- readShapeSpatial("C:.../SALandmass.shp")
polygon1filled <- SpatialPolygons(list(Polygons(list(polygon1@polygons[[1]]@Polygons[[1]]), ID=1)))
SL <- as(polygon1filled, "SpatialLines")
# Test if line between two coordinates cross the shape
Pt1 = list(x = c(CurrentLong, MapCoordsm$x[i]), y = c(CurrentLat, MapCoordsm$y[i]))
SpatialLine1 = SpatialLines(list(Lines(Line(cbind(Pt1$x,Pt1$y)), "L1")))
cross <- length(gIntersection(SpatialLine1, SL))
Where gIntersection returns a geometry with the intersection, gIntersects returns a logical indicating whether two geometries intersect.
Thanks for the input Edzer, and others.
I ran some tests on your suggestions and it seems gIntersects() makes a huge difference. Converting the polygon to spatial lines first or just using the polygon directly is essentially the same.
Here are the test results:
# Original approach
system.time({
Pt1 = list(x = c(long1, long2), y = c(lat1, lat2))
SpatialLine1 = SpatialLines(list(Lines(Line(cbind(Pt1$x,Pt1$y)), "L1")))
cross <- length(gIntersection(SpatialLine1, SL))
})
## user system elapsed
## 0.53 0.00 0.53
# Edzer suggestion: using gIntersects
system.time({
Pt1 = list(x = c(long1, long2), y = c(lat1, lat2))
SpatialLine1 = SpatialLines(list(Lines(Line(cbind(Pt1$x,Pt1$y)), "L1")))
cross <- (gIntersects(SpatialLine1, SL))
})
# user system elapsed
# 0.06 0.00 0.06
# Edzer suggestion 2: using a polygon rather than spatial lines
system.time({
Pt1 = list(x = c(long1, long2), y = c(lat1, lat2))
SpatialLine1 = SpatialLines(list(Lines(Line(cbind(Pt1$x,Pt1$y)), "L1")))
cross <- length(gIntersection(SpatialLine1, polygon1filled))
})
# user system elapsed
# 0.43 0.05 0.48
# Edzer suggestion 1&2: using a polygon rather than spatial lines and gIntersects
system.time({
Pt1 = list(x = c(long1, long2), y = c(lat1, lat2))
SpatialLine1 = SpatialLines(list(Lines(Line(cbind(Pt1$x,Pt1$y)), "L1")))
cross <- (gIntersects(SpatialLine1, polygon1filled))
})
# user system elapsed
# 0.06 0.00 0.07
