How determine optimal epsilon value in meters for DBSCAN by plotting KNN elbow - knn

Before doing DBSCAN I need to find optimal epsilon value, all the points are geographical coordinates, I need the epsilon value in meters before convert it to radians to apply DBSCAN using haversine metrics
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=4)
nbrs = neigh.fit(firms[['y', 'x']])
distances, indices = nbrs.kneighbors(firms[['y', 'x']])
AND THEN
# Plotting K-distance Graph
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.figure(figsize=(20,10))
plt.plot(distances)
plt.title('K-distance Graph',fontsize=20)
plt.xlabel('Data Points sorted by distance',fontsize=14)
plt.ylabel('Epsilon',fontsize=14)
plt.show()
and the graph output is this, but I need the epsilon value in meters.

I hope this helps to clarify, just a few observations:
a) You are already finding the optimal epsilon value, using that method and from your figure eps = 0.005.
b) If your points are geographic coordinates, you don't need the epsilon value in meters before converting only to then convert to radians so you can apply DBSCAN using haversine metrics, because from the geographic coordinates you can convert straight away to radians, and then you multiply by 6371000/1000 to get the result in kilometers, like this:
from sklearn.metrics.pairwise import haversine_distances
from math import radians
bsas = [-34.83333, -58.5166646]
paris = [49.0083899664, 2.53844117956]
bsas_in_radians = [radians(_) for _ in bsas]
paris_in_radians = [radians(_) for _ in paris]
result = haversine_distances([bsas_in_radians, paris_in_radians])
result * 6371000/1000 # multiply by Earth radius to get kilometers
Code snippet from:
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.haversine_distances.html

Related

Finding nearest station to each shop using BallTree

I've got 2 datasets, a list of shops with UK coordinates and train station also, with coordinates.
I'm using BallTree to get the nearest station to each shop with a distance, using a a code from this website and I've swapped in my dataframes appropriately.
https://automating-gis-processes.github.io/site/notebooks/L3/nearest-neighbor-faster.html
Code:
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.neighbors import BallTree
df_pocs = pd.read_csv(r'C:\Users\FLETCHWI\Desktop\XX\shops.csv', encoding = "ISO-8859-1", engine='python')
df_stations = pd.read_csv(r'C:\Users\FLETCHWI\Desktop\xx\uk_stations.csv', encoding = "ISO-8859-1", engine='python')
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.longitude, df_pocs.latitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.longitude, df_stations.latitude))
def get_nearest(src_points, candidates, k_neighbors=1):
"""Find nearest neighbors for all source points from a set of candidate points"""
# Create tree from the candidate points
tree = BallTree(candidates, leaf_size=15, metric='haversine')
# Find closest points and distances
distances, indices = tree.query(src_points, k=k_neighbors)
# Transpose to get distances and indices into arrays
distances = distances.transpose()
indices = indices.transpose()
# Get closest indices and distances (i.e. array at index 0)
# note: for the second closest points, you would take index 1, etc.
closest = indices[0]
closest_dist = distances[0]
# Return indices and distances
return (closest, closest_dist)
def nearest_neighbor(left_gdf, right_gdf, return_dist=False):
"""
For each point in left_gdf, find closest point in right GeoDataFrame and return them.
NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
"""
left_geom_col = left_gdf.geometry.name
right_geom_col = right_gdf.geometry.name
# Ensure that index in right gdf is formed of sequential numbers
right = right_gdf.copy().reset_index(drop=True)
# Parse coordinates from points and insert them into a numpy array as RADIANS
left_radians = np.array(left_gdf[left_geom_col].apply(lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)).to_list())
right_radians = np.array(right[right_geom_col].apply(lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)).to_list())
# Find the nearest points
# -----------------------
# closest ==> index in right_gdf that corresponds to the closest point
# dist ==> distance between the nearest neighbors (in meters)
closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)
# Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
closest_points = right.loc[closest]
# Ensure that the index corresponds the one in left_gdf
closest_points = closest_points.reset_index(drop=True)
# Add distance if requested
if return_dist:
# Convert to meters from radians
earth_radius = 6371000 # meters
closest_points['distance'] = dist * earth_radius
return closest_points
# Find closest public transport stop for each building and get also the distance based on haversine distance
# Note: haversine distance which is implemented here is a bit slower than using e.g. 'euclidean' metric
# but useful as we get the distance between points in meters
closest_stations = nearest_neighbor(gdf_pocs, gdf_stations, return_dist=True)
Upon running the code, it returns the same station for every shop that I have. However I'd like it to find the nearest station for every shop and the distance to it.
Any help appreciated, thanks!
I did some testing of the functions and indeed lat/long needs to be reversed for it to work.
Notice the warning:
NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
Hence, when defining the point simple change
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.longitude, df_pocs.latitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.longitude, df_stations.latitude))
to
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.latitude, df_pocs.longitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.latitude, df_stations.longitude))

The point that minimizes the sum of euclidean distances to a set of n points

I have a set of points W={(x1, y1), (x2, y2),..., (xn, yn)} on the 2D plane. Can you find an algorithm that takes these points as the input and returns a point (x, y) on the 2D plane which has the minimum sum of distances from the points in W? In other words, if
di = Euclidean_distance((x, y), (xi, yi))
I want to minimize:
d1 + d2 + ... + dn
The Problem
You're looking for the geometric median.
An Easy Solution
There is no closed-form solution to this problem, so iterative or probabilistic methods are used. The easiest way to find this is probably with Weiszfeld's algorithm:
We can implement this in Python as follows:
import numpy as np
from numpy.linalg import norm as npnorm
c_pt_old = np.random.rand(2)
c_pt_new = np.array([0,0])
while npnorm(c_pt_old-c_pt_new)>1e-6:
num = 0
denom = 0
for i in range(POINT_NUM):
dist = npnorm(c_pt_new-pts[i,:])
num += pts[i,:]/dist
denom += 1/dist
c_pt_old = c_pt_new
c_pt_new = num/denom
print(c_pt_new)
There's a chance that Weiszfeld's algorithm won't converge, so it might be best to run it several times from different starting points.
A General Solution
You can also find this using second-order cone programming (SOCP). In addition to solving your specific problem, this general formulation then allows you to easily add constraints and weightings, such as variable uncertainty in the location of each data point.
To do so, you create a number of indicator variables representing the distance between the proposed center point and the data points.
You then minimize the sum of the indicator variables. The result follows
import cvxpy as cp
import numpy as np
import matplotlib.pyplot as plt
#Generate random test data
POINT_NUM = 100
pts = np.random.rand(POINT_NUM,2)
c_pt = cp.Variable(2) #The center point we wish to locate
distances = cp.Variable(POINT_NUM) #Distance from the center point to each data point
#Generate constraints. These are used to hold distances.
constraints = []
for i in range(POINT_NUM):
constraints.append( cp.norm(c_pt-pts[i,:])<=distances[i] )
objective = cp.Minimize(cp.sum(distances))
problem = cp.Problem(objective,constraints)
optimal_value = problem.solve()
print("Optimal value = {0}".format(optimal_value))
print("Optimal location = {0}".format(c_pt.value))
plt.scatter(x=pts[:,0], y=pts[:,1], s=1)
plt.scatter(c_pt.value[0], c_pt.value[1], s=10)
plt.show()
SOCPs are available in a number of solvers including CPLEX, Elemental, ECOS, ECOS_BB, GUROBI, MOSEK, CVXOPT, and SCS.
I've tested and the two approaches give the same answers to within tolerance.
Weiszfeld, E. (1937). "Sur le point pour lequel la somme des distances de n points donnes est minimum". Tohoku Mathematical Journal. 43: 355–386.
If that point does not need to be from your sample, then the mean minimises the euclidean distance.
A third method would be to use a compact nonlinear programming formulation. An unconstrained NLP model would be:
min sum(i, ||x-p(i)|| )
This has just 2 variables (the coordinates of x).
There is a very good initial point available. Let p(i,c) be the coordinates of the data points. Then the mean is
m(c) = sum(i, p(i,c)) / n
where n is the number of data points. This point is often very close to the optimal value of x. So we can use m as an excellent initial point for x.
Some limited experiments indicate this approach is quite faster than a cone programming formulation for large n.
For details see Yet Another Math Programming Consultant - Finding the Central Point in a Point Cloud blog post.

inconsistent definition of longitude and latitude for healpy.pixelfunc.get_interp_val() or healpy.mollview()?

when I rotate a Healpix map along longitude or latitude, I get the wrong behavior.
I'm probably missing something obvious here but so far, I failed to find what.
See demo:
import numpy as np
import healpy as hp
import matplotlib.pyplot as plt
nside = 4
npix = hp.nside2npix(nside)
idx = 70
offset = 1 # rad
# set one pixel to 1 in the map
data = np.array(np.equal(np.arange(npix), idx), dtype=float)
hp.mollview(data, nest=True, title='original')
# longitude and co-latitude in radians
theta, phi = hp.pix2ang(nside, np.arange(npix), nest=True)
# rotate: offset on longitude, keep co-latitude the same
rotated = hp.get_interp_val(data, theta + offset, phi, nest=True)
hp.mollview(rotated, nest=True, title='rotated longitude')
# rotate: keep longitude the same, offset on co-latitude
rotated = hp.get_interp_val(data, theta, phi+offset, nest=True)
hp.mollview(rotated, nest=True, title='rotated latitude')
and results:
original map
rotated longitude
rotated latitude
The dot in the map rotated along longitude is translated vertically, while it is translated horizontally for the rotation along latitude. I'd expect the reverse.
Any hint about what's wrong here?
E.
Theta is co-latitude, Phi is longitude.
It is confusing because their order is inverted than what we usually expect. In fact even in healpy, for example in pix2ang if you set lonlat to true, you get as outputs first Longitude and then Latitude.
Unfortunately this is the convention and we have to stick to this.

Eigenvectors and Eigenvalues of Hessian Matrix

I want to extract centreline pixels in vessel. At first I have seleted a seed point close to a vessel edge using ginput(1) command. This provides the starting point and specifies the region of interest (ROI) on a vessel segment where the analysis needs to be performed.
figure; imshow(Igreen_eq); % Main green channel Image
p = ginput(1);
Then the selected seed point is served as centre of a circle with diameter less than the expected diameter of the vessel, in order for the circle not to intersect with the opposite edge.
t = 0:pi/20:2*pi;
d = 0.8*15; %d=80% of minwidthOfVessel so that it wont intesect with opposite edge;
R0=d/2;%radius
xi = R0*cos(t)+p(1);
yi = R0*sin(t)+p(2);
line(xi,yi,'LineWidth',2,'Color',[0 1 0]);
roimask = poly2mask(double(xi), double(yi), size(Igreen_eq,1), size(Igreen_eq,2));
figure; imshow(roimask) % Binary image of region selected
Itry = Igreen_eq;
Itry(~roimask ) = 0;
imshow(Itry);
Itry = im2double(Itry);
line(xi, yi,'LineWidth', 2, 'Color', [0 1 0]);
hold on; plot(p(1), p(2),'*r')
Problem:
Hessian matrix is to be computed for the light intensity on the circumference of this circle and the eigenvectors has to be obtained.
I have calculated Dxx,Dyy,Dxy using:
[Dxx,Dxy,Dyy] = Hessian2D(Itry,2); %(sigma=2)
I need to write a code in MATLAB for following problem"
For a point inside the vessel, the eigenvectors corresponding to the largest
eigenvalues are normal to the edges and those corresponding to the smallest eigenvalues point to the direction along the vessels.
The first two consecutive vectors on the circle with maximum change in direction are considered as the pixels reflecting the vessel boundaries. The points on the tracking direction are considered as the centers for the subsequent circles. Repetition of this process gives an estimate of the vessel boundary.
How will I calculate largest eigen values and its correspoinding eigen vector of Hessian matrix to select new seed point as discussed above.
Thanks for your reply . I have used eig2image.m to find the eigen vectors at each point on the image (in my image, there is grey values on the concentric circular region and background is black ).
[Lambda1,Lambda2,Ix,Iy]=eig2image(Dxx,Dxy,Dyy)
where Ix and Iy are largest eigen vectors.
But when I try to plot eigen vectors using :
quiver(Ix, Iy)
I can also see the vectors on the black background which should be zero !!
Can you please reply how can I plot eigen vector on the top of the image.
Assuming Dxx, Dyy, Dxy are matrices of second-order partial derivatives of dimensions size(Itry) then for a given point (m,n) in Itry you can do:
H = [Dxx(m,n) Dxy(m,n); Dxy(m,n) Dyy(m,n)];
[V,D] = eig(H); % check by H*V = V*D;
eigenVal1 = D(1,1);
eigenVal2 = D(2,2);
eigenVec1 = V(1,:);
eigenVec2 = V(2,:);
This local eigen-decomposition will give you eigenvalues (and corresponding eigenvectors) which you can sort according to magnitude. You can loop across image points or for a more compact solution see eig2image.m in FileExchange.

Algorithm: Calculate pseudo-random point inside an ellipse

For a simple particle system I'm making, I need to, given an ellipse with width and height, calculate a random point X, Y which lies in that ellipse.
Now I'm not the best at maths, so I wanted to ask here if anybody could point me in the right direction.
Maybe the right way is to choose a random float in the range of the width, take it for X and from it calculate the Y value?
Generate a random point inside a circle of radius 1. This can be done by taking a random angle phi in the interval [0, 2*pi) and a random value rho in the interval [0, 1) and compute
x = sqrt(rho) * cos(phi)
y = sqrt(rho) * sin(phi)
The square root in the formula ensures a uniform distribution inside the circle.
Scale x and y to the dimensions of the ellipse
x = x * width/2.0
y = y * height/2.0
Use rejection sampling: choose a random point in the rectangle around the ellipse. Test whether the point is inside the ellipse by checking the sign of (x-x0)^2/a^2+(y-y0)^2/b^2-1. Repeat if the point is not inside. (This assumes that the ellipse is aligned with the coordinate axes. A similar solution works in the general case but is more complicated, of course.)
It is possible to generate points within an ellipse without using rejection sampling too by carefully considering its definition in polar form. From wikipedia the polar form of an ellipse is given by
Intuitively speaking, we should sample polar angle θ more often where the radius is larger. Put more mathematically, our PDF for the random variable θ should be p(θ) dθ = dA / A, where dA is the area of a single segment at angle θ with width dθ. Using the equation for polar angle area dA = 1/2 r2 dθ and the area of an ellipse being π a b, then the PDF becomes
To randomly sample from this PDF, one direct method is the inverse CDF technique. This requires calculating the cumulative density function (CDF) and then inverting this function. Using Wolfram Alpha to get the indefinite integral, then inverting it gives inverse CDF of
where u runs between 0 and 1. So to sample a random angle θ, you just generate a uniform random number u between 0 and 1, and substitute it into this equation for the inverse CDF.
To get the random radius, the same technique that works for a circle can be used (see for example Generate a random point within a circle (uniformly)).
Here is some sample Python code which implements this algorithm:
import numpy
import matplotlib.pyplot as plt
import random
# Returns theta in [-pi/2, 3pi/2]
def generate_theta(a, b):
u = random.random() / 4.0
theta = numpy.arctan(b/a * numpy.tan(2*numpy.pi*u))
v = random.random()
if v < 0.25:
return theta
elif v < 0.5:
return numpy.pi - theta
elif v < 0.75:
return numpy.pi + theta
else:
return -theta
def radius(a, b, theta):
return a * b / numpy.sqrt((b*numpy.cos(theta))**2 + (a*numpy.sin(theta))**2)
def random_point(a, b):
random_theta = generate_theta(a, b)
max_radius = radius(a, b, random_theta)
random_radius = max_radius * numpy.sqrt(random.random())
return numpy.array([
random_radius * numpy.cos(random_theta),
random_radius * numpy.sin(random_theta)
])
a = 2
b = 1
points = numpy.array([random_point(a, b) for _ in range(2000)])
plt.scatter(points[:,0], points[:,1])
plt.show()
I know this is an old question, but I think none of the existing answers are good enough.
I was looking for a solution for exactly the same problem and got directed here by Google, found all the existing answers are not what I wanted, so I implemented my own solution entirely by myself, using information found here: https://en.wikipedia.org/wiki/Ellipse
So any point on the ellipse must satisfy that equation, how to make a point inside the ellipse?
Just scale a and b with two random numbers between 0 and 1.
I will post my code here, I just want to help.
import math
import matplotlib.pyplot as plt
import random
from matplotlib.patches import Ellipse
a = 4
b = a*math.tan(math.radians((random.random()+0.5)/2*45))
def random_point(a, b):
d = math.radians(random.random()*360)
return (a * math.cos(d) * random.random(), b * math.sin(d) * random.random())
points = [random_point(a, b) for i in range(360)]
x, y = zip(*points)
fig = plt.figure(frameon=False)
ax = fig.add_subplot(111)
ax.set_axis_off()
ax.add_patch(Ellipse((0, 0), 2*a, 2*b, edgecolor='k', fc='None', lw=2))
ax.scatter(x, y)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)
plt.axis('scaled')
plt.box(False)
ax = plt.gca()
ax.set_xlim([-a, a])
ax.set_ylim([-b, b])
plt.set_cmap('rainbow')
plt.show()

Resources