Finding nearest station to each shop using BallTree - location

I've got 2 datasets, a list of shops with UK coordinates and train station also, with coordinates.
I'm using BallTree to get the nearest station to each shop with a distance, using a a code from this website and I've swapped in my dataframes appropriately.
import pandas as pd
import numpy as np
import geopandas as gpd
from sklearn.neighbors import BallTree
df_pocs = pd.read_csv(r'C:\Users\FLETCHWI\Desktop\XX\shops.csv', encoding = "ISO-8859-1", engine='python')
df_stations = pd.read_csv(r'C:\Users\FLETCHWI\Desktop\xx\uk_stations.csv', encoding = "ISO-8859-1", engine='python')
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.longitude, df_pocs.latitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.longitude, df_stations.latitude))
def get_nearest(src_points, candidates, k_neighbors=1):
"""Find nearest neighbors for all source points from a set of candidate points"""
# Create tree from the candidate points
tree = BallTree(candidates, leaf_size=15, metric='haversine')
# Find closest points and distances
distances, indices = tree.query(src_points, k=k_neighbors)
# Transpose to get distances and indices into arrays
distances = distances.transpose()
indices = indices.transpose()
# Get closest indices and distances (i.e. array at index 0)
# note: for the second closest points, you would take index 1, etc.
closest = indices[0]
closest_dist = distances[0]
# Return indices and distances
return (closest, closest_dist)
def nearest_neighbor(left_gdf, right_gdf, return_dist=False):
For each point in left_gdf, find closest point in right GeoDataFrame and return them.
NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
left_geom_col =
right_geom_col =
# Ensure that index in right gdf is formed of sequential numbers
right = right_gdf.copy().reset_index(drop=True)
# Parse coordinates from points and insert them into a numpy array as RADIANS
left_radians = np.array(left_gdf[left_geom_col].apply(lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)).to_list())
right_radians = np.array(right[right_geom_col].apply(lambda geom: (geom.x * np.pi / 180, geom.y * np.pi / 180)).to_list())
# Find the nearest points
# -----------------------
# closest ==> index in right_gdf that corresponds to the closest point
# dist ==> distance between the nearest neighbors (in meters)
closest, dist = get_nearest(src_points=left_radians, candidates=right_radians)
# Return points from right GeoDataFrame that are closest to points in left GeoDataFrame
closest_points = right.loc[closest]
# Ensure that the index corresponds the one in left_gdf
closest_points = closest_points.reset_index(drop=True)
# Add distance if requested
if return_dist:
# Convert to meters from radians
earth_radius = 6371000 # meters
closest_points['distance'] = dist * earth_radius
return closest_points
# Find closest public transport stop for each building and get also the distance based on haversine distance
# Note: haversine distance which is implemented here is a bit slower than using e.g. 'euclidean' metric
# but useful as we get the distance between points in meters
closest_stations = nearest_neighbor(gdf_pocs, gdf_stations, return_dist=True)
Upon running the code, it returns the same station for every shop that I have. However I'd like it to find the nearest station for every shop and the distance to it.
Any help appreciated, thanks!

I did some testing of the functions and indeed lat/long needs to be reversed for it to work.
Notice the warning:
NOTICE: Assumes that the input Points are in WGS84 projection (lat/lon).
Hence, when defining the point simple change
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.longitude, df_pocs.latitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.longitude, df_stations.latitude))
gdf_pocs = gpd.GeoDataFrame(
df_pocs, geometry=gpd.points_from_xy(df_pocs.latitude, df_pocs.longitude))
gdf_stations = gpd.GeoDataFrame(
df_stations, geometry=gpd.points_from_xy(df_stations.latitude, df_stations.longitude))


Function using multiprocessing returns None values

I have been stuck on this for quite a while now. I am using the multiprocessing function to speed up a function that previously looped over data points. Before using multiprocessing the function worked fine, but now it returns some none values (the first few, <10) before returning values. I have tried many things and different ways to use the multiprocessing pool.
The multiprocessing is used inside a function, which I am not sure if that might be the problem?
def SkyViewFactor(point, coords, max_radius):
betas_lin = np.linspace(0,2*np.pi,steps_beta)
"""this is the analytical dome area but should make same assumption as for d_area"""
dome_area = max_radius**2*2*np.pi
""" we throw away all point outside the dome
# dome is now a 5 column array of points:
# the 5 columns: x,y,z,radius,angle theta"""
dome_p = dome(point, coords, max_radius)
betas = np.zeros(steps_beta)
"""we loop over all points in the dome"""
d = 0
while (d < dome_p.shape[0]):
psi = np.arctan((dome_p[d,2]-point[2])/dome_p[d,3])
"""The angles of the min and max angle of the building"""
beta_min = - np.arcsin(np.sqrt(2*gridboxsize**2)/2/dome_p[d,3]) + dome_p[d,4]
beta_max = np.arcsin(np.sqrt(2*gridboxsize**2)/2/dome_p[d,3]) + dome_p[d,4]
"""Where the index of betas fall within the min and max beta, and there is not already a larger psi blocking"""
betas[np.nonzero(np.logical_and((betas < psi), np.logical_and((beta_min <= betas_lin), (betas_lin < beta_max))))] = psi
d +=1
areas = d_area(betas, steps_beta, max_radius)
"""The SVF is the fraction of area of the dome that is not blocked"""
SVF = np.around((dome_area - np.sum(areas))/dome_area, 3)
return SVF
def calc_SVF(coords, max_radius, blocklength):
Function to calculate the sky view factor.
We create a dome around a point with a certain radius,
and for each point in the dome we evaluate of the height of this point blocks the view
:param coords: all coordinates of our dataset
:param max_radius: maximum radius we think influences the svf
:param blocklength: the first amount of points in our data set we want to evaluate
:return: SVF for all points
def parallel_runs_SVF():
points = [coords[i,:] for i in range(blocklength)]
pool = Pool()
SVF_list = []
SVF_par = partial(SkyViewFactor, coords=coords,max_radius=max_radius) # prod_x has only one argument x (y is fixed to 10)
SVF =, points)
# if SVF != None:
# SVF_list.append(SVF)
return SVF
if __name__ == '__SVF__':
return parallel_runs_SVF()
This function is later called in:
def reshape_SVF(data,coords,julianday,lat,long,LMT,reshape,save_CSV,save_Im):
[x_len, y_len] = [int(data.shape[0]/2),int(data.shape[1]/2)]
blocklength = int(x_len*y_len)
"Compute SVF and SF and Reshape the shadow factors and SVF back to nd array"
SVFs = calc_SVF(coords,max_radius,blocklength)
SFs = calc_SF(coords,julianday,lat,long,LMT,blocklength)
#SVFs = filter(None, SVFs)
"If reshape is true we reshape the arrays to the original data matrix"
if reshape == True:
SVF_matrix = np.ndarray([x_len,y_len])
SF_matrix = np.ndarray([x_len,y_len])
for i in range(blocklength):
SVF_matrix[coords[int(i-x_len/2),0],coords[int(i-y_len/2),1]] = SVFs[i]
SF_matrix[coords[int(i-x_len/2),0],coords[int(i-y_len/2),1]] = SFs[i]
if save_CSV == True:
np.savetxt("SVFmatrix.csv", SVF_matrix, delimiter=",")
np.savetxt("SFmatrix.csv", SF_matrix, delimiter=",")
if save_Im == True:
tf.imwrite('SVF_matrix.tif', SVF_matrix, photometric='minisblack')
tf.imwrite('SF_matrix.tif', SF_matrix, photometric='minisblack')
return SF_matrix,SF_matrix
elif reshape == False:
np.savetxt("SVFs.csv", SVFs, delimiter=",")
np.savetxt("SFs.csv", SFs, delimiter=",")
return SVFs, SFs
SFs is a similar function with the same structure (it also uses multiprocessing). The goal is to return a list with all Sky View factors, or if reshape is true a matrix with the same shape as the original input data (DSM data) with the sky view factor for each location.
I get the error:
SVF_matrix[coords[int(i-x_len/2),0],coords[int(i-y_len/2),1]] = SVFs[i]
TypeError: 'NoneType' object is not subscriptable
I tried to filter out nones with the filter() function using
SVFs = filter(None, SVFs)
This returns the error
TypeError: 'NoneType' object is not iterable. Also I do not know if the None values returned are instead of the actual values or extra (i.e. if I have 1000 datapoints I should have 1000 sky view factors, but do I get an array with 3 Nones and then numbers, do I get 3 Nones ánd 1000 sky view factors or do I get a list of 1000 values of which the first 3 are Nones?)
I also tried to Make an empty list and append the SVF only if this is not None, this however also does not work
SVF_list = []
if SVF != None:
return SVF_list
To include all used functions: These are the functions used in SkyViewFactor to calculate distances and the area elements
def dist(point, coord):
:param point: evaluation point (x,y,z)
:param coord: array of coordinates with heights
:return: the distance from each coordinate to the point and the angle
# Columns is dx
dx = (coord[:,1]-point[1])*gridboxsize
# Rows is dy
dy = (coord[:,0]-point[0])*gridboxsize
dist = np.sqrt(abs(dx)**2 + abs(dy)**2)
"""angle is 0 north direction"""
angle = np.arctan2(dy,dx)+np.pi/2
return dist,angle
def dome(point, coords, maxR):
:param point: point we are evaluating
:param coords: array of coordinates with heights
:param maxR: maximum radius in which we think the coordinates can influence the SVF
:return: a dome of points that we take into account to evaluate the SVF
radii, angles = dist(point,coords)
coords = np.column_stack([coords, radii])
coords = np.column_stack([coords, angles])
"""the dome consist of points higher than the view height and within the radius we want"""
dome = coords[(np.logical_and(coords[:,3]<maxR,coords[:,3]>0.1)),:]
dome = dome[(dome[:,2]>point[2]),:]
return dome
def d_area(psi,steps_beta,maxR):
"""Radius at ground surface and at the height of the projection of the building"""
d_area = 2*np.pi/steps_beta*maxR**2*np.sin(psi)
return d_area
Help with this or some other suggestion to speed up my code / use multiprocessing the right way is very appreciated!
I am trying to speed up a for loop using multiprocessing, this works without multiprocessing but since I have to iterate over 1.250.000 datapoints it is way to slow. With multiprocessing it returns None for the first few values.

Calculate the non-projected area inside a contour line created by Basemap

I am currently trying to determine the area inside specfic contour lines on a Mollweide map projection using Basemap. Specifically, what I'm looking for is the area of various credible intervals in square degrees (or degrees2) of an astronomical event on the celestial sphere. The plot is shown below:
Fortunately, a similar question has already been answered before that helps considerably. The method outlined in the answer is able to account for holes within the contour as well which is a necessity for my use case. My adapted code for this particular method is provided below:
# generate a regular lat/lon grid.
nlats = 300; nlons = 300; delta_lon = 2.*np.pi/(nlons-1); delta_lat = np.pi/(nlats-1)
lats = (0.5*np.pi-delta_lat*np.indices((nlats,nlons))[0,:,:])
lons = (delta_lon*np.indices((nlats,nlons))[1,:,:] - np.pi)
map = Basemap(projection='moll',lon_0=0, celestial=True)
# compute native map projection coordinates of lat/lon grid
x, y = map(lons*180./np.pi, lats*180./np.pi)
areas = []
cred_ints = [0.5,0.9]
for k in range(len(cred_ints)):
cs = map.contourf(x,y,p1,levels=[0.0,cred_ints[k]]) ## p1 is the cumulative distribution across all points in the sky (usually determined via KDE on the data)
##organizing paths and computing individual areas
paths = cs.collections[0].get_paths()
area_of_individual_polygons = []
for p in paths:
sign = 1 ##<-- assures that area of first(outer) polygon will be summed
verts = p.vertices
codes =
idx = np.where(codes==Path.MOVETO)[0]
vert_segs = np.split(verts,idx)[1:]
code_segs = np.split(codes,idx)[1:]
for code, vert in zip(code_segs,vert_segs):
##computing the area of the polygon
sign = -1 ##<-- assures that the other (inner) polygons will be subtracted
##computing total area
total_area = np.sum(area_of_individual_polygons)
As far as I can tell this method works beautifully... except for one small wrinkle: this calculates the area using the projected coordinate units. I'm not entirely sure what the units are in this case but they are definitely not degrees2 (the calculated areas are on the order of 1013 units2... maybe the units are pixels?). As alluded to earlier, what I'm looking for is how to calculate the equivalent area in the global coordinate units, i.e. in degrees2.
Is there a way to convert the area calculated in the projected domain back into the global domain in square degrees? Or perhaps is there a way to modify this method so that it determines the area in degrees2 from the get go?
Any help will be greatly appreciated!
For anyone that comes across this question, while I didn't figure out a way to directly convert the projected area back into the global domain, I did develop a new solution by transforming the contour path vertices (but this time defined in the lat/lon coordinate system) via an area preserving sinusoidal projection:
where φ is the latitude, λ is the longitude, and λ0 is the longitude of the central meridian.
This flat projection means you can just use the package Shapely to determine the area of the polygon defined by the projected vertices (in square units for a radius of 1 unit, or more simply steradians). Multiplying this number by (180/π)2 will give you the area in square degrees for the contour in question.
Fortunately, only minor adjustments to the code mentioned in the OP was needed to achieve this. The final code is provided below:
# generate a regular lat/lon grid.
nlats = 300; nlons = 300;
delta_lat = np.pi/(nlats-1); delta_lon = 2.*np.pi/(nlons-1);
lats = (0.5*np.pi-delta_lat*np.indices((nlats,nlons))[0,:,:])
lons = (delta_lon*np.indices((nlats,nlons))[1,:,:])
# collect and organize contour data for each credible interval
cred_ints = [0.5,0.9]
ci_areas = []
for k in range(len(cred_ints)):
cs = plt.contourf(lons,lats,p1,levels=[0,cred_ints[k]]) ## p1 is the cumulative distribution across all points in the sky (usually determined via KDE of the dataset in question)
paths = cs.collections[0].get_paths()
##organizing paths and computing individual areas
area_of_individual_polygons = []
for p in paths:
sign = 1 ##<-- assures that area of first(outer) polygon will be summed
vertices = p.vertices
codes =
idx = np.where(codes==Path.MOVETO)[0]
verts_segs = np.split(vertices,idx)[1:]
for verts in verts_segs:
# transforming the coordinates via an area preserving sinusoidal projection
x = (verts[:,0] - (0)*np.ones_like(verts[:,0]))*np.cos(verts[:,1])
y = verts[:,1]
verts_proj = np.stack((x,y), axis=1)
##computing the area of the polygon
sign = -1 ##<-- assures that the other(inner) polygons/holes will be subtracted
##computing total area
total_area = ((180/np.pi)**2)*np.sum(area_of_individual_polygons)

Obtaining a region of evenly distributed points on a sphere

There are several questions on this site about distributing points on the surface of a sphere, but all of these are based on actually generating all of the points on that sphere. My favorite thus far is the golden spiral discussed in Evenly distributing n points on a sphere.
I need to cover a sphere in trillions of points, but only ever need to actually generate a tiny region of the surface (earth down to ~10 meters, looking at a roughly 1 km^2 area). The points generated for that region must match the points that would be generated for the entire sphere (i.e., stitching small regions together must yield the same result as generating a larger region), and generation should be pretty fast.
My attempts to use the golden spiral with such a large number of points have been thwarted by floating point precision issues.
The best I've managed to come up with is generating points at equally spaced latitudes and calculating longitudinal spacing based on the circumference at that latitude. The result is far from satisfactory however (especially the resulting horizontal rings of points).
Does anyone have a suggestion for generating a small region of distributed points on the surface of a sphere?
The vertices of a geodesic sphere would work well in this application.
You start with an icosahedron, divide each face into a triangular mesh of whatever resolution you like, and project the points onto the surface of the sphere.
The Fibonacci sphere approximation is quite easy to generalize efficiently to a subset of points computation, as the analytic formulas are very straight-forward.
The below code computes the subset of points shown below for a trillion points in a few seconds of runtime on my weak laptop and a relatively under optimised python implementation.
Code to compute the above is below, and includes a means to verify the subset computation is exactly the same as a brute-force computation (however don't try it for trillion points, it will never finish unless you have a super-computer!)
Please note, the use of 128-bit doubles is an absolute requirement when you do the computation over more than about a billion points as there are major quantisation artefacts otherwise!
Runtime scales with r' * N where r' is the ratio of the subset to that of the full sphere. Thus, a very small r' can be computed very efficiently.
#!/usr/bin/env python3
import argparse
import mpl_toolkits.mplot3d.axes3d as ax3d
import matplotlib.pyplot as plt
import numpy as np
def fibonacci_sphere_pts(num_pts):
ga = (3 - np.sqrt(5)) * np.pi # golden angle
# Create a list of golden angle increments along tha range of number of points
theta = ga * np.arange(num_pts)
# Z is a split into a range of -1 to 1 in order to create a unit circle
z = np.linspace(1 / num_pts - 1, 1 - 1 / num_pts, num_pts)
# a list of the radii at each height step of the unit circle
radius = np.sqrt(1 - z * z)
# Determine where xy fall on the sphere, given the azimuthal and polar angles
y = radius * np.sin(theta)
x = radius * np.cos(theta)
return np.asarray(list(zip(x,y,z)))
def fibonacci_sphere(num_pts):
x,y,z = zip(*fibonacci_sphere_subset(num_pts))
# Display points in a scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z)
def fibonacci_sphere_subset_pts(num_pts, p0, r0 ):
Get a subset of a full fibonacci_sphere
ga = (3 - np.sqrt(5)) * np.pi # golden angle
x0, y0, z0 = p0
z_s = 1 / num_pts - 1
z_e = 1 - 1 / num_pts
# linspace formula for range [z_s,z_e] for N points is
# z_k = z_s + (z_e - z_s) / (N-1) * k , for k [0,N)
# therefore k = (z_k - z_s)*(N-1) / (z_e - z_s)
# would be the closest value of k
k = int(np.round((z0 - z_s) * (num_pts - 1) / (z_e - z_s)))
# here a sufficient number of "layers" of the fibonacci sphere must be
# selected to obtain enough points to be a superset of the subset given the
# radius, we use a heuristic to determine the number but it can be obtained
# exactly by the correct formula instead (by choosing an upperbound)
dz = (z_e - z_s) / (num_pts-1)
n_dk = int(np.ceil( r0 / dz ))
dk = np.arange(k - n_dk, k + n_dk+1)
dk = dk[np.where((dk>=0)&(dk<num_pts))[0]]
# NOTE: *must* use long double over regular doubles below, otherwise there
# are major quantization errors in the output for large number of points
theta = ga * dk.astype(np.longdouble)
z = z_s + (z_e - z_s ) / (num_pts-1) *dk
radius = np.sqrt(1 - z * z)
y = radius * np.sin(theta)
x = radius * np.cos(theta)
idx = np.where(np.square(x - x0) + np.square(y-y0) + np.square(z-z0) <= r0*r0)[0]
return x[idx],y[idx],z[idx]
def fibonacci_sphere_subset(num_pts, p0, r0, do_compare=False ):
Display fib sphere subset points and optionally compare against bruteforce computation
x,y,z = fibonacci_sphere_subset_pts(num_pts,p0,r0)
if do_compare:
subset = zip(x,y,z)
subset_bf = fibonacci_sphere_pts(num_pts)
x0,y0,z0 = p0
subset_bf = [ (x,y,z) for (x,y,z) in subset_bf if np.square(x - x0) + np.square(y-y0) + np.square(z-z0) <= r0*r0 ]
subset_bf = np.asarray(subset_bf)
if np.allclose(subset,subset_bf):
print('PASS: subset and bruteforce computation agree completely')
print('FAIL: subset and bruteforce computation DO NOT agree completely')
# Display points in a scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(x, y, z)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="fibonacci sphere")
"numpts", type=int, help="number of points to distribute along sphere"
args = parser.parse_args()
# hard-coded point to query with a tiny fixed radius
p0 = (.5,.5,np.sqrt(1. - .5*.5 - .5*.5)) # coordinate of query point representing center of subset, note all coordinates fall between -1 and 1
r0 = .00001 # the radius of the subset, a very small number is chosen as radius of full sphere is 1.0

How determine optimal epsilon value in meters for DBSCAN by plotting KNN elbow

Before doing DBSCAN I need to find optimal epsilon value, all the points are geographical coordinates, I need the epsilon value in meters before convert it to radians to apply DBSCAN using haversine metrics
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=4)
nbrs =[['y', 'x']])
distances, indices = nbrs.kneighbors(firms[['y', 'x']])
# Plotting K-distance Graph
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.title('K-distance Graph',fontsize=20)
plt.xlabel('Data Points sorted by distance',fontsize=14)
and the graph output is this, but I need the epsilon value in meters.
I hope this helps to clarify, just a few observations:
a) You are already finding the optimal epsilon value, using that method and from your figure eps = 0.005.
b) If your points are geographic coordinates, you don't need the epsilon value in meters before converting only to then convert to radians so you can apply DBSCAN using haversine metrics, because from the geographic coordinates you can convert straight away to radians, and then you multiply by 6371000/1000 to get the result in kilometers, like this:
from sklearn.metrics.pairwise import haversine_distances
from math import radians
bsas = [-34.83333, -58.5166646]
paris = [49.0083899664, 2.53844117956]
bsas_in_radians = [radians(_) for _ in bsas]
paris_in_radians = [radians(_) for _ in paris]
result = haversine_distances([bsas_in_radians, paris_in_radians])
result * 6371000/1000 # multiply by Earth radius to get kilometers
Code snippet from:

How do I perform a curve fit with an array of points and touching a specific point in that array

I need help with curve fitting a given set of points. The points form a parabola and I ought to find the peak point of the result. Issue is when I do a curve fit, it sometimes doesn't touch the max y-coordinate even if the actual point is given in the input array.
Following is the code snippet. Here 1.88 is the actual peak y-coordinate (13.05,1.88). But the graph generated by the code does not touch the point due to curve fitting. So is there a way to fit the curve making sure that it touches the max point given in the input array?
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit, minimize_scalar
fig = plt.gcf()
#fig.set_size_inches(18.5, 10.5)
x = [4.59,9.02,13.05,18.47,20.3]
y = [1.7,1.84,1.88,1.7,1.64]
def f(x, p1, p2, p3):
return p3*(p1/((x-p2)**2 + (p1/2)**2))
popt, pcov = curve_fit(f, x, y)
# find the peak
fm = lambda x: -f(x, *popt)
r = minimize_scalar(fm, bounds=(1, 5))
print( "maximum:", r["x"], f(r["x"], *popt) ) #maximum: 2.99846874275 18.3928199902
plt.text(1,1.9,'maximum '+str(round(r["x"],2))+'( #'+str(round(f(r["x"], *popt),2)) + ' )')
x_curve = np.linspace(min(x), max(x), 50)
plt.plot(x_curve, f(x_curve, *popt))
plt.plot(r['x'], f(r['x'], *popt), 'ko')
Here is a graphical code example using your equation with weighted fitting, where I have made the max point larger to more easily see the effect of the weighting. In non-weighted curve fitting, all weights are implicitly 1.0 as all data points have equal weight. Scipy's curve_fit routine uses weights in the form of uncertainties, so that giving a point a very small uncertainty (which I have done) is like giving the point a very large weight. This technique can be used to make a fit pass arbitrarily close to any single data point by any software that can perform weghted fitting.
import numpy, scipy, matplotlib
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
x = [4.59,9.02,13.05,18.47,20.3]
y = [1.7,1.84,2.0,1.7,1.64]
# note the single very small uncertainty - try making this value 1.0
uncertainties = numpy.array([1.0, 1.0, 1.0E-6, 1.0, 1.0])
# rename data to use previous example
xData = numpy.array(x)
yData = numpy.array(y)
def func(x, p1, p2, p3):
return p3*(p1/((x-p2)**2 + (p1/2)**2))
# these are the same as the scipy defaults
initialParameters = numpy.array([1.0, 1.0, 1.0])
# curve fit the test data, first without uncertainties to
# get us closer to initial starting parameters
ssqParameters, pcov = curve_fit(func, xData, yData, p0 = initialParameters)
# now that we have better starting parameters, use uncertainties
fittedParameters, pcov = curve_fit(func, xData, yData, p0 = ssqParameters, sigma=uncertainties, absolute_sigma=True)
modelPredictions = func(xData, *fittedParameters)
absError = modelPredictions - yData
SE = numpy.square(absError) # squared errors
MSE = numpy.mean(SE) # mean squared errors
RMSE = numpy.sqrt(MSE) # Root Mean Squared Error, RMSE
Rsquared = 1.0 - (numpy.var(absError) / numpy.var(yData))
print('Parameters:', fittedParameters)
print('RMSE:', RMSE)
print('R-squared:', Rsquared)
# graphics output section
def ModelAndScatterPlot(graphWidth, graphHeight):
f = plt.figure(figsize=(graphWidth/100.0, graphHeight/100.0), dpi=100)
axes = f.add_subplot(111)
# first the raw data as a scatter plot
axes.plot(xData, yData, 'D')
# create data for the fitted equation plot
xModel = numpy.linspace(min(xData), max(xData))
yModel = func(xModel, *fittedParameters)
# now the model as a line plot
axes.plot(xModel, yModel)
axes.set_xlabel('X Data') # X axis data label
axes.set_ylabel('Y Data') # Y axis data label
plt.close('all') # clean up after using pyplot
graphWidth = 800
graphHeight = 600
ModelAndScatterPlot(graphWidth, graphHeight)
