Vectorized Distances¶

In this project notebook we'll be comparing for loop and vectorized implemented distance calculations. With large datasets, the vectorized formulation becomes more than 1000 times faster!

Imports¶

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.spatial.distance import cdist
from haversine import haversine, Unit
from math import isclose

Sample Data¶

tx_json = {"city":{"4":"Dallas","5":"Houston","33":"Austin","55":"Fort Worth"},
           "state_name":{"4":"Texas","5":"Texas","33":"Texas","55":"Texas"},
           "lat":{"4":32.7935,"5":29.786,"33":30.3005,"55":32.7817},
           "lng":{"4":-96.7667,"5":-95.3885,"33":-97.7522,"55":-97.3474}}
df = pd.DataFrame(tx_json)
lat = df['lat'].values
lng = df['lng'].values
x = np.array([lat, lng]).T
y = x.copy()

Pseudo Large Dataset for Speed Check¶

a = np.tile(x.T,1000).T
b = np.tile(y.T,1000).T
print(a.shape)
print(b.shape)

(4000, 2)
(4000, 2)

Euclidean Distance¶

def dist(a, b):

    """
    returns the geometric distance of two long lat coordinates

    Parameters
    ----------
    a : numpy array
        the long lat coordinate of the patient
    b : numpy array
        the long lat coordinate of the site(s). can either be a single
    site or multiple sites

    Returns
    -------
    distances : array or float
        dist will return a float if only a single site is specified in b,
        otherwise will return an array
    """

    # if a is a list of patient zips
    if len(a.shape) > 1:
        a = a[:, None, :]
        b = b[None, :, :]

    return np.sqrt(((a-b)**2).sum(axis=-1))

    # else if b is a list of site zips
    if len(b.shape) > 1:
        b = b.T

    return np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2)

def dist_no_np(a_, b_):
    res = []
    for a in a_:
        ress = []
        for b in b_:
            ress.append(np.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2))
        res.append(ress)

Speed Check¶

%%timeit
dist(a, b)

473 µs ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

%%timeit
dist_no_np(a, b)

161 ms ± 457 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

%%timeit
cdist(a, b, metric='euclidean')

126 µs ± 482 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

euclid_result = dist(x, y)

cdist(x, y, metric='euclidean')

array([[0.        , 3.30824598, 2.68071991, 0.58081988],
       [3.30824598, 0.        , 2.41904691, 3.57931665],
       [2.68071991, 2.41904691, 0.        , 2.51400407],
       [0.58081988, 3.57931665, 2.51400407, 0.        ]])

Haversine Distance¶

Non Vectorized¶

# if you have the local file https://simplemaps.com/data/us-cities
df = pd.read_csv("../../Downloads/simplemaps_uscities_basicv1.75/uscities.csv")
df = df.loc[(df['city'].isin(['Austin', 'Dallas', 'Fort Worth', 'Houston'])) &
       (df['state_id'] == 'TX')]

# df[['city', 'state_name', 'lat', 'lng']].to_json()

def test_haversine():
    tx_json = {"city":{"4":"Dallas","5":"Houston","33":"Austin","55":"Fort Worth"},
               "state_name":{"4":"Texas","5":"Texas","33":"Texas","55":"Texas"},
               "lat":{"4":32.7935,"5":29.786,"33":30.3005,"55":32.7817},
               "lng":{"4":-96.7667,"5":-95.3885,"33":-97.7522,"55":-97.3474}}
    df = pd.DataFrame(tx_json)
    lat = df['lat'].values
    lng = df['lng'].values
    x = np.array([lat, lng]).T
    y = x.copy()
    # https://stackoverflow.com/questions/58307211/python-generate-distance-matrix-for-large-number-of-locations
    answer = cdist(x, y, metric=haversine, unit= Unit.MILES)
    tx_distances = np.array([[  0.        , 223.15619132, 181.75853929,  33.74018707],
                           [223.15619132,   0.        , 145.77171262, 237.09579454],
                           [181.75853929, 145.77171262,   0.        , 173.08331265],
                           [ 33.74018707, 237.09579454, 173.08331265,   0.        ]])
    isvect = np.vectorize(isclose)
    assert isvect(answer, tx_distances, abs_tol=1).all(), "distances are incorrect"
    print("Test Passed")

test_haversine()

Test Passed

EARTH_RADIUS = 3958.76
# Copied from the pgeocode library (along with the zip code lat/lon values)
# https://github.com/symerio/pgeocode/blob/96b86f5e8b9569de0dd5eaa37385cb06752bb944/pgeocode.py#L396
# note that the docstring is incorrect as this will only work with arrays of shape (1, 2)
def haversine_distance(x, y):
    """Haversine (great circle) distance
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    Parameters
    ----------
    x : array, shape=(n_samples, 2)
      the first list of coordinates (degrees)
    y : array: shape=(n_samples, 2)
      the second list of coordinates (degress)
    Returns
    -------
    d : array, shape=(n_samples,)
      the distance between corrdinates (km)
    References
    ----------
    https://en.wikipedia.org/wiki/Great-circle_distance
    """
    # if the points are the same, their distance is zero
    if x[0] == y[0] and x[1] == y[1]:
        return 0
    x_rad = np.radians(x)
    y_rad = np.radians(y)


    d = y_rad - x_rad

    dlat, dlon = d.T
    x_lat = x_rad[0]
    y_lat = y_rad[0]

    a = (
        np.sin(dlat / 2.0) ** 2
        + np.cos(x_lat) * np.cos(y_lat) * np.sin(dlon / 2.0) ** 2
    )

    c = 2 * np.arcsin(np.sqrt(a))
    return EARTH_RADIUS * c

def hav_dist_no_np(a_, b_):
    res = []
    for a in a_:
        ress = []
        for b in b_:
            ress.append(haversine_distance(a, b))
        res.append(ress)
    return res

def test_haversine_non_vect():
    tx_json = {"city":{"4":"Dallas","5":"Houston","33":"Austin","55":"Fort Worth"},
               "state_name":{"4":"Texas","5":"Texas","33":"Texas","55":"Texas"},
               "lat":{"4":32.7935,"5":29.786,"33":30.3005,"55":32.7817},
               "lng":{"4":-96.7667,"5":-95.3885,"33":-97.7522,"55":-97.3474}}
    df = pd.DataFrame(tx_json)
    lat = df['lat'].values
    lng = df['lng'].values
    x = np.array([lat, lng]).T
    y = x.copy()
    answer = np.array(hav_dist_no_np(x, y))
    tx_distances = np.array([[  0.        , 223.15619132, 181.75853929,  33.74018707],
                           [223.15619132,   0.        , 145.77171262, 237.09579454],
                           [181.75853929, 145.77171262,   0.        , 173.08331265],
                           [ 33.74018707, 237.09579454, 173.08331265,   0.        ]])
    isvect = np.vectorize(isclose)
    assert isvect(answer, tx_distances, abs_tol=1).all(), "distances are incorrect"
    print("Test Passed")

test_haversine_non_vect()

Test Passed

Vectorized¶

def haversine_vectorized(x, y):
    """
    Vectorized haversine formual will work with arrays of long/lat coordinates

    Parameters
    ----------
    x : array, shape=(n_samples_1, 2)
      the first list of coordinates (degrees)
    y : array: shape=(n_samples_2, 2)
      the second list of coordinates (degress)
    Returns
    -------
    d : array, shape=(n_samples_1, n_samples_2)
      the distance between corrdinates (km)
    """
    EARTH_RADIUS = 3958.76

    # convert all latitudes/longitudes from decimal degrees to radians
    x_rad = np.radians(x)
    y_rad = np.radians(y)

    # unpack latitude/longitude
    x_lat, x_lng = x_rad[:, 0], x_rad[:, 1]
    y_lat, y_lng = y_rad[:, 0], y_rad[:, 1]

    # broadcast, can also use np.expand_dims
    x_lat = x_lat[None, :]
    x_lng = x_lng[None, :]
    y_lat = y_lat[:, None]
    y_lng = y_lng[:, None]

    # calculate haversine
    d_lat = y_lat - x_lat
    d_lng = y_lng - x_lng

    a = (np.sin(d_lat / 2.0) ** 2
         + np.cos(x_lat) * np.cos(y_lat) * np.sin(d_lng / 2.0) ** 2)

    return 2 * EARTH_RADIUS * np.arcsin(np.sqrt(a))

haversine_vectorized(x, y)

array([[  0.        , 223.15611622, 181.75847812,  33.74017572],
       [223.15611622,   0.        , 145.77166356, 237.09571474],
       [181.75847812, 145.77166356,   0.        , 173.0832544 ],
       [ 33.74017572, 237.09571474, 173.0832544 ,   0.        ]])

def test_haversine_vect():
    tx_json = {"city":{"4":"Dallas","5":"Houston","33":"Austin","55":"Fort Worth"},
               "state_name":{"4":"Texas","5":"Texas","33":"Texas","55":"Texas"},
               "lat":{"4":32.7935,"5":29.786,"33":30.3005,"55":32.7817},
               "lng":{"4":-96.7667,"5":-95.3885,"33":-97.7522,"55":-97.3474}}
    df = pd.DataFrame(tx_json)
    lat = df['lat'].values
    lng = df['lng'].values
    x = np.array([lat, lng]).T
    y = x.copy()
    answer = haversine_vectorized(x, y)
    tx_distances = np.array([[  0.        , 223.15619132, 181.75853929,  33.74018707],
                           [223.15619132,   0.        , 145.77171262, 237.09579454],
                           [181.75853929, 145.77171262,   0.        , 173.08331265],
                           [ 33.74018707, 237.09579454, 173.08331265,   0.        ]])
    isvect = np.vectorize(isclose)
    assert isvect(answer, tx_distances, abs_tol=1).all(), "distances are incorrect"
    print("Test Passed")

test_haversine_vect()

Test Passed

Speed Check¶

Test with small arrays leads to 10x speedup

%%timeit
haversine_vectorized(x, y)

7.86 µs ± 42 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

%%timeit
hav_dist_no_np(x, y)

62.4 µs ± 434 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Test with large arrays leads to >1000x speedup

%%timeit
haversine_vectorized(np.tile(x.T,1000).T, np.tile(x.T,1000).T)

277 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

%%timeit
hav_dist_no_np(np.tile(x.T,1000).T, np.tile(x.T,1000).T)

1min 1s ± 285 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)