Welcome, guest | Sign In | My Account | Store | Cart

Three ways to calculate a distance matrix out of a list of n-dimensional points using scipy.

Python, 73 lines
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from scipy import reshape, sqrt, identity

# nDimPoints: list of n-dim tuples
# distFunc: calculates the distance based on the differences
# Ex: Manhatten would be: distFunc=sum(deltaPoint[d] for d in xrange(len(deltaPoint)
def calcDistanceMatrix(nDimPoints, 
                       distFunc=lambda deltaPoint: sqrt(sum(deltaPoint[d]**2 for d in xrange(len(deltaPoint))))):
    nDimPoints = array(nDimPoints)
    dim = len(nDimPoints[0])
    delta = [None]*dim
    for d in xrange(dim):
        data = nDimPoints[:,d]
        delta[d] = data - reshape(data,(len(data),1)) # computes all possible combinations

    dist = distFunc(delta)
    dist = dist + identity(len(data))*dist.max() # eliminate self matching
    # dist is the matrix of distances from one coordinate to any other
    return dist

from numpy.matlib import repmat, repeat
def calcDistanceMatrixFastEuclidean(points):
    numPoints = len(points)
    distMat = sqrt(sum((repmat(points, numPoints, 1) - repeat(points, numPoints, axis=0))**2, axis=1))
    return distMat.reshape((numPoints,numPoints))

from numpy import mat, zeros, newaxis
def calcDistanceMatrixFastEuclidean2(nDimPoints):
    nDimPoints = array(nDimPoints)
    n,m = nDimPoints.shape
    delta = zeros((n,n),'d')
    for d in xrange(m):
        data = nDimPoints[:,d]
        delta += (data - data[:,newaxis])**2
    return sqrt(delta)

#################
# Unittest
#################
class CalcDistanceMatrixTestCase(unittest.TestCase):
    def setUp(self):
        self.distanceMatrixFunc = "calcDistanceMatrix"
    
    def test_2D(self):
        points = [[0, 0], [1, 1], [4, 5]]
        dm = eval("%s(points)"%self.distanceMatrixFunc)
        self.assertAlmostEqual(1.414213562373095049, dm[0][1])
        self.assertAlmostEqual(6.403124237432848686, dm[0][2])
        self.assertAlmostEqual(5, dm[1][2])
        self._testSymmetry(dm)
        
    def test_3D(self):
        points = [[0, 0, 0], [1.0, 1, 1], [4, 5, 6], [10,10,10]]
        dm = eval("%s(points)"%self.distanceMatrixFunc)
        self.assertAlmostEqual(1.732050807568877294, dm[0][1])
        self.assertAlmostEqual(8.77496438739212206, dm[0][2])
        self.assertAlmostEqual(17.32050807568877294, dm[0][3])
        self.assertAlmostEqual(7.071067811865475244, dm[1][2])
        self.assertAlmostEqual(15.58845726811989564, dm[1][3])
        self.assertAlmostEqual(8.77496438739212206, dm[2][3])
        self._testSymmetry(dm)
        
    def _testSymmetry(self, dm):
        for i in range(len(dm)):
            for j in range(len(dm)):
                self.assertEqual(dm[i][j], dm[j][i])
        
class CalcDistanceMatrixFastTestCase(CalcDistanceMatrixTestCase):
    def setUp(self):
        self.distanceMatrixFunc = "calcDistanceMatrixFastEuclidean"
 
class CalcDistanceMatrixFast2TestCase(CalcDistanceMatrixTestCase):
    def setUp(self):
        self.distanceMatrixFunc = "calcDistanceMatrixFastEuclidean2"
 

The first algorithm is taken from the very good tutorial http://stsdas.stsci.edu/perry/pydatatut.pdf (chapter 3.7.7 "Finding nearest neighbors") and adapted to the n-dimensional case. Thanks to Keir Mierle for the ...FastEuclidean... functions, which are faster than calcDistanceMatrix by using euclidean distance directly.