I wrote the following Cython code that calculates pairwise distances à la scipy.spatial.distance.cdist:
# cython: infer_types=True
# cython: boundscheck=False
# cython: wraparound=False
cimport numpy as np
cimport cython
import numpy as np
from libc.math cimport sqrt

cdef inline double dist(double[:] a, double[:] b) nogil:
    return sqrt((a[0] - b[0])**2 +
                (a[1] - b[1])**2 +
                (a[2] - b[2])**2)

cpdef double[:, :] dists_inline(double[:, :] a, double[:, :] b):
    cdef unsigned int i, j
    cdef double[:, :] d = np.empty((a.shape[0], b.shape[0]))
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            d[i, j] = dist(a[i], b[j])
    return np.asarray(d)

cpdef double[:, :] dists(double[:, :] a, double[:, :] b):
    cdef unsigned int i, j
    cdef double[:, :] d = np.empty((a.shape[0], b.shape[0]))
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            d[i, j] = sqrt((a[i, 0] - b[j, 0])**2 +
                           (a[i, 1] - b[j, 1])**2 +
                           (a[i, 2] - b[j, 2])**2)
    return np.asarray(d)
import numpy as np
from scipy.spatial.distance import cdist
neigh_k = np.random.random((14, 3))*2 - 1
neigh_k = np.vstack([[0, 0, 0], neigh_k])
k_points = np.random.random((100000, 3))*10 - 5
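For context: I compiled the Cython code in a Jupyter notebook via the %%cython cell magic (which is where the _cython_magic_… prefixes in the generated C shown further down come from), roughly like this:

%load_ext cython

%%cython -a
# cython: infer_types=True
# cython: boundscheck=False
# cython: wraparound=False
# (the rest of the Cython code above goes in this cell; the -a flag
#  produces the annotated view used to inspect the generated C below)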
It manages to achieve the same performance as scipy.spatial.distance.cdist:
%timeit cdist(neigh_k, k_points)
3.94 ms ± 458 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit dists(neigh_k, k_points)
3.74 ms ± 239 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
However, when using the inline version of the function above, the performance drops by a factor of 10.
%timeit dists_inline(neigh_k, k_points)
41.3 ms ± 75.5 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
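The outputs of both versions agree with cdist, so the difference is purely speed; a sanity check along these lines passes:

ref = cdist(neigh_k, k_points)
assert np.allclose(np.asarray(dists(neigh_k, k_points)), ref)
assert np.allclose(np.asarray(dists_inline(neigh_k, k_points)), ref)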
As far as I understand, dists and dists_inline should generate almost the exact same binary.
(Similar question, where simply using memoryviews was the solution: Cython inline function with numpy array as parameter)
The performance is fine, however, if I instead pass the full 2D memoryviews along with the indices:

cdef inline double dist2(double[:, :] a, double[:, :] b, int i, int j) nogil:
    return sqrt((a[i, 0] - b[j, 0])**2 +
                (a[i, 1] - b[j, 1])**2 +
                (a[i, 2] - b[j, 2])**2)
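The corresponding caller (omitted above; the name dists_inline2 is just illustrative) looks like dists_inline apart from the call itself:

cpdef double[:, :] dists_inline2(double[:, :] a, double[:, :] b):
    cdef unsigned int i, j
    cdef double[:, :] d = np.empty((a.shape[0], b.shape[0]))
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            d[i, j] = dist2(a, b, i, j)  # whole views plus indices, no slicing
    return np.asarray(d)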
In the original case, however, it seems some memoryview slice construction is going on. The Cython visualisation shows the following for the call to dist:
__pyx_t_13.data = __pyx_v_a.data;
__pyx_t_13.memview = __pyx_v_a.memview;
__PYX_INC_MEMVIEW(&__pyx_t_13, 0);
{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_i;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_a.strides[0];
    __pyx_t_13.data += __pyx_tmp_idx * __pyx_tmp_stride;
}
__pyx_t_13.shape[0] = __pyx_v_a.shape[1];
__pyx_t_13.strides[0] = __pyx_v_a.strides[1];
__pyx_t_13.suboffsets[0] = -1;
__pyx_t_14.data = __pyx_v_b.data;
__pyx_t_14.memview = __pyx_v_b.memview;
__PYX_INC_MEMVIEW(&__pyx_t_14, 0);
{
    Py_ssize_t __pyx_tmp_idx = __pyx_v_j;
    Py_ssize_t __pyx_tmp_stride = __pyx_v_b.strides[0];
    __pyx_t_14.data += __pyx_tmp_idx * __pyx_tmp_stride;
}
__pyx_t_14.shape[0] = __pyx_v_b.shape[1];
__pyx_t_14.strides[0] = __pyx_v_b.strides[1];
__pyx_t_14.suboffsets[0] = -1;
__pyx_t_15 = __pyx_v_i;
__pyx_t_16 = __pyx_v_j;
*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_d.data + __pyx_t_15 * __pyx_v_d.strides[0]) ) + __pyx_t_16 * __pyx_v_d.strides[1]) )) = __pyx_f_46_cython_magic_2e95e60ce94e9b349e663b4db389bdfa_dist(__pyx_t_13, __pyx_t_14);
__PYX_XDEC_MEMVIEW(&__pyx_t_13, 1);
__pyx_t_13.memview = NULL;
__pyx_t_13.data = NULL;
__PYX_XDEC_MEMVIEW(&__pyx_t_14, 1);
__pyx_t_14.memview = NULL;
__pyx_t_14.data = NULL;
}
}
That is, on every call a temporary 1D slice is constructed and reference-counted (the __PYX_INC_MEMVIEW/__PYX_XDEC_MEMVIEW pair), whereas in the second case only the call itself occurs:
__pyx_t_13 = __pyx_v_i;
__pyx_t_14 = __pyx_v_j;
*((double *) ( /* dim=1 */ (( /* dim=0 */ (__pyx_v_d.data + __pyx_t_13 * __pyx_v_d.strides[0]) ) + __pyx_t_14 * __pyx_v_d.strides[1]) )) = __pyx_f_46_cython_magic_2e95e60ce94e9b349e663b4db389bdfa_dist2(__pyx_v_a, __pyx_v_b, __pyx_v_i, __pyx_v_j);
}
}
The performance does not improve even if I explicitly declare C-contiguousness.
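Concretely, I mean declaring everything with ::1 along these lines (names here are illustrative); this variant is just as slow:

cdef inline double dist_c(double[::1] a, double[::1] b) nogil:
    return sqrt((a[0] - b[0])**2 +
                (a[1] - b[1])**2 +
                (a[2] - b[2])**2)

cpdef double[:, ::1] dists_inline_c(double[:, ::1] a, double[:, ::1] b):
    cdef unsigned int i, j
    cdef double[:, ::1] d = np.empty((a.shape[0], b.shape[0]))
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            d[i, j] = dist_c(a[i], b[j])  # still builds a 1D slice per call
    return np.asarray(d)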
I would expect it to essentially compile to the following if C-contiguousness is declared:
cdef inline double dist3(double* a, double* b) nogil:
    return sqrt((a[0] - b[0])**2 +
                (a[1] - b[1])**2 +
                (a[2] - b[2])**2)

cpdef double[:, ::1] dists_inline3(double[:, ::1] a, double[:, ::1] b):
    cdef unsigned int i, j
    cdef double[:, ::1] d = np.empty((a.shape[0], b.shape[0]))
    for i in range(a.shape[0]):
        for j in range(b.shape[0]):
            d[i, j] = dist3(&a[i, 0], &b[j, 0])
    return np.asarray(d)
This works and has the expected performance. Why does slicing the memoryviews for every call to dist add so much overhead, and is there a way to keep the inline 1D-memoryview version fast without dropping down to raw pointers?