Performance comparison#
Here is a performance comparison using random NumPy and Dask arrays. It uses the same setup as the boost-histogram documentation does for its comparison against NumPy's performance.
This notebook was run on a Dell Precision 5480 (CPU 13th Gen Intel Core i7-13700H x 20 threads).
Recap#
| | numpy | xhistogram | xarray-histogram |
|---|---|---|---|
| Numpy flat | 59 ms | 388 ms | 34 ms |
| Numpy along 1D | | 351 ms | 24 ms |
| Numpy 2 variables | 801 ms | 780 ms | 63 ms |
| Dask flat | | 91 ms | 61 ms |
| Dask along 1D | | 186 ms | 100 ms |
| Dask 2 variables | | 150 ms | 83 ms |
[1]:
import numpy as np
import xarray as xr
import xarray_histogram as xh
from numpy.testing import assert_allclose
from xhistogram.xarray import histogram as xhistogram
Numpy arrays#
One-dimensional histogram (flattened)#
[2]:
# Seeded generator so the random input is reproducible.
rng = np.random.default_rng(seed=10)

# 1000 x 10_000 normal samples, stored as float32.
values = rng.normal(size=(1000, 10_000)).astype(np.float32)
x = xr.DataArray(values, name='test_1d')

# Keyword arguments shared by every histogram call in this section.
hist_kw = {"bins": 100, "range": (-3, 3)}

# Reference counts from numpy; the bin edges are not needed.
answer = np.histogram(values, **hist_kw)[0]
numpy#
[3]:
%%timeit
# Baseline: numpy on the raw array. The check below is part of the timed cell
# body, so its cost is included in the reported time.
h, _ = np.histogram(values, **hist_kw)
assert_allclose(h, answer)
59.2 ms ± 571 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
xhistogram#
[4]:
%%timeit
# xhistogram on the DataArray with the same bins/range; checked against numpy's counts.
h = xhistogram(x, **hist_kw)
assert_allclose(h.values, answer)
388 ms ± 925 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
xarray-histogram#
[5]:
%%timeit
# xarray-histogram on the same DataArray.
# NOTE(review): atol=1 lets each bin count differ from numpy's by about one —
# presumably to absorb differences in how samples on bin edges are assigned; confirm.
h = xh.histogram(x, **hist_kw)
assert_allclose(h.values, answer, atol=1)
33.6 ms ± 506 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Two variables (flattened)#
[6]:
# Two variables of 10 million float32 samples each, one per row.
values = rng.normal(size=(2, 10_000_000)).astype(np.float32)

# One named DataArray per row.
x = [xr.DataArray(row, name=f"test_2d_{i}") for i, row in enumerate(values)]

# The same 100 bins over (-3, 3) for both variables.
hist_kw = {"bins": 100, "range": [(-3, 3), (-3, 3)]}

# Reference 2D counts from numpy; both edge arrays are discarded.
answer = np.histogram2d(*values, **hist_kw)[0]
numpy#
[7]:
%%timeit
# Baseline: numpy's 2D histogram over the two rows of `values`.
h, _, _ = np.histogram2d(*values, **hist_kw)
assert_allclose(h, answer, atol=1)
801 ms ± 2.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
xhistogram#
[8]:
%%timeit
# xhistogram with the two DataArrays passed as separate positional variables.
h = xhistogram(*x, **hist_kw)
assert_allclose(h.values, answer, atol=1)
780 ms ± 882 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
xarray-histogram#
[9]:
%%timeit
# xarray-histogram's two-variable entry point on the same pair of DataArrays.
h = xh.histogram2d(*x, **hist_kw)
assert_allclose(h.values, answer, atol=1)
63.2 ms ± 1.13 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
One-dimensional (along second dimension)#
[10]:
# Draw from the seeded generator `rng` created in the first numpy section, as the
# other numpy sections do, rather than from the global unseeded np.random state,
# so the input is reproducible across runs.
values = rng.normal(size=[3, 3_000_000]).astype(np.float32)
x = xr.DataArray(values, name='test_1d', dims=['t', 'x'])
hist_kw = dict(bins=100, range=(-3, 3))
# No numpy baseline in this section: xhistogram's result along 'x' is the reference.
answer = xhistogram(x, **hist_kw, dim=['x']).load()
xhistogram#
[11]:
%%timeit
# xhistogram computing the histogram along the 'x' dimension only.
# The reference `answer` was itself produced by xhistogram, so this check is a
# self-comparison.
h = xhistogram(x, **hist_kw, dim=['x'])
assert_allclose(h.values, answer.values, atol=1)
351 ms ± 886 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
xarray-histogram#
[12]:
%%timeit
# xarray-histogram along 'x'. The keyword is `dims` here, whereas the xhistogram
# call in this notebook uses `dim`.
h = xh.histogram(x, **hist_kw, dims=['x'])
assert_allclose(h.values, answer.values, atol=1)
24.2 ms ± 46.7 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Dask arrays#
[13]:
import boost_histogram as bh
import dask.array as da
One-dimensional histogram (flattened)#
[14]:
# Chunk length used by the Dask sections; also passed to xhistogram as block_size.
chunk_size = 1_000_000
# 10 million float32 samples in chunks of chunk_size elements. `chunks` is given
# as a plain int: writing `(chunk_size)` is the same int, not a one-element tuple.
values = da.random.normal(size=[10_000_000], chunks=chunk_size).astype(np.float32)
x = xr.DataArray(values, name='test_1d')
# Regular boost-histogram axis: 100 bins from -3 to 3.
ax = bh.axis.Regular(100, -3, 3)
# Reference result: xhistogram on the axis edges, loaded eagerly.
answer = xhistogram(x, bins=ax.edges).load()
xhistogram#
[15]:
%%timeit
# Sanity check before timing that x is not held in memory (presumably: still
# Dask-backed — `_in_memory` is a private xarray attribute).
assert not x._in_memory
# block_size is set to the Dask chunk length.
h = xhistogram(x, bins=ax.edges, block_size=chunk_size)
assert_allclose(h.values, answer)
90.8 ms ± 1.89 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
xarray-histogram#
[16]:
%%timeit
# Same not-in-memory check as in the xhistogram cell.
assert not x._in_memory
# xarray-histogram is given the boost-histogram axis object itself, not its edges.
h = xh.histogram(x, bins=ax)
assert_allclose(h.values, answer)
61.3 ms ± 3.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Two variables (flattened)#
[17]:
# Two lazy variables of 10 million float32 samples, one per row, chunked as (1, chunk_size).
values = da.random.normal(size=(2, 10_000_000), chunks=(1, chunk_size)).astype(np.float32)

# One named DataArray per row.
x = [xr.DataArray(values[i], name=f"test_2d_{i}") for i in (0, 1)]

# One regular axis (100 bins from -3 to 3) per variable ...
axes = [bh.axis.Regular(100, -3, 3) for _ in x]
# ... and the matching edge arrays, passed to xhistogram as `bins`.
edges = [axis.edges for axis in axes]

# Reference result from xhistogram, loaded eagerly.
answer = xhistogram(*x, bins=edges).load()
xhistogram#
[18]:
%%timeit
# Only the first variable is checked for not being in memory; both DataArrays
# are slices of the same Dask array.
assert not x[0]._in_memory
h = xhistogram(*x, bins=edges, block_size=chunk_size)
assert_allclose(h.values, answer)
150 ms ± 836 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
xarray-histogram#
[19]:
%%timeit
# Same not-in-memory check on the first variable.
assert not x[0]._in_memory
# The boost-histogram axis objects are passed directly as bins.
h = xh.histogram2d(*x, bins=axes)
assert_allclose(h.values, answer)
82.7 ms ± 4.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
One-dimensional (along second dimension)#
[20]:
# Lazy 3 x 10_000_000 array of normal samples, chunked as (1, chunk_size).
# NOTE(review): unlike the other sections there is no .astype(np.float32) here,
# so this benchmark presumably runs on Dask's default float dtype — confirm
# whether that is intended before comparing timings across sections.
values = da.random.normal(size=[3, 10_000_000], chunks=(1, chunk_size))
x = xr.DataArray(values, name='test_1d', dims=['t', 'x'])
# Regular boost-histogram axis: 100 bins from -3 to 3.
ax = bh.axis.Regular(100, -3, 3)
# Reference result along 'x' from xhistogram, loaded eagerly.
answer = xhistogram(x, bins=ax.edges, dim=['x']).load()
xhistogram#
[21]:
%%timeit
# xhistogram along 'x' with block_size set to the Dask chunk length.
# NOTE(review): the other Dask cells first assert `not x._in_memory`; there is
# no such guard here.
h = xhistogram(x, bins=ax.edges, dim=['x'], block_size=chunk_size)
assert_allclose(h.values, answer.values)
186 ms ± 3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
xarray-histogram#
[22]:
%%timeit
# xarray-histogram along 'x', given the boost-histogram axis object directly.
# The reference `answer` comes from xhistogram, so this compares the two libraries.
h = xh.histogram(x, bins=ax, dims=['x'])
assert_allclose(h.values, answer.values)
99.6 ms ± 3.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)