Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
1f24930
added memory optimizations and slight modifications
ParticularMiner Apr 20, 2021
7c4fcfb
fixed backward compatibility issue.
ParticularMiner Apr 20, 2021
3e4c416
defragmented memory allocation in C/C++ code
ParticularMiner Apr 23, 2021
f1da08c
made ntop always flexible (i.e., not only when ntop >= B.shape[1])
ParticularMiner Apr 24, 2021
ee0c206
fixed C++ and cython "docstrings"
ParticularMiner Apr 25, 2021
174cd89
removed redundant code
ParticularMiner Apr 26, 2021
dedaae9
modified comparison2.py for benchmarking
ParticularMiner Apr 28, 2021
78e684c
changed flag return_best_topn to return_best_ntop
ParticularMiner Apr 29, 2021
98721a3
introduced scouting function and removed C/C++ memory-management due to
ParticularMiner Apr 29, 2021
3d9c17e
added percentage increment in timing to comparison2.py
ParticularMiner Apr 29, 2021
3e56698
deleted array_wrappers extension
ParticularMiner Apr 29, 2021
6d79684
made trimming modification to awesome_cossim_topn
ParticularMiner Apr 30, 2021
45b9e73
added new parameter-values for comparison2.py
ParticularMiner May 1, 2021
9614645
performed tests on sample sparse matrices in data files
ParticularMiner May 2, 2021
0447351
introduced a heuristic method to reduce over-estimate nnz_max and
ParticularMiner May 2, 2021
d333f73
fixed docstrings
ParticularMiner May 2, 2021
8cdc9d6
fixed tests and examples
ParticularMiner May 2, 2021
604de3b
further optimized speed by using vector reserve function
ParticularMiner May 3, 2021
d413cb6
added .github/workflows/test.yml to perform tests through GitHub actions
ParticularMiner May 5, 2021
333ac66
added example/mem_prof.py script to be used with mprof (in the package
ParticularMiner May 19, 2021
1661b7f
applied cython's C/C++ to Python exception conversion capabilities
ParticularMiner Jun 1, 2021
4a3cdb6
extended unit-test to cover ArrayWrappers and awesome_cossim_topn option
ParticularMiner Jun 8, 2021
e6838d5
restored .gitignore
ParticularMiner Jun 9, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# CI workflow: installs the package in editable mode and runs the pytest suite
# for every pull request and for pushes to master, on each OS / Python
# combination listed in the matrix.
name: Run tests
on:
  pull_request:
  push:
    branches:
      - master

jobs:
  test:
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        # Versions are quoted so YAML keeps them as strings; an unquoted
        # entry such as 3.10 would be parsed as the float 3.1.
        python-version: ["3.7", "3.8", "3.9"]
        os: [ubuntu-latest, windows-latest]

    steps:
      - uses: actions/checkout@v2

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Install dev-package
        run: |
          python -m pip install --upgrade pip
          pip install -v -e .

      # pandas is installed here because it is only needed at test time.
      - name: Run tests
        run: |
          pip install pytest
          pip install pandas
          pytest -ra --capture=no --showlocals
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ dependency_links.txt
PKG-INFO
SOURCES.txt
top_level.txt
mprofile_20210520051136.dat
.project
.pydevproject
.settings

sparse_dot_topn/sparse.cpp
sparse_dot_topn.egg-info
Expand Down
Binary file added example/Figure_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
74 changes: 70 additions & 4 deletions example/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,45 @@
from __future__ import print_function
import timeit
import numpy as np
from scipy.sparse import rand
from scipy.sparse import coo_matrix
from sparse_dot_topn import awesome_cossim_topn # noqa: F401

N = 10
N = 1000
thresh = 0.01

a = rand(100, 1000000, density=0.005, format='csr')
b = rand(1000000, 200, density=0.005, format='csr')
nr_vocab = 2 << 24
density = 1e-6
n_samples = 1000000
n_duplicates = 1000000
nnz_a = int(n_samples * nr_vocab * density)
nnz_b = int(n_duplicates * nr_vocab * density)


print(f'density = {density}', flush=True)
print(f'nr_vocab = {nr_vocab}', flush=True)
print(f'n_samples = {n_samples}', flush=True)
print(f'n_duplicates = {n_duplicates}', flush=True)
print(f'nnz_a = {nnz_a}', flush=True)
print(f'nnz_b = {nnz_b}', flush=True)
print('\n', flush=True)

rng1 = np.random.RandomState(42)
rng2 = np.random.RandomState(43)

row = rng1.randint(n_samples, size=nnz_a)
cols = rng2.randint(nr_vocab, size=nnz_a)
data = rng1.rand(nnz_a)

a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab))
a = a_sparse.tocsr()

row = rng1.randint(n_duplicates, size=nnz_b)
cols = rng2.randint(nr_vocab, size=nnz_b)
data = rng1.rand(nnz_b)

b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab))
b = b_sparse.T.tocsr()


# top N results per row

Expand All @@ -37,6 +68,41 @@
globals=globals())
print(rtv)

# Time the threaded call for 3 to 7 threads. The statement string is evaluated
# by timeit against this module's globals (a, b, N, thresh), so each iteration
# runs exactly the same call as the former hand-written stanzas.
for n_threads in range(3, 8):
    print(f"Threaded function with {n_threads} threads")

    rtv = timeit.timeit(
        f'awesome_cossim_topn(a, b, N, thresh, True, {n_threads})',
        number=3,
        globals=globals())
    print(rtv)

# use scipy and numpy function


Expand Down
157 changes: 157 additions & 0 deletions example/comparison2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""
This file compare our boosting method with calling scipy+numpy function directly
"""

from __future__ import print_function
import timeit
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix
from sparse_dot_topn import awesome_cossim_topn # noqa: F401

df = pd.DataFrame(columns=['sample', '#threads', 'python'])

N = 4000
thresh = 0.01

nr_vocab = int(26**3)
density = 30/nr_vocab
n_samples = 1000000
n_duplicates = N
nnz_a = int(n_samples * nr_vocab * density)
nnz_b = int(n_duplicates * nr_vocab * density)

print(f'ntop = {N}', flush=True)
print(f'threshold = {thresh}', flush=True)
print(f'density = {density}', flush=True)
print(f'nr_vocab = {nr_vocab}', flush=True)
print(f'n_samples = {n_samples}', flush=True)
print(f'n_duplicates = {n_duplicates}', flush=True)
print(f'nnz_A = {nnz_a}', flush=True)
print(f'nnz_B = {nnz_b}', flush=True)
print('', flush=True)

rng1 = np.random.RandomState(42)

n_matrix_pairs = 2**4
nnz_arr = np.full(n_matrix_pairs, 0)
ntop_arr = np.full(n_matrix_pairs, 0)
r = 0
for it in range(n_matrix_pairs):

row = rng1.randint(n_samples, size=nnz_a)
cols = rng1.randint(nr_vocab, size=nnz_a)
data = rng1.rand(nnz_a)

a_sparse = coo_matrix((data, (row, cols)), shape=(n_samples, nr_vocab))
a = a_sparse.tocsr()

row = rng1.randint(n_duplicates, size=nnz_b)
cols = rng1.randint(nr_vocab, size=nnz_b)
data = rng1.rand(nnz_b)

b_sparse = coo_matrix((data, (row, cols)), shape=(n_duplicates, nr_vocab))
b = b_sparse.T.tocsr()

C, C_ntop = awesome_cossim_topn(a, b, N, thresh, return_best_ntop=True)
print(f'nnz(A*B) = {len(C.data)}', flush=True)
print(f'ntop(A*B) = {C_ntop}', flush=True)
print('', flush=True)
nnz_arr[it] = len(C.data)
ntop_arr[it] = C_ntop
del C
del C_ntop

# top 5 results per row

print("Non-parallelized sparse_dot_topn function")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh)',
number=3,
globals=globals())
df.loc[r] = [it, 0, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 1 thread")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 1)',
number=3,
globals=globals())
df.loc[r] = [it, 1, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 2 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 2)',
number=3,
globals=globals())
df.loc[r] = [it, 2, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 3 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 3)',
number=3,
globals=globals())
df.loc[r] = [it, 3, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 4 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 4)',
number=3,
globals=globals())
df.loc[r] = [it, 4, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 5 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 5)',
number=3,
globals=globals())
df.loc[r] = [it, 5, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 6 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 6)',
number=3,
globals=globals())
df.loc[r] = [it, 6, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print("Threaded function with 7 threads")

rtv = timeit.timeit('awesome_cossim_topn(a, b, N, thresh, True, 7)',
number=3,
globals=globals())
df.loc[r] = [it, 7, rtv]
r += 1
print('sample\t\tpython', flush=True)
print(f'{it}\t\t{rtv:7.4f}', flush=True)

print('')
print(f'nnz(A*B) = {nnz_arr[:(it + 1)].mean()} +/- {nnz_arr[:(it + 1)].std()}')
print(f'ntop(A*B) = {ntop_arr[:(it + 1)].mean()} +/- {ntop_arr[:(it + 1)].std()}')
print('')
df = df.astype({
'sample': np.int64, '#threads': np.int64, 'python': np.float64})
results = df.groupby('#threads', as_index=True, sort=True)[['python']].mean()

print(results)
print('')
print('')
Loading