From 542511e1045fb801092031dcba50d69bdfb02994 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Fri, 9 May 2025 13:42:28 -0700 Subject: [PATCH 1/8] mibf: add set k function --- include/btllib/mi_bloom_filter.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/btllib/mi_bloom_filter.hpp b/include/btllib/mi_bloom_filter.hpp index da2417bd..bea47e76 100644 --- a/include/btllib/mi_bloom_filter.hpp +++ b/include/btllib/mi_bloom_filter.hpp @@ -241,6 +241,9 @@ class MIBloomFilter /** Returns the occurence count for each ID in the miBF */ std::vector<size_t> get_id_occurence_count(const bool& include_saturated); + /** Sets the k-mer size. */ + void set_k(unsigned k) { kmer_size = k; } + /** Returns an a filter size large enough to maintain an occupancy specified */ static size_t calc_optimal_size(size_t entries, From a23fbfe5ff263f9753c0a9781fd78d250ff24525 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Fri, 9 May 2025 13:42:53 -0700 Subject: [PATCH 2/8] update wrappers --- wrappers/python/btllib_wrap.cxx | 93 +++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/wrappers/python/btllib_wrap.cxx b/wrappers/python/btllib_wrap.cxx index a2722021..a2c9efe6 100644 --- a/wrappers/python/btllib_wrap.cxx +++ b/wrappers/python/btllib_wrap.cxx @@ -56460,6 +56460,36 @@ SWIGINTERN PyObject *_wrap_MIBloomFilter8_get_id_occurence_count(PyObject *self, } +SWIGINTERN PyObject *_wrap_MIBloomFilter8_set_k(PyObject *self, PyObject *args) { + PyObject *resultobj = 0; + btllib::MIBloomFilter< uint8_t > *arg1 = (btllib::MIBloomFilter< uint8_t > *) 0 ; + unsigned int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + unsigned int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(self, &argp1,SWIGTYPE_p_btllib__MIBloomFilterT_uint8_t_t, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "MIBloomFilter8_set_k" "', argument " "1"" of type '" "btllib::MIBloomFilter< 
uint8_t > *""'"); + } + arg1 = reinterpret_cast< btllib::MIBloomFilter< uint8_t > * >(argp1); + ecode2 = SWIG_AsVal_unsigned_SS_int(swig_obj[0], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "MIBloomFilter8_set_k" "', argument " "2"" of type '" "unsigned int""'"); + } + arg2 = static_cast< unsigned int >(val2); + (arg1)->set_k(arg2); + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_MIBloomFilter8_calc_optimal_size(PyObject *self, PyObject *args) { PyObject *resultobj = 0; size_t arg1 ; @@ -57554,6 +57584,36 @@ SWIGINTERN PyObject *_wrap_MIBloomFilter16_get_id_occurence_count(PyObject *self } +SWIGINTERN PyObject *_wrap_MIBloomFilter16_set_k(PyObject *self, PyObject *args) { + PyObject *resultobj = 0; + btllib::MIBloomFilter< uint16_t > *arg1 = (btllib::MIBloomFilter< uint16_t > *) 0 ; + unsigned int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + unsigned int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(self, &argp1,SWIGTYPE_p_btllib__MIBloomFilterT_uint16_t_t, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "MIBloomFilter16_set_k" "', argument " "1"" of type '" "btllib::MIBloomFilter< uint16_t > *""'"); + } + arg1 = reinterpret_cast< btllib::MIBloomFilter< uint16_t > * >(argp1); + ecode2 = SWIG_AsVal_unsigned_SS_int(swig_obj[0], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "MIBloomFilter16_set_k" "', argument " "2"" of type '" "unsigned int""'"); + } + arg2 = static_cast< unsigned int >(val2); + (arg1)->set_k(arg2); + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_MIBloomFilter16_calc_optimal_size(PyObject *self, PyObject *args) { PyObject *resultobj = 0; size_t arg1 ; @@ -58648,6 +58708,36 @@ SWIGINTERN PyObject 
*_wrap_MIBloomFilter32_get_id_occurence_count(PyObject *self } +SWIGINTERN PyObject *_wrap_MIBloomFilter32_set_k(PyObject *self, PyObject *args) { + PyObject *resultobj = 0; + btllib::MIBloomFilter< uint32_t > *arg1 = (btllib::MIBloomFilter< uint32_t > *) 0 ; + unsigned int arg2 ; + void *argp1 = 0 ; + int res1 = 0 ; + unsigned int val2 ; + int ecode2 = 0 ; + PyObject *swig_obj[2] ; + + if (!args) SWIG_fail; + swig_obj[0] = args; + res1 = SWIG_ConvertPtr(self, &argp1,SWIGTYPE_p_btllib__MIBloomFilterT_uint32_t_t, 0 | 0 ); + if (!SWIG_IsOK(res1)) { + SWIG_exception_fail(SWIG_ArgError(res1), "in method '" "MIBloomFilter32_set_k" "', argument " "1"" of type '" "btllib::MIBloomFilter< uint32_t > *""'"); + } + arg1 = reinterpret_cast< btllib::MIBloomFilter< uint32_t > * >(argp1); + ecode2 = SWIG_AsVal_unsigned_SS_int(swig_obj[0], &val2); + if (!SWIG_IsOK(ecode2)) { + SWIG_exception_fail(SWIG_ArgError(ecode2), "in method '" "MIBloomFilter32_set_k" "', argument " "2"" of type '" "unsigned int""'"); + } + arg2 = static_cast< unsigned int >(val2); + (arg1)->set_k(arg2); + resultobj = SWIG_Py_Void(); + return resultobj; +fail: + return NULL; +} + + SWIGINTERN PyObject *_wrap_MIBloomFilter32_calc_optimal_size(PyObject *self, PyObject *args) { PyObject *resultobj = 0; size_t arg1 ; @@ -69855,6 +69945,7 @@ SWIGINTERN PyMethodDef SwigPyBuiltin__btllib__MIBloomFilterT_uint8_t_t_methods[] { "get_k", _wrap_MIBloomFilter8_get_k, METH_NOARGS, "" }, { "get_hash_fn", _wrap_MIBloomFilter8_get_hash_fn, METH_NOARGS, "" }, { "get_id_occurence_count", _wrap_MIBloomFilter8_get_id_occurence_count, METH_O, "" }, + { "set_k", _wrap_MIBloomFilter8_set_k, METH_O, "" }, { "calc_optimal_size", (PyCFunction)(void(*)(void))_wrap_MIBloomFilter8_calc_optimal_size, METH_STATIC|METH_VARARGS, "" }, { NULL, NULL, 0, NULL } /* Sentinel */ }; @@ -70104,6 +70195,7 @@ SWIGINTERN PyMethodDef SwigPyBuiltin__btllib__MIBloomFilterT_uint16_t_t_methods[ { "get_k", _wrap_MIBloomFilter16_get_k, METH_NOARGS, "" }, { 
"get_hash_fn", _wrap_MIBloomFilter16_get_hash_fn, METH_NOARGS, "" }, { "get_id_occurence_count", _wrap_MIBloomFilter16_get_id_occurence_count, METH_O, "" }, + { "set_k", _wrap_MIBloomFilter16_set_k, METH_O, "" }, { "calc_optimal_size", (PyCFunction)(void(*)(void))_wrap_MIBloomFilter16_calc_optimal_size, METH_STATIC|METH_VARARGS, "" }, { NULL, NULL, 0, NULL } /* Sentinel */ }; @@ -70353,6 +70445,7 @@ SWIGINTERN PyMethodDef SwigPyBuiltin__btllib__MIBloomFilterT_uint32_t_t_methods[ { "get_k", _wrap_MIBloomFilter32_get_k, METH_NOARGS, "" }, { "get_hash_fn", _wrap_MIBloomFilter32_get_hash_fn, METH_NOARGS, "" }, { "get_id_occurence_count", _wrap_MIBloomFilter32_get_id_occurence_count, METH_O, "" }, + { "set_k", _wrap_MIBloomFilter32_set_k, METH_O, "" }, { "calc_optimal_size", (PyCFunction)(void(*)(void))_wrap_MIBloomFilter32_calc_optimal_size, METH_STATIC|METH_VARARGS, "" }, { NULL, NULL, 0, NULL } /* Sentinel */ }; From e66d9d077e287c5b3c5fed16d3b8b2a54353112e Mon Sep 17 00:00:00 2001 From: jowong04 Date: Fri, 9 May 2025 13:43:12 -0700 Subject: [PATCH 3/8] update tests --- tests/mi_bloom_filter.cpp | 5 +++++ tests/python/test_mi_bloom_filter.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/tests/mi_bloom_filter.cpp b/tests/mi_bloom_filter.cpp index 383d747d..a0f3111c 100644 --- a/tests/mi_bloom_filter.cpp +++ b/tests/mi_bloom_filter.cpp @@ -22,6 +22,11 @@ main() TEST_ASSERT(mi_bf_1.bv_contains({ 100, 200, 300 })); TEST_ASSERT(!mi_bf_1.bv_contains({ 1, 20, 100 })); + unsigned kmer_size = 10; + mi_bf_1.set_k(kmer_size); + unsigned set_kmer_size = mi_bf_1.get_k(); + TEST_ASSERT(set_kmer_size == kmer_size); + uint8_t ID_1 = 12; mi_bf_1.insert_id({ 1, 10, 100 }, ID_1); diff --git a/tests/python/test_mi_bloom_filter.py b/tests/python/test_mi_bloom_filter.py index 8d5c40be..5e91a6b0 100644 --- a/tests/python/test_mi_bloom_filter.py +++ b/tests/python/test_mi_bloom_filter.py @@ -25,6 +25,7 @@ def setUp(self): def set_up_mi_bf_1(self): self.mi_bf_1 = 
btllib.MIBloomFilter8(1024 * 1024, 3, "ntHash") + self.mi_bf_1.set_k() = 10 for h in self.test_hashes_1: self.mi_bf_1.insert_bv(h) @@ -89,6 +90,10 @@ def test_mibloomfilter_id_occurence(self): self.assertEqual(len(self.test_hashes_1[0]), self.mi_bf_1.get_id_occurence_count(include_saturated)[expected_id]) + def test_mibloomfilter_set_k(self): + self.set_up_mi_bf_1() + self.assertEqual(10, self.mi_bf_1.get_k()) + def test_mibloomfilter_random_sampling(self): self.set_up_mi_bf_2() From 8f17857ada32306524a8108274ca647df69ce802 Mon Sep 17 00:00:00 2001 From: jowong04 Date: Fri, 9 May 2025 13:43:54 -0700 Subject: [PATCH 4/8] update docs --- docs/index.html | 2 +- docs/mi__bloom__filter_8hpp_source.html | 80 +++++++++++++------------ 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/docs/index.html b/docs/index.html index 651ffc0f..675dc6b5 100644 --- a/docs/index.html +++ b/docs/index.html @@ -181,7 +181,7 @@

The following are all the available ninja commands which can be run within build directory:

  • ninja clang-format formats the whitespace in code (requires clang-format 8+).
  • -
  • ninja wrap wraps C++ code for Python (requires SWIG 4.0+).
  • +
  • ninja wrap wraps C++ code for Python (requires SWIG ≥4.0 and <4.3).
  • ninja clang-tidy runs clang-tidy on C++ code and makes sure it passes (requires clang-tidy 8+).
  • ninja builds the tests and wrapper libraries / makes sure they compile.
  • ninja test runs the tests.
  • diff --git a/docs/mi__bloom__filter_8hpp_source.html b/docs/mi__bloom__filter_8hpp_source.html index 3c2f07fe..e4a2fc78 100644 --- a/docs/mi__bloom__filter_8hpp_source.html +++ b/docs/mi__bloom__filter_8hpp_source.html @@ -223,46 +223,48 @@
    240
    242 std::vector<size_t> get_id_occurence_count(const bool& include_saturated);
    243
    -
    246 static size_t calc_optimal_size(size_t entries,
    -
    247 unsigned hash_num,
    -
    248 double occupancy);
    -
    249
    -
    250private:
    -
    251 MIBloomFilter(const std::shared_ptr<MIBloomFilterInitializer>& mibfi);
    -
    252 static void save(const std::string& path,
    -
    253 const cpptoml::table& table,
    -
    254 const char* data,
    -
    255 size_t n);
    -
    256 std::vector<uint64_t> get_rank_pos(const uint64_t* hashes) const;
    -
    257 uint64_t get_rank_pos(const uint64_t hash) const
    -
    258 {
    -
    259 return bv_rank_support(hash % il_bit_vector.size());
    -
    260 }
    -
    261 std::vector<T> get_data(const std::vector<uint64_t>& rank_pos) const;
    -
    262 T get_data(const uint64_t& rank) const { return id_array[rank]; }
    -
    263 void set_data(const uint64_t& pos, const T& id);
    -
    264 void set_saturated(const uint64_t* hashes);
    -
    265
    -
    266 size_t id_array_size = 0;
    -
    267 size_t bv_size = 0;
    -
    268 unsigned kmer_size = 0;
    -
    269 unsigned hash_num = 0;
    -
    270 std::string hash_fn;
    -
    271
    -
    272 sdsl::bit_vector bit_vector;
    -
    273 sdsl::bit_vector_il<BLOCKSIZE> il_bit_vector;
    -
    274 sdsl::rank_support_il<1> bv_rank_support;
    -
    275 std::unique_ptr<std::atomic<uint16_t>[]> counts_array;
    -
    276 std::unique_ptr<std::atomic<T>[]> id_array;
    -
    277
    -
    278 bool bv_insertion_completed = false, id_insertion_completed = false;
    -
    279};
    +
    245 void set_k(unsigned k) { kmer_size = k; }
    +
    246
    +
    249 static size_t calc_optimal_size(size_t entries,
    +
    250 unsigned hash_num,
    +
    251 double occupancy);
    +
    252
    +
    253private:
    +
    254 MIBloomFilter(const std::shared_ptr<MIBloomFilterInitializer>& mibfi);
    +
    255 static void save(const std::string& path,
    +
    256 const cpptoml::table& table,
    +
    257 const char* data,
    +
    258 size_t n);
    +
    259 std::vector<uint64_t> get_rank_pos(const uint64_t* hashes) const;
    +
    260 uint64_t get_rank_pos(const uint64_t hash) const
    +
    261 {
    +
    262 return bv_rank_support(hash % il_bit_vector.size());
    +
    263 }
    +
    264 std::vector<T> get_data(const std::vector<uint64_t>& rank_pos) const;
    +
    265 T get_data(const uint64_t& rank) const { return id_array[rank]; }
    +
    266 void set_data(const uint64_t& pos, const T& id);
    +
    267 void set_saturated(const uint64_t* hashes);
    +
    268
    +
    269 size_t id_array_size = 0;
    +
    270 size_t bv_size = 0;
    +
    271 unsigned kmer_size = 0;
    +
    272 unsigned hash_num = 0;
    +
    273 std::string hash_fn;
    +
    274
    +
    275 sdsl::bit_vector bit_vector;
    +
    276 sdsl::bit_vector_il<BLOCKSIZE> il_bit_vector;
    +
    277 sdsl::rank_support_il<1> bv_rank_support;
    +
    278 std::unique_ptr<std::atomic<uint16_t>[]> counts_array;
    +
    279 std::unique_ptr<std::atomic<T>[]> id_array;
    280
    -
    281} // namespace btllib
    -
    282
    -
    283#include "mi_bloom_filter-inl.hpp"
    -
    284
    -
    285#endif
    +
    281 bool bv_insertion_completed = false, id_insertion_completed = false;
    +
    282};
    +
    283
    +
    284} // namespace btllib
    +
    285
    +
    286#include "mi_bloom_filter-inl.hpp"
    +
    287
    +
    288#endif
    Definition aahash.hpp:12
    From 49d1897a9c412a28d02551ce7115440c6529a9fd Mon Sep 17 00:00:00 2001 From: JW <34543031+jwcodee@users.noreply.github.com> Date: Fri, 9 May 2025 13:59:50 -0700 Subject: [PATCH 5/8] Update azure-pipelines.yml --- azure-pipelines.yml | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 522fabc6..de089dd2 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -11,8 +11,16 @@ jobs: - checkout: self submodules: recursive - - script: echo "##vso[task.prependpath]$CONDA/bin" - displayName: Add conda to PATH + - script: | + mkdir -p ~/miniforge3 + curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-x86_64.sh -o ~/miniforge3/miniforge.sh + bash ~/miniforge3/miniforge.sh -b -u -p ~/miniforge3 + rm -rf ~/miniforge3/miniforge.sh + ~/miniforge3/bin/conda init bash + ~/miniforge3/bin/conda init zsh + export CONDA=$(realpath ~/miniforge3/bin) + echo "##vso[task.prependpath]$CONDA" + displayName: Install conda - script: conda create --yes --quiet --name btllib_CI displayName: Create Anaconda environment From 592248b1329df810458444ce0391a025deba28d4 Mon Sep 17 00:00:00 2001 From: JW <34543031+jwcodee@users.noreply.github.com> Date: Fri, 9 May 2025 15:48:07 -0700 Subject: [PATCH 6/8] Update azure-pipelines.yml --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index de089dd2..c291c55d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -28,7 +28,7 @@ jobs: - script: | source activate btllib_CI conda install --yes -c conda-forge mamba - mamba install --yes -c conda-forge -c bioconda libcxx compilers clang llvm clang-format=18 clang-tools boost samtools coreutils xz lrzip meson ninja cmake openmp + mamba install --yes -c conda-forge -c bioconda libcxx compilers clang llvm clang-format=18 clang-tools boost samtools coreutils xz lrzip meson ninja cmake<4 openmp pip 
install gcovr displayName: Install dependencies @@ -101,7 +101,7 @@ jobs: - script: | source activate btllib_CI - mamba install --yes -c conda-forge -c bioconda libcxx compilers llvm clang-format clang-tools boost 'samtools>=1.14' coreutils xz lrzip meson ninja cmake openmp gcovr + mamba install --yes -c conda-forge -c bioconda libcxx compilers llvm clang-format clang-tools boost 'samtools>=1.14' coreutils xz lrzip meson ninja cmake<4 openmp gcovr displayName: 'Install required software' - script: | From 8f7283d4bd23c21fbfd1a61055e89782feffa838 Mon Sep 17 00:00:00 2001 From: JW <34543031+jwcodee@users.noreply.github.com> Date: Sat, 10 May 2025 01:32:48 -0700 Subject: [PATCH 7/8] Update azure-pipelines.yml --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c291c55d..5f6ccc69 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -28,7 +28,7 @@ jobs: - script: | source activate btllib_CI conda install --yes -c conda-forge mamba - mamba install --yes -c conda-forge -c bioconda libcxx compilers clang llvm clang-format=18 clang-tools boost samtools coreutils xz lrzip meson ninja cmake<4 openmp + mamba install --yes -c conda-forge -c bioconda libcxx compilers clang llvm clang-format=18 clang-tools boost samtools coreutils xz lrzip meson ninja 'cmake<4' openmp pip install gcovr displayName: Install dependencies @@ -101,7 +101,7 @@ jobs: - script: | source activate btllib_CI - mamba install --yes -c conda-forge -c bioconda libcxx compilers llvm clang-format clang-tools boost 'samtools>=1.14' coreutils xz lrzip meson ninja cmake<4 openmp gcovr + mamba install --yes -c conda-forge -c bioconda libcxx compilers llvm clang-format clang-tools boost 'samtools>=1.14' coreutils xz lrzip meson ninja 'cmake<4' openmp gcovr displayName: 'Install required software' - script: | From 15879f950f1d17b15f20cb9c1a86b21057bc16f7 Mon Sep 17 00:00:00 2001 From: JW 
<34543031+jwcodee@users.noreply.github.com> Date: Mon, 12 May 2025 16:38:08 -0700 Subject: [PATCH 8/8] Update test_mi_bloom_filter.py --- tests/python/test_mi_bloom_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python/test_mi_bloom_filter.py b/tests/python/test_mi_bloom_filter.py index 5e91a6b0..a02bbaee 100644 --- a/tests/python/test_mi_bloom_filter.py +++ b/tests/python/test_mi_bloom_filter.py @@ -25,7 +25,7 @@ def setUp(self): def set_up_mi_bf_1(self): self.mi_bf_1 = btllib.MIBloomFilter8(1024 * 1024, 3, "ntHash") - self.mi_bf_1.set_k() = 10 + self.mi_bf_1.set_k(10) for h in self.test_hashes_1: self.mi_bf_1.insert_bv(h)