mirror of
https://github.com/boostorg/bloom.git
synced 2026-01-19 16:12:10 +00:00
* removed superfluous inline (Alexander Grund) * made hasher equivalence a precondition for &=/|= (Andrzej Krzemienski) * documented exception safety guarantees (Andrzej Krzemienski) * mentioned Bloom filters are called so after Burton H Bloom (Dmitry Arkhipov) * added warning about OOM for very small FPR (Ivan Matek) * stressed config chart x axis is capacity/num elements rather than plain capacity (Ivan Matek) * s/[SIMD] is available/is enabled at compile time (Ivan Matek) * shut down clang-tidy warnings (Ivan Matek) * used "set union" for more clarity (Andrzej Krzemienski) * stressed early on that boost::bloom::filter is _not_ a container (Claudio DeSouza) * added bulk operations to roadmap (Dmitry Arkhipov) * added try_insert to roadmap (Konstantin Savvidy) * added estimated_size to roadmap (Konstantin Savvidy) * added alternative filters to roadmap (Konstantin Savvidy) * used <cstdint> instead of <boost/cstdint.hpp> (Rubén Pérez) * mentioned endianness when serializing filters (Rubén Pérez) * corrected sloppiness about optimum k determination (Tomer Vromen) * added run-time specification of k to roadmap (Tomer Vromen) * added test/CMakeLists.txt (Rubén Pérez) * added CMake-based testing to GHA (Rubén Pérez) (#8) * added <boost/bloom.hpp> (Rubén Pérez) * added Codecov reporting (Rubén Pérez) (#9) * moved from boost::unordered::hash_is_avalanching to ContainerHash's boost::hash_is_avalanching (Ivan Matek/Peter Dimov) * added syntax highlighting to code snippets (Rubén Pérez) * avoided C-style casts in examples (Rubén Pérez) * added acknowledgements section (Peter Turcan) * added Getting Started section (Peter Turcan) * fixed example Jamfile and added example building to CI (Rubén Pérez) (#10) * added diagram about overlapping vs. non-overlapping subarrays (Rubén Pérez/Ivan Matek/Vinnie Falco) * made first code snippet self-contained (Rubén Pérez/Peter Turcan) * added more comments to genome.cpp (Rubén Pérez) * added support for arrays as blocks (Tomer Vromen) (#24) * removed emplace (Seth Heeren/Peter Dimov) (#25) * required the allocator to be of unsigned char (Seth Heeren/Peter Dimov) (#26) * added compile-time validation of Block types (Rubén Pérez) (#27) * added value type to displayed filter names in tables (Tomer Vromen) (#28) * used -march=native rather than -mavx2 (Ivan Matek) * adopted hash strategy with fastrange plus a separate MCG (Kostas Savvidis/Peter Dimov) (#30) * several maintenance commits
196 lines
4.9 KiB
C++
196 lines
4.9 KiB
C++
/* Using Boost.Bloom to check occurrence of DNA sequences in a genome.
|
|
*
|
|
* Copyright 2025 Joaquin M Lopez Munoz.
|
|
* Distributed under the Boost Software License, Version 1.0.
|
|
* (See accompanying file LICENSE_1_0.txt or copy at
|
|
* http://www.boost.org/LICENSE_1_0.txt)
|
|
*
|
|
* See https://www.boost.org/libs/bloom for library home page.
|
|
*/
|
|
|
|
#include <array>
|
|
#include <boost/bloom/filter.hpp>
|
|
#include <boost/bloom/fast_multiblock32.hpp>
|
|
#include <cassert>
|
|
#include <cstdint>
|
|
#include <cstdlib>
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <stdexcept>
|
|
#include <string_view>
|
|
|
|
/* A k-mer is a sliding segment of size k over a sequence of DNA nucleotides
|
|
* (A, C, G, T). k_mer encodes the segment in a 64-bit word using 2 bits per
|
|
* nucleotide.
|
|
*/
|
|
|
|
template<std::size_t K>
|
|
struct k_mer
|
|
{
|
|
static_assert(
|
|
K >= 0 &&
|
|
2 * K <= sizeof(std::uint64_t) * CHAR_BIT);
|
|
|
|
static constexpr std::size_t size()
|
|
{
|
|
return K;
|
|
}
|
|
|
|
void reset()
|
|
{
|
|
data = 0;
|
|
}
|
|
|
|
/* shift the k-mer and append a new nucleotide */
|
|
|
|
k_mer& operator+=(char n)
|
|
{
|
|
static constexpr std::uint64_t mask=
|
|
(((std::uint64_t)1) << (2 * size())) - 1;
|
|
|
|
data <<= 2;
|
|
data &= mask;
|
|
data |= table[(unsigned char)n];
|
|
return *this;
|
|
}
|
|
|
|
std::uint64_t data = 0;
|
|
|
|
using table_type=std::array<unsigned char, UCHAR_MAX>;
|
|
|
|
static constexpr table_type table = [] {
|
|
table_type table{};
|
|
table['A'] = table['a'] = 0;
|
|
table['C'] = table['c'] = 1;
|
|
table['G'] = table['g'] = 2;
|
|
table['T'] = table['t'] = 3;
|
|
return table;
|
|
}();
|
|
};
|
|
|
|
template<std::size_t N>
|
|
std::size_t hash_value(const k_mer<N>& km)
|
|
{
|
|
/* k:mer::data is 8 bytes wide. We use it directly as the associated
|
|
* hash value in 64-bit mode, as std::size_t is the same size; in 32-bit
|
|
* mode, we XOR the high and low portions of data to make it fit into
|
|
* a std::size_t.
|
|
*/
|
|
|
|
if constexpr (sizeof(std::size_t) >= sizeof(std::uint64_t)) {
|
|
return (std::size_t)km.data;
|
|
}
|
|
else{ /* 32-bit mode */
|
|
return (std::size_t)(km.data ^ (km.data >> 32));
|
|
}
|
|
}
|
|
|
|
/* Insert all the k-mers of a given genome in a boost::bloom::filter.
|
|
* Assumed format is FASTA with A, C, G, T.
|
|
* https://en.wikipedia.org/wiki/FASTA_format
|
|
*/
|
|
|
|
using genome_filter = boost::bloom::filter<
|
|
k_mer<20>, /* using k-mers of length 20 */
|
|
1, boost::bloom::fast_multiblock32<8> >;
|
|
|
|
genome_filter make_genome_filter(const char* filename)
|
|
{
|
|
using k_mer = genome_filter::value_type;
|
|
|
|
std::ifstream in(filename, std::ios::ate); /* open at end to tell size */
|
|
if(!in) throw std::runtime_error("can't open file");
|
|
|
|
/* As a rough estimation, we assume that the number of k-mers
|
|
* is approximately equal to the length of the genome --this is
|
|
* overpessimistic due to the likely presence of duplicate k-mers.
|
|
* We set FPR = 1%.
|
|
*/
|
|
|
|
genome_filter f((std::size_t)in.tellg(), 0.01);
|
|
in.seekg(0);
|
|
|
|
std::string line;
|
|
std::size_t width = 0;
|
|
k_mer km;
|
|
while(std::getline(in, line)) {
|
|
if(line.empty()) continue;
|
|
if(line[0] == '>') { /* annotation lines in the FASTA format */
|
|
width = 0;
|
|
km.reset();
|
|
continue;
|
|
}
|
|
std::size_t i = 0;
|
|
|
|
/* don't insert km till it has km.size() nucleotides */
|
|
|
|
for(; width< km.size() - 1 && i < line.size(); ++i) {
|
|
km += line[i];
|
|
++width;
|
|
}
|
|
|
|
for(; i < line.size(); ++i) {
|
|
km += line[i];
|
|
f.insert(km);
|
|
}
|
|
}
|
|
return f;
|
|
}
|
|
|
|
/* We estimate a DNA sequence seq to be contained in a genome if all the k-mers
|
|
* of seq are contained. The calculation of the resulting false positive rate
|
|
* is left as an exercise for the reader.
|
|
*/
|
|
|
|
bool may_contain(const genome_filter& f, std::string_view seq)
|
|
{
|
|
using k_mer = genome_filter::value_type;
|
|
|
|
assert(seq.size() >= k_mer::size());
|
|
|
|
k_mer km;
|
|
auto first = seq.begin(), last = seq.end();
|
|
|
|
/* feed first km.size() -1 nucleotides */
|
|
|
|
for(std::size_t i = 0; i < km.size() - 1; ++i) km += *first++;
|
|
|
|
do{
|
|
km += *first++;
|
|
if(!f.may_contain(km)) return false;
|
|
}while(first != last);
|
|
return true;
|
|
}
|
|
|
|
int main()
|
|
{
|
|
try{
|
|
/* Fruit fly genome (Drosophila melanogaster), available at
|
|
* https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001215.4/
|
|
*/
|
|
|
|
auto f=make_genome_filter(
|
|
"GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna");
|
|
|
|
/* Some DNA sequences */
|
|
|
|
const char* seqs[] = {
|
|
"ataaataagattgCGACTCAAAATTAAgcaataacac", /* chr. X */
|
|
"attatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctc", /* chr. 3L */
|
|
"agaATTTACTAAGTACTTCTATGAATGGAATTATTATTGGAAACTCTACAA", /* chr. 4 */
|
|
"ATTTACTAAGTACTTCTATCTGCAAATTAACAATTTATCAAACAACTG", /* not present */
|
|
"ataaataagattgCGACTCAAAAGTAAgcaat" /* mutation in chr. X, not present */
|
|
};
|
|
|
|
int i = 0;
|
|
for(auto seq: seqs){
|
|
std::cout << "check sequence " << i++ << ": "
|
|
<< may_contain(f, seq) << "\n";
|
|
}
|
|
}
|
|
catch(const std::exception& e) {
|
|
std::cerr << e.what() << "\n";
|
|
return EXIT_FAILURE;
|
|
}
|
|
}
|