2
0
mirror of https://github.com/boostorg/bloom.git synced 2026-01-19 16:12:10 +00:00
Files
bloom/example/genome.cpp
joaquintides 2592193066 review feedback (#32)
* removed superfluous inline (Alexander Grund)
* made hasher equivalence a precondition for &=/|= (Andrzej Krzemienski)
* documented exception safety guarantees (Andrzej Krzemienski)
* mentioned Bloom filters are called so after Burton H Bloom (Dmitry Arkhipov)
* added warning about OOM for very small FPR (Ivan Matek)
* stressed config chart x axis is capacity/num elements rather than plain capacity (Ivan Matek)
* s/[SIMD] is available/is enabled at compile time (Ivan Matek)
* shut down clang-tidy warnings (Ivan Matek)
* used "set union" for more clarity (Andrzej Krzemienski)
* stressed early on that boost::bloom::filter is _not_ a container (Claudio DeSouza)
* added bulk operations to roadmap (Dmitry Arkhipov)
* added try_insert to roadmap (Konstantin Savvidy)
* added estimated_size to roadmap (Konstantin Savvidy)
* added alternative filters to roadmap (Konstantin Savvidy)
* used <cstdint> instead of <boost/cstdint.hpp> (Rubén Pérez)
* mentioned endianness when serializing filters (Rubén Pérez)
* corrected sloppiness about optimum k determination (Tomer Vromen)
* added run-time specification of k to roadmap (Tomer Vromen)
* added test/CMakeLists.txt (Rubén Pérez)
* added CMake-based testing to GHA (Rubén Pérez) (#8)
* added <boost/bloom.hpp> (Rubén Pérez)
* added Codecov reporting (Rubén Pérez) (#9)
* moved from boost::unordered::hash_is_avalanching to ContainerHash's boost::hash_is_avalanching (Ivan Matek/Peter Dimov)
* added syntax highlighting to code snippets (Rubén Pérez)
* avoided C-style casts in examples (Rubén Pérez)
* added acknowledgements section (Peter Turcan)
* added Getting Started section (Peter Turcan)
* fixed example Jamfile and added example building to CI (Rubén Pérez) (#10)
* added diagram about overlapping vs. non-overlapping subarrays (Rubén Pérez/Ivan Matek/Vinnie Falco)
* made first code snippet self-contained (Rubén Pérez/Peter Turcan)
* added more comments to genome.cpp (Rubén Pérez)
* added support for arrays as blocks (Tomer Vromen) (#24)
* removed emplace (Seth Heeren/Peter Dimov) (#25)
* required the allocator to be of unsigned char (Seth Heeren/Peter Dimov) (#26)
* added compile-time validation of Block types (Rubén Pérez) (#27)
* added value type to displayed filter names in tables (Tomer Vromen) (#28)
* used -march=native rather than -mavx2 (Ivan Matek)
* adopted hash strategy with fastrange plus a separate MCG (Kostas Savvidis/Peter Dimov) (#30)
* several maintenance commits
2025-06-24 23:27:54 +02:00

196 lines
4.9 KiB
C++

/* Using Boost.Bloom to check occurrence of DNA sequences in a genome.
*
* Copyright 2025 Joaquin M Lopez Munoz.
* Distributed under the Boost Software License, Version 1.0.
* (See accompanying file LICENSE_1_0.txt or copy at
* http://www.boost.org/LICENSE_1_0.txt)
*
* See https://www.boost.org/libs/bloom for library home page.
*/
#include <array>
#include <boost/bloom/filter.hpp>
#include <boost/bloom/fast_multiblock32.hpp>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string_view>
/* A k-mer is a sliding segment of size k over a sequence of DNA nucleotides
* (A, C, G, T). k_mer encodes the segment in a 64-bit word using 2 bits per
* nucleotide.
*/
template<std::size_t K>
struct k_mer
{
static_assert(
K >= 0 &&
2 * K <= sizeof(std::uint64_t) * CHAR_BIT);
static constexpr std::size_t size()
{
return K;
}
void reset()
{
data = 0;
}
/* shift the k-mer and append a new nucleotide */
k_mer& operator+=(char n)
{
static constexpr std::uint64_t mask=
(((std::uint64_t)1) << (2 * size())) - 1;
data <<= 2;
data &= mask;
data |= table[(unsigned char)n];
return *this;
}
std::uint64_t data = 0;
using table_type=std::array<unsigned char, UCHAR_MAX>;
static constexpr table_type table = [] {
table_type table{};
table['A'] = table['a'] = 0;
table['C'] = table['c'] = 1;
table['G'] = table['g'] = 2;
table['T'] = table['t'] = 3;
return table;
}();
};
template<std::size_t N>
std::size_t hash_value(const k_mer<N>& km)
{
/* k:mer::data is 8 bytes wide. We use it directly as the associated
* hash value in 64-bit mode, as std::size_t is the same size; in 32-bit
* mode, we XOR the high and low portions of data to make it fit into
* a std::size_t.
*/
if constexpr (sizeof(std::size_t) >= sizeof(std::uint64_t)) {
return (std::size_t)km.data;
}
else{ /* 32-bit mode */
return (std::size_t)(km.data ^ (km.data >> 32));
}
}
/* Insert all the k-mers of a given genome in a boost::bloom::filter.
* Assumed format is FASTA with A, C, G, T.
* https://en.wikipedia.org/wiki/FASTA_format
*/
using genome_filter = boost::bloom::filter<
k_mer<20>, /* using k-mers of length 20 */
1, boost::bloom::fast_multiblock32<8> >;
genome_filter make_genome_filter(const char* filename)
{
using k_mer = genome_filter::value_type;
std::ifstream in(filename, std::ios::ate); /* open at end to tell size */
if(!in) throw std::runtime_error("can't open file");
/* As a rough estimation, we assume that the number of k-mers
* is approximately equal to the length of the genome --this is
* overpessimistic due to the likely presence of duplicate k-mers.
* We set FPR = 1%.
*/
genome_filter f((std::size_t)in.tellg(), 0.01);
in.seekg(0);
std::string line;
std::size_t width = 0;
k_mer km;
while(std::getline(in, line)) {
if(line.empty()) continue;
if(line[0] == '>') { /* annotation lines in the FASTA format */
width = 0;
km.reset();
continue;
}
std::size_t i = 0;
/* don't insert km till it has km.size() nucleotides */
for(; width< km.size() - 1 && i < line.size(); ++i) {
km += line[i];
++width;
}
for(; i < line.size(); ++i) {
km += line[i];
f.insert(km);
}
}
return f;
}
/* We estimate a DNA sequence seq to be contained in a genome if all the k-mers
* of seq are contained. The calculation of the resulting false positive rate
* is left as an exercise for the reader.
*/
bool may_contain(const genome_filter& f, std::string_view seq)
{
using k_mer = genome_filter::value_type;
assert(seq.size() >= k_mer::size());
k_mer km;
auto first = seq.begin(), last = seq.end();
/* feed first km.size() -1 nucleotides */
for(std::size_t i = 0; i < km.size() - 1; ++i) km += *first++;
do{
km += *first++;
if(!f.may_contain(km)) return false;
}while(first != last);
return true;
}
int main()
{
try{
/* Fruit fly genome (Drosophila melanogaster), available at
* https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000001215.4/
*/
auto f=make_genome_filter(
"GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna");
/* Some DNA sequences */
const char* seqs[] = {
"ataaataagattgCGACTCAAAATTAAgcaataacac", /* chr. X */
"attatagggagaaatatgatcgcgtatgcgagagtagtgccaacatattgtgctc", /* chr. 3L */
"agaATTTACTAAGTACTTCTATGAATGGAATTATTATTGGAAACTCTACAA", /* chr. 4 */
"ATTTACTAAGTACTTCTATCTGCAAATTAACAATTTATCAAACAACTG", /* not present */
"ataaataagattgCGACTCAAAAGTAAgcaat" /* mutation in chr. X, not present */
};
int i = 0;
for(auto seq: seqs){
std::cout << "check sequence " << i++ << ": "
<< may_contain(f, seq) << "\n";
}
}
catch(const std::exception& e) {
std::cerr << e.what() << "\n";
return EXIT_FAILURE;
}
}