OpenMS  3.0.0
AhoCorasickAmbiguous.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2022.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/CONCEPT/Macros.h>
39 
40 #include <cassert>
41 #include <functional> // for std::hash
42 #include <limits>
43 #include <queue>
44 #include <string>
45 #include <unordered_map>
46 #include <vector>
47 
48 namespace OpenMS
49 {
52  constexpr char const AAtoChar[28] = {
53  'A', // 00 Ala Alanine
54  'Y', // 01 Tyr Tyrosine
55  'C', // 02 Cys Cysteine
56  'D', // 03 Asp Aspartic Acid // B
57  'N', // 04 Asn Asparagine // B
58  'F', // 05 Phe Phenylalanine
59  'G', // 06 Gly Glycine
60  'H', // 07 His Histidine
61  'I', // 08 Ile Isoleucine // J
62  'L', // 09 Leu Leucine // J
63  'K', // 10 Lys Lysine
64  'W', // 11 Trp Tryptophan
65  'M', // 12 Met Methionine
66  'O', // 13 Pyl Pyrrolysine
67  'P', // 14 Pro Proline
68  'E', // 15 Glu Glutamic Acid // Z
69  'Q', // 16 Gln Glutamine // Z
70  'R', // 17 Arg Arginine
71  'S', // 18 Ser Serine
72  'T', // 19 Thr Threonine
73  'U', // 20 Selenocysteine
74  'V', // 21 Val Valine
75  // ambiguous AAs start here (index: 22...25)
76  'B', // 22 Aspartic Acid, Asparagine $ // the ambAA's need to be consecutive (B,J,Z,X,$)
77  'J', // 23 Leucine, Isoleucine $
78  'Z', // 24 Glutamic Acid, Glutamine $
79  'X', // 25 Unknown
80  // non-regular AA's, which are special
81  '$', // 26 superAA, i.e. it models a mismatch, which can be anything, including AAAs
82  '?', // 27 invalid AA (will usually be skipped) -- must be the last AA (AA::operator++ and others rely on it)
83  };
84 
86  constexpr char const CharToAA[128] = {
87  // ASCII char (7-bit Int with values from 0..127) --> amino acid
88  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 0
89  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 1
90  // $
91  27, 27, 27, 27, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 2
92  27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, // 3
93 
94  // , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
95  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 4
96 
97  // P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
98  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 5
99 
100  // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
101  27, 00, 22, 02, 03, 15, 05, 06, 07, 8, 23, 10, 9, 12, 04, 13, // 6
102 
103  // p, q, r, s, t, u, v, w, x, y, z, , , , , ,
104  14, 16, 17, 18, 19, 20, 21, 11, 25, 01, 24, 27, 27, 27, 27, 27, // 7
105  };
106 
109  struct OPENMS_DLLAPI Hit {
110  using T = uint32_t;
111  Hit() = default;
112  Hit(T needle_index, T needle_length, T query_pos) : needle_index(needle_index), needle_length(needle_length), query_pos(query_pos) {};
113  T needle_index;
116  bool operator==(const Hit& rhs) const
117  {
118  return needle_index == rhs.needle_index && needle_length == rhs.needle_length && query_pos == rhs.query_pos;
119  }
120  };
121 
124  struct OPENMS_DLLAPI AA
125  {
127  constexpr AA() : aa_(AA('?').aa_)
128  {
129  }
130 
134  constexpr explicit AA(const char c) : aa_(CharToAA[(unsigned char)c])
135  {
136  }
137 
139  constexpr uint8_t operator()() const
140  {
141  return aa_;
142  }
143 
145  constexpr bool operator==(const AA other) const
146  {
147  return aa_ == other.aa_;
148  }
149 
151  constexpr bool operator<=(const AA other) const
152  {
153  return aa_ <= other.aa_;
154  }
155 
157  constexpr bool isAmbiguous() const
158  {
159  return aa_ >= AA('B').aa_;
160  }
161 
163  constexpr bool isValid() const
164  {
165  return aa_ != AA('?').aa_;
166  }
167 
169  constexpr bool isValidForPeptide() const
170  {
171  return aa_ <= AA('X').aa_; // '$' or '?'
172  }
173 
175  constexpr AA& operator++()
176  {
177  ++aa_;
178  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
179  return *this;
180  }
181 
183  constexpr AA operator++(int)
184  {
185  AA r(*this);
186  ++aa_;
187  assert(aa_ <= AA('?').aa_); // make sure we don't overflow
188  return r;
189  }
190 
192  constexpr AA operator-(const AA rhs) const
193  {
194  AA r(*this);
195  r.aa_ -= rhs.aa_;
196  return r;
197  }
198 
199  private:
200  uint8_t aa_;
201  };
202 
205  class OPENMS_DLLAPI Index
206  {
207  public:
208  using T = uint32_t;
210  Index() = default;
211 
213  Index(T val) : i_(val) {};
214 
216  bool isInvalid() const;
217 
219  bool isValid() const;
220 
222  T operator()() const;
223 
225  bool operator==(const Index other) const;
226 
228  T& pos();
229 
230  private:
231  T i_ = std::numeric_limits<T>::max();
232  };
233  } // namespace OpenMS
234 
235 // this needs to go into namespace std
236 template<> struct std::hash<OpenMS::Index>
237 {
238  std::size_t operator()(OpenMS::Index const& s) const noexcept
239  {
240  return std::hash<OpenMS::Index::T> {}(s());
241  }
242 };
243 
244 namespace OpenMS
245 {
248  struct OPENMS_DLLAPI ACNode
249  {
251  ACNode() {};
252 
254  ACNode(const AA label, const uint8_t depth) : edge(label)
255  {
256  depth_and_hits.depth = depth;
257  }
258 
260  struct DepthHits {
262  {
263  memset(this, 0, sizeof *this); // make sure bitfields are 0; C++20 allows {} initialization ...
264  };
265  uint8_t has_hit : 1;
266  // we could add another bit here to distinguish between a local hit and suffix hit, but on Windows, this slows it down
267  uint8_t depth : 7;
268  };
269 
270  using ChildCountType = uint8_t;
271 
272  Index suffix {0};
273  Index first_child {0};
274  // there is room for optimization here, by pulling edge labels into a separate vector (allows for SSE-enabled search of children)
275  AA edge {0};
276  ChildCountType nr_children = 0;
278  };
279 
280  // forward declaration
281  struct ACTrieState;
282 
284  struct OPENMS_DLLAPI ACSpawn
285  {
287  ACSpawn() = delete;
288 
290  ACSpawn(std::string::const_iterator query_pos, Index tree_pos, uint8_t max_aa, uint8_t max_mm, uint8_t max_prefix_loss);
291 
293  size_t textPos(const ACTrieState& state) const;
294 
297  AA nextValidAA(const ACTrieState& state);
298 
299  std::string::const_iterator it_query;
301  uint8_t max_aaa_leftover {0};
302  uint8_t max_mm_leftover {0};
303  uint8_t max_prefix_loss_leftover {0};
304  };
305 
308  OPENMS_DLLAPI AA nextValidAA(const std::string::const_iterator end, std::string::const_iterator& it_q);
309 
312  struct OPENMS_DLLAPI ACTrieState
313  {
314  friend ACSpawn;
317  void setQuery(const std::string& haystack);
318 
320  size_t textPos() const;
321 
323  std::string::const_iterator textPosIt() const;
324 
326  const std::string& getQuery() const;
327 
330  AA nextValidAA();
331 
332  std::vector<Hit> hits;
334  std::queue<ACSpawn> spawns;
335 
336  private:
337  std::string query_;
338  std::string::const_iterator it_q_;
339  };
340 
342  class OPENMS_DLLAPI ACTrie
343  {
344  public:
351  ACTrie(uint32_t max_aaa = 0, uint32_t max_mm = 0);
352 
354  ~ACTrie();
355 
359  void addNeedle(const std::string& needle);
360 
364  void addNeedles(const std::vector<std::string>& needles);
365 
368  void addNeedlesAndCompress(const std::vector<std::string>& needles);
369 
377  void compressTrie();
378 
380  size_t getNeedleCount() const;
381 
384  void setMaxAAACount(const uint32_t max_aaa);
385 
387  uint32_t getMaxAAACount() const;
388 
391  void setMaxMMCount(const uint32_t max_mm);
392 
394  uint32_t getMaxMMCount() const;
395 
399  bool nextHits(ACTrieState& state) const;
400 
403  void getAllHits(ACTrieState& state) const;
404 
405  private:
409  bool nextHitsNoClear_(ACTrieState& state) const;
410 
414  Index add_(const Index from, const AA edge);
415 
424  bool addHits_(Index i, const size_t text_pos, std::vector<Hit>& hits) const;
425 
427  bool addHitsSpawn_(Index i, const ACSpawn& spawn, const size_t text_pos, std::vector<Hit>& hits, const int current_spawn_depths) const;
428 
432  Index follow_(const Index i, const AA edge) const;
433 
436  bool followSpawn_(ACSpawn& spawn, const AA edge, ACTrieState& state) const;
437 
440  Index stepMaster_(const Index i, const AA edge, ACTrieState& state) const;
441 
444  bool stepSpawn_(ACSpawn& spawn, ACTrieState& state) const;
445 
448  void createSpawns_(const Index i, const AA fromAA, const AA toAA, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
449 
451  void createSubSpawns_(const ACSpawn& prototype, const AA fromAA, const AA toAA, ACTrieState& state) const;
452 
454  void createMMSpawns_(const Index i, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state, const uint32_t aaa_left, const uint32_t mm_left) const;
455 
457  void createMMSubSpawns_(const ACSpawn& prototype, const AA except_fromAA, const AA except_toAA, const AA except_edge, ACTrieState& state) const;
458 
460  Index findChildNaive_(Index parent, AA child_label);
461 
463  Index findChildBFS_(const Index parent, const AA child_label) const;
464 
465  std::vector<ACNode> trie_;
466  uint32_t needle_count_ {0};
467  uint32_t max_aaa_ {0};
468  uint32_t max_mm_ {0};
469 
471  std::unordered_map<Index, std::vector<uint32_t>> umap_index2needles_;
473  std::unordered_map<Index, std::vector<Index>> umap_index2children_naive_;
474  };
475 
476 } // namespace OpenMS
477 
constexpr AA & operator++()
Pre-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:175
Definition: AhoCorasickAmbiguous.h:109
constexpr bool operator==(const AA other) const
equality operator
Definition: AhoCorasickAmbiguous.h:145
constexpr AA operator-(const AA rhs) const
Decrement operator.
Definition: AhoCorasickAmbiguous.h:192
DepthHits depth_and_hits
depth of node in the tree and one bit if a needle ends in this node or any of its suffices ...
Definition: AhoCorasickAmbiguous.h:277
constexpr bool operator<=(const AA other) const
less-or-equal operator
Definition: AhoCorasickAmbiguous.h:151
std::queue< ACSpawn > spawns
initial spawn points which are currently active and need processing
Definition: AhoCorasickAmbiguous.h:334
Index(T val)
C&#39;tor from T.
Definition: AhoCorasickAmbiguous.h:213
constexpr bool isValidForPeptide() const
is the AA a letter, i.e. A-Z or a-z?
Definition: AhoCorasickAmbiguous.h:169
std::string::const_iterator it_q_
position in query
Definition: AhoCorasickAmbiguous.h:338
internal struct to steal one bit from depth to use as hit indicator
Definition: AhoCorasickAmbiguous.h:260
constexpr bool isValid() const
is AA a letter or &#39;$&#39; ?
Definition: AhoCorasickAmbiguous.h:163
constexpr char const CharToAA[128]
Conversion table from 7-bit ASCII char to internal value representation for an amino acid (AA) ...
Definition: AhoCorasickAmbiguous.h:86
uint32_t T
Definition: AhoCorasickAmbiguous.h:208
std::string query_
current query ( = haystack = text)
Definition: AhoCorasickAmbiguous.h:337
bool operator==(const IDBoostGraph::ProteinGroup &lhs, const IDBoostGraph::ProteinGroup &rhs)
const double c
Definition: Constants.h:209
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
static String suffix(const String &this_s, size_t length)
Definition: StringUtilsSimple.h:156
std::vector< Hit > hits
current hits found
Definition: AhoCorasickAmbiguous.h:332
Definition: AhoCorasickAmbiguous.h:312
bool operator==(const Hit &rhs) const
Definition: AhoCorasickAmbiguous.h:116
uint8_t aa_
internal representation as 1-byte integer
Definition: AhoCorasickAmbiguous.h:200
ACNode()
Default C&#39;tor.
Definition: AhoCorasickAmbiguous.h:251
std::size_t operator()(OpenMS::Index const &s) const noexcept
Definition: AhoCorasickAmbiguous.h:238
constexpr char const AAtoChar[28]
Definition: AhoCorasickAmbiguous.h:52
std::vector< ACNode > trie_
the trie, in either naive structure or BFS order (after compressTrie)
Definition: AhoCorasickAmbiguous.h:465
uint32_t T
Definition: AhoCorasickAmbiguous.h:110
std::string::const_iterator it_query
position in query
Definition: AhoCorasickAmbiguous.h:299
Index tree_pos
position in trie
Definition: AhoCorasickAmbiguous.h:300
ACNode(const AA label, const uint8_t depth)
C&#39;tor from an edge label (from parent to this node) and a depth in the tree.
Definition: AhoCorasickAmbiguous.h:254
constexpr uint8_t operator()() const
yields the internal 8bit representation
Definition: AhoCorasickAmbiguous.h:139
constexpr AA operator++(int)
Post-increment operator (advance to next AA)
Definition: AhoCorasickAmbiguous.h:183
An Aho Corasick trie (a set of nodes with suffix links mainly)
Definition: AhoCorasickAmbiguous.h:342
Index tree_pos
position in trie (for the master)
Definition: AhoCorasickAmbiguous.h:333
Definition: AhoCorasickAmbiguous.h:124
std::unordered_map< Index, std::vector< Index > > umap_index2children_naive_
maps the child nodes of a node for the naive tree; only needed during naive trie construction; storin...
Definition: AhoCorasickAmbiguous.h:473
Definition: AhoCorasickAmbiguous.h:248
DepthHits()
Definition: AhoCorasickAmbiguous.h:261
uint8_t depth
depth of node in the trie
Definition: AhoCorasickAmbiguous.h:267
T needle_length
Definition: AhoCorasickAmbiguous.h:114
std::unordered_map< Index, std::vector< uint32_t > > umap_index2needles_
maps a node to which needles end there (valid for both naive and BFS tree)
Definition: AhoCorasickAmbiguous.h:471
constexpr AA()
Default C&#39;tor; creates an invalid AA.
Definition: AhoCorasickAmbiguous.h:127
Definition: AhoCorasickAmbiguous.h:205
T query_pos
Definition: AhoCorasickAmbiguous.h:115
AA nextValidAA(const std::string::const_iterator end, std::string::const_iterator &it_q)
uint8_t ChildCountType
Definition: AhoCorasickAmbiguous.h:270
friend ACSpawn
Definition: AhoCorasickAmbiguous.h:314
T needle_index
Definition: AhoCorasickAmbiguous.h:112
a spin-off search path through the trie, which can deal with ambiguous AAs and mismatches ...
Definition: AhoCorasickAmbiguous.h:284
constexpr bool isAmbiguous() const
is AA a &#39;B&#39;, &#39;J&#39;, &#39;Z&#39;, &#39;X&#39;, or &#39;$&#39; ?
Definition: AhoCorasickAmbiguous.h:157
constexpr AA(const char c)
Definition: AhoCorasickAmbiguous.h:134