OpenMS  2.7.0
AhoCorasickAmbiguous.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow, Tobias Rausch $
33 // --------------------------------------------------------------------------
34 
35 
36 #pragma once
37 
41 #include <OpenMS/CONCEPT/Macros.h>
42 
43 #ifdef NDEBUG
44 #define DEBUG_ONLY if (false)
45 #else
46 #define DEBUG_ONLY if (false)
47 #endif
48 
49 // the SeqAn implementation comes first. To use the OpenMS interface, see below.
50 
51 namespace seqan
52 {
53 
54  // we will use AAcid Tables from SeqAn 2.x here, since they are complete
55 
56  template <typename T = void>
58  {
59  static char const VALUE[27];
60  };
61  template <typename T>
63  { // B = D|N, Z = E|Q, J = I|L
64  // we re-order the aminoacids such that ambiguous AA's are consecutive (which saves effort during their enumeration)
65  'A', // 00 Ala Alanine
66  'Y', // 23 Tyr Tyrosine ! 1(Y)
67  'C', // 02 Cys Cysteine
68  'D', // 03 Asp Aspartic Acid // B
69  'N', // 13 Asn Asparagine // B ! 4(N)
70  'F', // 05 Phe Phenylalanine
71  'G', // 06 Gly Glycine
72  'H', // 07 His Histidine
73  'I', // 08 Ile Isoleucine // J
74  'L', // 11 Leu Leucine // J ! 9(L)
75  'K', // 10 Lys Lysine ! 10(K)
76  'W', // 22 Trp Tryptophan ! 11(W)
77  'M', // 12 Met Methionine
78  'O', // 14 Pyl Pyrrolysine ! 13(O)
79  'P', // 15 Pro Proline ! 14(P)
80  'E', // 04 Glu Glutamic Acid // Z ! 15(E)
81  'Q', // 16 Gln Glutamine // Z
82  'R', // 17 Arg Arginine
83  'S', // 18 Ser Serine
84  'T', // 19 Thr Threonine
85  'U', // 20 Selenocysteine
86  'V', // 21 Val Valine
87  'B', // 01 Aspartic Acid, Asparagine $ 22(B) // the ambAA's need to be consecutive (B,J,Z,X)
88  'J', // 09 Leucine, Isoleucine $ 23(J)
89  'Z', // 24 Glutamic Acid, Glutamine $
90  'X', // 25 Unknown (25)
91  '*' // 26 Terminator
92  };
93 
94  template <typename T = void>
96  {
97  static char const VALUE[256];
98  };
99 
100  template <typename T>
101  char const OMS_TranslateTableCharToAA_<T>::VALUE[256] =
102  { // char --> amino acid (unsigned char with values from 0..26)
103  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //0
104  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //1
105  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 25, //2
106  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //3
107 
108  // , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
109  25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13, //4
110 
111  // P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
112  14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25, 25, //5
113 
114  // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
115  25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13, //6
116 
117  // p, q, r, s, t, u, v, w, x, y, z, , , , , ,
118  14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25, //7
119 
120  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //8
121  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //9
122  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //10
123  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //11
124  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //12
125  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //13
126  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //14
127  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 //15
128  };
129 
130  template <typename T = void>
132  {
133  static char const VALUE[256];
134  };
135 
136  template <typename T>
137  char const OMS_TranslateTableByteToAA_<T>::VALUE[256] =
138  {
139  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //0
140  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 25, 25, 25, 25, 25, //1
141  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //2
142  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //3
143  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //4
144  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //5
145  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //6
146  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //7
147  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //8
148  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //9
149  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //10
150  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //11
151  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //12
152  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //13
153  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //14
154  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 //15
155  };
156  // --------------------------------------------------------------------------
157  // Amino Acid
158  // --------------------------------------------------------------------------
159  struct AAcid_ {};
160  typedef SimpleType<unsigned char, AAcid_> AAcid;
161 
162  template <> struct ValueSize<AAcid>
163  {
164  typedef uint8_t Type;
165  static const Type VALUE = 27;
166  };
167 
168  template <> struct BitsPerValue<AAcid>
169  {
170  typedef uint8_t Type;
171  static const Type VALUE = 5;
172  };
173 
175  {
176  static const AAcid _result = AAcid('X');
177  return _result;
178  }
179  inline void assign(char & c_target, AAcid const & source)
180  {
181  c_target = OMS_TranslateTableAAToChar_<>::VALUE[source.value];
182  }
183 
184  template <>
185  struct CompareTypeImpl<AAcid, uint8_t>
186  {
187  typedef AAcid Type;
188  };
189 
190  inline void assign(AAcid & target, uint8_t c_source)
191  {
192  target.value = OMS_TranslateTableByteToAA_<>::VALUE[c_source];
193  }
194 
195  template <>
196  struct CompareTypeImpl<AAcid, char>
197  {
198  typedef AAcid Type;
199  };
200 
201  inline void assign(AAcid & target, char c_source)
202  {
203  target.value = OMS_TranslateTableCharToAA_<>::VALUE[(unsigned char)c_source];
204  }
205 
206  typedef String<AAcid, Alloc<void> > AAString; // identical to seqan::Peptide, but without the misnomer (or how would you encode proteins? as seqan::Peptide?)
208 
210 
211  struct FuzzyAC_;
212  typedef Tag<FuzzyAC_> FuzzyAC;
213 
215  template <typename TNeedle>
216  struct Spawn
217  {
218  typedef typename Size<TNeedle>::Type TSize;
219  typedef typename Value<TNeedle>::Type TKeyword;
220  typedef typename Value<TKeyword>::Type TAlphabet;
221  typedef Graph<Automaton<TAlphabet> > TGraph;
222  typedef typename VertexDescriptor<TGraph>::Type TVert;
228 
229  Spawn(TVert init_state, KeyWordLengthType current_depth, KeyWordLengthType aaa_seen, KeyWordLengthType mm_seen) :
230  current_state(init_state),
231  max_depth_decrease(current_depth),
232  ambAA_seen(aaa_seen),
233  mismatches_seen(mm_seen)
234  {}
235 
236  Spawn(const Spawn& other) = default;
237 
238  private:
239  Spawn();
241  };
242 
243  template <typename TNeedle>
245  {
246 
248  : spawns(),
249  data_lastState(0), // a bit of cheating, but we know that root==0
253  {}
254 
255  void reset()
256  {
257  spawns.clear();
258  data_lastState = 0; // a bit of cheating, but we know that root==0
259  clear(hits_endPositions);
260  data_keywordIndex = 0;
261  data_needleLength = 0;
262  }
263 
264  typedef typename Size<TNeedle>::Type TSize;
265  typedef typename Value<TNeedle>::Type TKeyword;
266  typedef typename Value<TKeyword>::Type TAlphabet;
267  typedef Graph<Automaton<TAlphabet> > TGraph;
268  typedef typename VertexDescriptor<TGraph>::Type TVert;
269  typedef __uint8 KeyWordLengthType;
270  typedef typename std::list<Spawn<TNeedle> > Spawns;
271  typedef typename std::list<Spawn<TNeedle> >::iterator SpawnIt;
272  typedef typename std::list<Spawn<TNeedle> >::const_iterator SpawnCIt;
273 
274  // "working" set; changes with every hit
277  // for hit reporting:
278  String<TSize> hits_endPositions;
281  };
282 
283  template <typename TNeedle>
284  class Pattern<TNeedle, FuzzyAC>
285  {
286  private:
287  Pattern(Pattern const& other);
288  Pattern const& operator=(Pattern const & other);
289  public:
291  : nilVal(getNil<TVert>()),
292  max_ambAA(0),
293  max_mmAA(0)
294  {}
295 
296  //typedef typename Size<TNeedle>::Type TSize; // defaults to uint64, but uint32 is enough...
297  typedef uint32_t TSize; // max. number of peptides allowed (4 billion; checked in init())
298  typedef typename Value<TNeedle>::Type TKeyword;
299  typedef typename Value<TKeyword>::Type TAlphabet;
300  typedef Graph<Automaton<TAlphabet> > TGraph;
301  typedef typename VertexDescriptor<TGraph>::Type TVert;
302  typedef __uint8 KeyWordLengthType;
303 
304  // constant after C'tor;
305  const TVert nilVal;
306 
307  // "constant" data, after construction of trie
308  Holder<TNeedle> data_host;
310  String<String<TSize> > data_map_outputNodes;
311  String<KeyWordLengthType> data_node_depth;
314 
315 #ifndef NDEBUG
316  String<TVert> parentMap;
317 #endif
318 
319 
326  void init(TNeedle const& ndl, KeyWordLengthType max_AAA = 3, KeyWordLengthType max_mm = 0)
327  {
328  SEQAN_CHECKPOINT
329  max_ambAA = max_AAA;
330  max_mmAA = max_mm;
331  if (std::numeric_limits<TSize>::max() < length(ndl)) // check 4 billion peptide limit; use length(ndl) directly, since TSize might be too small
332  {
333  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input contains more than 2^32 peptides. Cannot create trie.").c_str(), OpenMS::String(length(ndl)));
334  }
335  TSize ln = (TSize)length(ndl);
336  for (TSize i = 0; i < ln; ++i)
337  {
338  if (length(ndl[i]) > std::numeric_limits<KeyWordLengthType>::max())
339  {
340  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input peptide to FuzzyAC must NOT be longer than 255 chars!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
341  }
342  TSize lni = (TSize)length(ndl[i]);
343  for (TSize j = 0; j < lni; ++j)
344  {
345  if (isAmbiguous(ndl[i][j])) // this check is important -- find() code below relies on no ambiguous chars being present!
346  {
347  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input peptide to FuzzyAC must NOT contain ambiguous amino acids (B/J/Z/X)!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
348  }
349  }
350  }
351  setHost(*this, ndl);
352  }
353 
355  {
356  SEQAN_CHECKPOINT
357  }
358  //____________________________________________________________________________
359  };
360 
361 
363  // Host Metafunctions
365 
366  template <typename TNeedle>
367  struct Host< Pattern<TNeedle, FuzzyAC> >
368  {
369  typedef TNeedle Type;
370  };
371 
372  template <typename TNeedle>
373  struct Host< Pattern<TNeedle, FuzzyAC> const>
374  {
375  typedef TNeedle const Type;
376  };
377 
378 
380  // Functions
382 
387  template<typename T>
388  inline void _getSpawnRange(const AAcid c, T& idxFirst, T& idxLast)
389  {
390  // jump table: // start of spawns // end of spawns (including)
391  static const T jump[4][2] = { { ordValue(AAcid('D')), ordValue(AAcid('N')) }, // B = D,N
392  { ordValue(AAcid('I')), ordValue(AAcid('L')) }, // J = I,L
393  { ordValue(AAcid('E')), ordValue(AAcid('Q')) }, // Z = E,Q
394  { 0 , 21 } }; // X = A..V
395  static const T ord_b = ordValue(AAcid('B'));
396 #ifndef NDEBUG
397  assert(ordValue(AAcid('N')) - ordValue(AAcid('D')) == 1); // must be neighbours
398  assert(ordValue(AAcid('Q')) - ordValue(AAcid('E')) == 1); // must be neighbours
399  assert(ordValue(AAcid('V')) == 21); // make sure the table is ordered as we expect
400  // row of jump table:
401  assert(ord_b == 22); // make sure the table is ordered as we expect
402  assert(ordValue(AAcid('J')) == 23); // make sure the table is ordered as we expect
403  assert(ordValue(AAcid('Z')) == 24); // make sure the table is ordered as we expect
404  assert(ordValue(AAcid('X')) == 25); // make sure the table is ordered as we expect
405 #endif
406  idxFirst = jump[ordValue(c) - ord_b][0];
407  idxLast = jump[ordValue(c) - ord_b][1];
408  }
409 
410 
411  template <typename TNeedle>
413  {
414  //OpenMS::StopWatch sw;
415  //sw.start();
416  typedef typename Position<TNeedle>::Type TPosition;
417  typedef typename Value<TNeedle>::Type TKeyword;
418  typedef typename Value<TKeyword>::Type TAlphabet;
419  typedef Graph<Automaton<TAlphabet> > TGraph;
420  typedef typename VertexDescriptor<TGraph>::Type TVert;
421  TVert nilVal = getNil<TVert>();
422 
423  // Create regular trie
424  createTrie(me.data_graph, me.data_map_outputNodes, host(me));
425 
426  // Create parent map
427  String<TVert> parentMap;
428  String<TAlphabet> parentCharMap;
429  resizeVertexMap(me.data_graph, parentMap);
430  resizeVertexMap(me.data_graph, parentCharMap);
431  // init all nodes to Nil
432  for (TPosition i = 0; i < length(parentMap); ++i) {
433  assignProperty(parentMap, i, nilVal);
434  }
435  typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
436  TEdgeIterator itEd(me.data_graph);
437  //int count_prop = 0;
438  for (; !atEnd(itEd); goNext(itEd)) {
439  // property, vertex , value
440  assignProperty(parentMap, targetVertex(itEd), sourceVertex(itEd));
441  assignProperty(parentCharMap, targetVertex(itEd), label(itEd));
442  //++count_prop;
443  }
444  //std::cout << "\nassigned " << count_prop << " props to " << length(parentMap) << " values\n";
445 #ifndef NDEBUG
446  me.parentMap = parentMap;
447 #endif
448 
449  // Build AC
450  TVert root = getRoot(me.data_graph);
451  // properties....
452  String<TVert> data_map_failurelink;
453  resizeVertexMap(me.data_graph, data_map_failurelink);
454  assignProperty(data_map_failurelink, root, nilVal);
455 
456  resizeVertexMap(me.data_graph, me.data_node_depth);
457  assignProperty(me.data_node_depth, root, 0);
458 
459  // Bfs Traversal
460  typedef typename Iterator<TGraph, BfsIterator>::Type TBfsIterator;
461  TBfsIterator it(me.data_graph, root);
462  typedef typename ValueSize<AAcid>::Type TSize;
463  TSize idxAAFirst, idxAALast;
464  _getSpawnRange('X', idxAAFirst, idxAALast);
465  // create nextMove function for root (point to itself)
466  for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
467  {
468  if (getSuccessor(me.data_graph, root, AAcid(idx)) == nilVal) addEdge(me.data_graph, root, root, AAcid(idx));
469  }
470 
471  goNext(it); // skip root
472 
473  // connectivity statistics
474  //std::map<int, int> connectivity;
475 
476  for (; !atEnd(it); goNext(it))
477  {
478  const typename GetValue<TBfsIterator>::Type itval = *it; // dereferencing *it will give an index into the property array!
479 
480  //++connectivity[outDegree(me.data_graph, itval)];
481 
482  // set depth of current node using: depths(parent) + 1
483  TVert parent = getProperty(parentMap, itval);
484  assignProperty(me.data_node_depth, itval, getProperty(me.data_node_depth, parent) + 1);
485 
486 
490  // sigma: edge label
491  TAlphabet sigma = getProperty(parentCharMap, itval);
492  // take suffix link of parent and try to go down with sigma
493  TVert down = getProperty(data_map_failurelink, parent);
494  while ((down != nilVal) &&
495  (getSuccessor(me.data_graph, down, sigma) == nilVal))
496  {
497  down = getProperty(data_map_failurelink, down);
498  }
499  if (down != nilVal)
500  { // we found an edge to follow down
501  assignProperty(data_map_failurelink, itval, getSuccessor(me.data_graph, down, sigma));
502  // output function
503  String<TPosition> endPositions = getProperty(me.data_map_outputNodes, getProperty(data_map_failurelink, itval));
504  if (!empty(endPositions))
505  {
506  // get current end positions (full path) ...
507  String<TPosition> endPositionsCurrent = getProperty(me.data_map_outputNodes, itval);
508  typedef typename Iterator<String<TPosition>, Rooted >::Type TStringIterator;
509  // .. and append all patterns which are a suffix
510  TStringIterator sit = begin(endPositions);
511  for (;!atEnd(sit); goNext(sit))
512  {
513  appendValue(endPositionsCurrent, *sit);
514  }
515  assignProperty(me.data_map_outputNodes, itval, endPositionsCurrent);
516  }
517  }
518  else { // no suffix exists: point suffix link of current node to root
519  assignProperty(data_map_failurelink, itval, root);
520  }
521 
522  // create nextMove function
523  for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
524  {
525  if (getSuccessor(me.data_graph, itval, AAcid(idx)) == nilVal)
526  { // no child:
527  const TVert& target = getSuccessor(me.data_graph, getProperty(data_map_failurelink, itval) , AAcid(idx));
528  addEdge(me.data_graph, itval, target, AAcid(idx));
529  }
530 
531  }
532  }
533 
534  //sw.stop();
535  //std::cout << OpenMS::String(" Trie build time: ") + sw.getClockTime() + " s (wall), " + sw.getCPUTime() + " s (CPU)).\n\n";
536 
537  /*for (std::map<int,int>::const_iterator it = connectivity.begin(); it != connectivity.end(); ++it)
538  {
539  std::cout << " conn: " << it->first << " : " << it->second << "x\n";
540  }*/
541 
542  }
543 
544  template <typename TNeedle, typename TNeedle2>
545  void setHost(Pattern<TNeedle, FuzzyAC> & me, TNeedle2 const & needle)
546  {
547  SEQAN_CHECKPOINT;
548  SEQAN_ASSERT_NOT(empty(needle));
549  setValue(me.data_host, needle);
550  clear(me.data_graph);
551  clear(me.data_map_outputNodes);
552  _createAcTrie(me);
553  }
554  template <typename TNeedle, typename TNeedle2>
555  inline void setHost(Pattern<TNeedle, FuzzyAC> & me, TNeedle2 & needle)
556  {
557  setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
558  }
559  //____________________________________________________________________________
560 
561  //____________________________________________________________________________
562  template <typename TNeedle>
563  inline typename Size<TNeedle>::Type position(const PatternAuxData<TNeedle>& dh)
564  {
565  return dh.data_keywordIndex;
566  }
567 
568  template <typename TFinder, typename TNeedle>
569  inline void _reportHit(TFinder& finder, const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh) {
570  size_t idx_endPosVec = length(dh.hits_endPositions) - 1;
571  dh.data_keywordIndex = dh.hits_endPositions[idx_endPosVec];
572  resize(dh.hits_endPositions, idx_endPosVec); // pop last hit
573  dh.data_needleLength = length(value(host(me), dh.data_keywordIndex)) - 1;
574  finder -= dh.data_needleLength; // position finder at beginning of hit
575  _setFinderLength(finder, dh.data_needleLength + 1);
576  _setFinderEnd(finder, position(finder) + length(finder)); // end of match within haystack
577  return;
578  }
579 
580  inline bool isAmbiguous(AAcid c)
581  { // relies on the fact that ambiguous AA's are occurring in a block
582  static const typename ValueSize<AAcid>::Type vB = ordValue(AAcid('B')); // D,N
583  static const typename ValueSize<AAcid>::Type vX = ordValue(AAcid('X')); // all
584  return (vB <= ordValue(c) && ordValue(c) <= vX);
585  }
586 
587  inline bool isAmbiguous(const AAString& s)
588  {
589  for (AAStringIterator it = begin(s); it != end(s); ++it)
590  {
591  if (isAmbiguous(*it)) return true;
592  }
593  return false;
594  }
595 
596 
597 
598  // called by spawns only
599  // returns false if it reached the 'root', true otherwise
600  template <typename TNeedle>
603  Spawn<TNeedle>& spawn,
604  const AAcid c) // ALWAYS ambiguous!!!!
605  {
606  assert(isAmbiguous(c));
607 
608  DEBUG_ONLY std::cout << "\n\ntrying to spawn from spawn on AAA: " << c << " with path: " << getPath(me, spawn.current_state) << "\n";
609  typedef typename ValueSize<AAcid>::Type TSize;
610  TSize idxFirst, idxLast;
611  _getSpawnRange(c, idxFirst, idxLast);
612  for (TSize idx = idxFirst + 1; idx <= idxLast; ++idx) // first iteration is left for the parent
613  {
614  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
615  //std::cout << "spawn aa is " << AAcid(idx) << "\n";
616  if (_consumeChar(me, dh, spawn2, AAcid(idx)))
617  {
618  // Spawn2 inherits the depths from its parent
619  dh.spawns.push_front(spawn2);
620  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " AA-seen: " << int(spawn2.ambAA_seen) << "\n";
621  }
622  }
623  bool r = _consumeChar(me, dh, spawn, AAcid(idxFirst));
624  if (r)
625  {
626  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn.current_state) << "' created at d: " << int(spawn.max_depth_decrease) << " AA-seen: " << int(spawn.ambAA_seen) << "\n";
627  }
628  return r;
629  }
630 
631 #ifdef NDEBUG
632  template<class TNeedle> inline std::string getPath(const Pattern<TNeedle, FuzzyAC>& /*me*/, typename Pattern<TNeedle, FuzzyAC>::TVert /*current_state*/)
633  {
634  return "";
635  }
636 #else
638  template<class TNeedle> inline std::string getPath(const Pattern<TNeedle, FuzzyAC>& me, typename Pattern<TNeedle, FuzzyAC>::TVert current_state)
639  {
640  if (getRoot(me.data_graph) == current_state) return "";
641 
642  typename Pattern<TNeedle, FuzzyAC>::TVert suffix_node = getProperty(me.parentMap, current_state);
643  typename Iterator<typename Pattern<TNeedle, FuzzyAC>::TGraph const, OutEdgeIterator>::Type it(me.data_graph, suffix_node);
644  while(targetVertex(it) != current_state) ++it;
645  char c = (label(it));
646  return getPath(me, suffix_node) + c;
647  }
648 #endif
649 
650  template<class TNeedle> inline void addHits(const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh, const Spawn<TNeedle>& spawn)
651  {
652  typedef typename Pattern<TNeedle, FuzzyAC>::TSize TSize;
653  const String<TSize>& needle_hits = getProperty(me.data_map_outputNodes, spawn.current_state);
654  //DEBUG_ONLY std::cout << "spawn at path: " << getPath(me, spawn.current_state) << "\n";
655  if (length(needle_hits))
656  {
657  int path_length = getProperty(me.data_node_depth, spawn.current_state); // == length of current path to spawn
658  int unambiguous_suffix_length = path_length - spawn.max_depth_decrease; // == length of suffix peptide which does not contain AAA
659  DEBUG_ONLY std::cout << " spawn adding hits which are at least " << unambiguous_suffix_length << " chars long (thus contain the AAA).\n";
660 
661  // but only report those which contain the AAA
662  for (auto it = begin(needle_hits); it != end(needle_hits); ++it)
663  {
664  int hit_length = (int)length(value(host(me), *it));
665  if (hit_length < unambiguous_suffix_length) break; // assumption: terminalStateMap is sorted by length of hits! ... uiuiui...
666  DEBUG_ONLY std::cout << " add spawn hit: needle #" << *it << " as " << (value(host(me), *it)) << "\n";
667  append(dh.hits_endPositions, *it); // append hits which still contain the AAA
668  }
669  }
670  }
671  template<class TNeedle> inline void addHits(const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh, const typename Pattern<TNeedle, FuzzyAC>::TVert& current_state)
672  {
673  typedef typename Pattern<TNeedle, FuzzyAC>::TSize TSize;
674  //DEBUG_ONLY std::cout << "master at path: " << getPath(me, current_state) << "\n";
675  const String<TSize>& needle_hits = getProperty(me.data_map_outputNodes, current_state);
676  if (length(needle_hits))
677  {
678  DEBUG_ONLY std::cout << "master's new hits: total " << length(needle_hits) << " hits\n";
679  append(dh.hits_endPositions, getProperty(me.data_map_outputNodes, current_state)); // indices into TNeedle!
680  }
681  }
682 
683 
692  template <typename TNeedle>
695  typename Pattern<TNeedle, FuzzyAC>::TVert& current,
696  const AAcid c)
697  {
698  //DEBUG_ONLY std::cout << "consuming real char " << c << " ";
699  current = getSuccessor(me.data_graph, current, c);
700  assert(current != me.nilVal);
701  if (current == getRoot(me.data_graph)) {
702  return false;
703  }
704  addHits(me, dh, current);
705  return true;
706  }
707 
715  template <typename TNeedle>
718  Spawn<TNeedle>& spawn,
719  const AAcid c)
720  {
721  //DEBUG_ONLY std::cout << "consuming real char " << c << " ";
722  typename Pattern<TNeedle, FuzzyAC>::TVert successor = getSuccessor(me.data_graph, spawn.current_state, c);
723  assert(successor != me.nilVal);
724  // check if prefix was lost
725  if (successor == getRoot(me.data_graph)) return false;
726 
727  if (getProperty(me.data_node_depth, spawn.current_state) >= getProperty(me.data_node_depth, successor))
728  { // went at least one level up (and maybe one down again, hence equality)
729  typedef typename Pattern<TNeedle, FuzzyAC>::KeyWordLengthType KeyWordLengthType;
730  const KeyWordLengthType up_count = 1 + getProperty(me.data_node_depth, spawn.current_state) - getProperty(me.data_node_depth, successor);
731  // the +1 is not valid if we end up at root (where the spawn is dead anyway - see above)
732  if (up_count > spawn.max_depth_decrease)
733  {
734  //DEBUG_ONLY std::cout << "spawn died while going up (AAA out of scope)\n";
735  //spawn.current_state = getRoot(me.data_graph); // reset to root -- not required
736  return false; // this spawn just threw away its reason of existence (i.e. the AAA). Die!
737  }
738  spawn.max_depth_decrease -= up_count;
739  }
740  spawn.current_state = successor;
741  addHits(me, dh, spawn); // use spawn version for length checking!
742  return true;
743  }
744 
745  //
746  // This is called by Spawns only!
747  // Returns false if it reached the 'root', true otherwise
748  template <typename TNeedle>
751  Spawn<TNeedle>& spawn,
752  const AAcid c)
753  {
754  typedef typename ValueSize<AAcid>::Type TSize;
755 
756  bool try_ambAA = spawn.ambAA_seen < me.max_ambAA;
757 
758  if (spawn.mismatches_seen < me.max_mmAA)
759  { // try all AA's
760  TSize idx_first(-1), idx_last(-1);
761  _getSpawnRange(AAcid('X'), idx_first, idx_last);
762  // .. except c or its ambiguities
763  TSize idx_AAfirst(ordValue(c)), idx_AAlast(ordValue(c));
764  if (try_ambAA && isAmbiguous(c))
765  { // if ambAA below will cover it, we will NOT use mismatches (this would be a duplicate)
766  _getSpawnRange(c, idx_AAfirst, idx_AAlast);
767  }
768  for (; idx_first <= idx_last; ++idx_first)
769  {
770  if (idx_first == idx_AAfirst)
771  { // ignore this range
772  idx_first = idx_AAlast;
773  continue;
774  }
775  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
776  //std::cout << "spawn mm is " << AAcid(idx_first) << "\n";
777  if (_consumeChar(me, dh, spawn2, AAcid(idx_first)))
778  { // Spawn2 inherits the depths from its parent
779  ++spawn2.mismatches_seen;
780  dh.spawns.push_front(spawn2);
781  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " MM-seen: " << int(spawn2.mismatches_seen) << "\n";
782  }
783  }
784  }
785  // see if the Spawn can take another ambAA...
786  if (isAmbiguous(c))
787  {
788  if (!try_ambAA) return false; // we are at max and cannot consume more ambAA's; do not even try to create sub-spawns
789  ++spawn.ambAA_seen; // increase ambAA count -- also for sub-Spawns which will follow from here...
790  DEBUG_ONLY std::cout << "\n\ntrying to spawn from spawn on AAA: " << c << " with path: " << getPath(me, spawn.current_state) << "\n";
791  TSize idxFirst, idxLast;
792  _getSpawnRange(c, idxFirst, idxLast);
793  for (; idxFirst < idxLast; ++idxFirst) // last iteration is left for the parent
794  {
795  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
796  //std::cout << "spawn aa is " << AAcid(idx) << "\n";
797  if (_consumeChar(me, dh, spawn2, AAcid(idxFirst)))
798  {
799  // Spawn2 inherits the depths from its parent
800  dh.spawns.push_front(spawn2);
801  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " AA-seen: " << int(spawn2.ambAA_seen) << "\n";
802  }
803  }
804  bool r = _consumeChar(me, dh, spawn, AAcid(idxLast));
805  if (r)
806  {
807  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn.current_state) << "' created at d: " << int(spawn.max_depth_decrease) << " AA-seen: " << int(spawn.ambAA_seen) << "\n";
808  }
809  return r;
810  }
811  // UNambiguous
812  return _consumeChar(me, dh, spawn, c);
813  }
814 
815  // This is called by the master thread only!
816  // returns false if it reached the 'root', true otherwise
817  template <typename TNeedle>
820  const AAcid c)
821  {
822  typedef typename ValueSize<AAcid>::Type TSize;
823  typedef typename Pattern<TNeedle, FuzzyAC>::TVert TVert;
824 
825  bool consider_ambAA = me.max_ambAA > 0;
826 
827  if (me.max_mmAA > 0)
828  { // try all AA's
829  TSize idx_first(-1), idx_last(-1);
830  _getSpawnRange(AAcid('X'), idx_first, idx_last);
831  // .. except c or its ambiguities
832  TSize idx_AAfirst(ordValue(c)), idx_AAlast(ordValue(c));
833  if (consider_ambAA && isAmbiguous(c))
834  {
835  _getSpawnRange(c, idx_AAfirst, idx_AAlast);
836  }
837  for (; idx_first <= idx_last; ++idx_first)
838  {
839  if (idx_first == idx_AAfirst)
840  { // ignore this range
841  idx_first = idx_AAlast;
842  continue;
843  }
844  TVert node_spawn = dh.data_lastState; // last state of master
845  if (_consumeChar(me, dh, node_spawn, AAcid(idx_first))) // call this using master's _consumeChar(), since it might pass through root (which is allowed), but should not die.
846  { // spawn from current position; push front to flag as 'processed' for the current input char
847  // depths is 'current_depth - 1' (must be computed here!); mmAA-count: fixed to 1 (since spawned from master)
848  dh.spawns.push_front(Spawn<TNeedle>(node_spawn, getProperty(me.data_node_depth, node_spawn) - 1, 0, 1));
849  DEBUG_ONLY std::cout << " Init Spawn from Master consuming '" << AAcid(idx_first) << "\n";
850  }
851  }
852  }
853  if (isAmbiguous(c))
854  {
855  if (consider_ambAA)
856  {
857  DEBUG_ONLY std::cout << "found AAA: " << c << "\n";
858 
859  TSize idx_first, idx_last;
860  _getSpawnRange(c, idx_first, idx_last);
861  for (; idx_first <= idx_last; ++idx_first)
862  {
863  TVert node_spawn = dh.data_lastState; // last state of master
864  if (_consumeChar(me, dh, node_spawn, AAcid(idx_first))) // call this using master's _consumeChar(), since it might pass through root (which is allowed), but should not die.
865  { // spawn from current position; push front to flag as 'processed' for the current input char
866  // depths is 'current_depth - 1' (must be computed here!); ambAA-count: fixed to 1 (first AAA, since spawned from master)
867  dh.spawns.push_front(Spawn<TNeedle>(node_spawn, getProperty(me.data_node_depth, node_spawn) - 1, 1, 0));
868  DEBUG_ONLY std::cout << " Init Spawn from Master consuming '" << AAcid(idx_first) << "\n";
869  }
870  }
871  }
872  else
873  { // do not spawn anything
874  DEBUG_ONLY std::cout << " --> Main found AAA, but none allowed. Resetting to root. No spawns created.\n";
875  }
876  dh.data_lastState = getRoot(me.data_graph); // reset Master to root
877  return;
878  } // end: ambiguous
879 
880  // 'c' is UN-ambiguous
881  _consumeChar(me, dh, dh.data_lastState, c); // adds hits as well
882  }
883 
884 
885  template <typename TFinder, typename TNeedle>
886  inline bool find(TFinder& finder, const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh)
887  {
888  if (empty(finder))
889  {
890  _finderSetNonEmpty(finder);
891  }
892  else
893  {
894  finder += dh.data_needleLength; // restore last consumed position in haystack
895  if (!empty(dh.hits_endPositions))
896  { // Process left-over hits
897  _reportHit(finder, me, dh);
898  return true;
899  }
900  // nothing to report at this point
901  ++finder; // advance to next position
902  }
903 
904  while (!atEnd(finder))
905  {
906  const AAcid c = *finder;
907  DEBUG_ONLY std::cout << "\n\n-- consuming " << c << " ---\n";
908  // spawns; do them first, since we might add new (but settled) spawns in main-thread & sub-spawns
909  if (!dh.spawns.empty())
910  {
911  typename PatternAuxData<TNeedle>::SpawnIt it = dh.spawns.begin();
912  //DEBUG_ONLY std::cout << " --> Spawns (" << dh.spawns.size() << " alive):\n";
913  while (it != dh.spawns.end())
914  {
915  if (!_spawnConsumeChar(me, dh, *it, c)) // might create new spawns
916  { // spawn reached root --> kill it
917  it = dh.spawns.erase(it); // remove and advance
918  //DEBUG_ONLY std::cout << " Killed spawn (" << dh.spawns.size() << " alive):\n";
919  }
920  else
921  { // next spawn
922  ++it;
923  }
924  }
925  }
926  // main thread
927  DEBUG_ONLY std::cout << " --> Main; d: " << int(getProperty(me.data_node_depth, dh.data_lastState)) << ")\n";
928  _masterConsumeChar(me, dh, c); // might create new spawns
929  DEBUG_ONLY std::cout << " <-- Main end; d: " << int(getProperty(me.data_node_depth, dh.data_lastState)) << ")\n";
930 
931  // print current states
932  DEBUG_ONLY std::cout << " --> POST: Main state: " << getPath(me, dh.data_lastState) << "\n";
933  for (typename PatternAuxData<TNeedle>::SpawnIt it = dh.spawns.begin(); it != dh.spawns.end(); ++it)
934  {
935  DEBUG_ONLY std::cout << " --> POST: Spawn state: " << getPath(me, it->current_state) << "\n";
936  }
937 
938  if (!empty(dh.hits_endPositions))
939  {
940  _reportHit(finder, me, dh);
941  return true;
942  }
943 
944  ++finder;
945  }
946  return false;
947  }
948 
949 }// namespace SEQAN_NAMESPACE_MAIN
950 
951 namespace OpenMS
952 {
953 
955  // FuzzyAC
957 
973  {
974  public:
975  typedef typename ::seqan::StringSet<::seqan::AAString> PeptideDB;
976  typedef typename ::seqan::Pattern<PeptideDB, ::seqan::FuzzyAC> FuzzyACPattern;
977 
993  static void initPattern(const PeptideDB& pep_db, const int aaa_max, const int mm_max, FuzzyACPattern& pattern)
994  {
995  pattern.init(pep_db, KeyWordLengthType(aaa_max), KeyWordLengthType(mm_max));
996  }
997 
1003  : finder_(),
1004  protein_(),
1005  dh_()
1006  {
1007  }
1015  AhoCorasickAmbiguous(const String& protein_sequence)
1016  : finder_(),
1017  protein_(),
1018  dh_()
1019  {
1020  setProtein(protein_sequence);
1021  }
1022 
1026  void setProtein(const String& protein_sequence)
1027  {
1028  protein_ = protein_sequence.c_str(); // we need an internal copy, since finder_ only keeps a pointer
1029  finder_ = ::seqan::Finder<seqan::AAString>(protein_);
1030  dh_.reset();
1031  }
1032 
1039  bool findNext(const FuzzyACPattern& pattern)
1040  {
1041  return ::seqan::find(finder_, pattern, dh_);
1042  }
1043 
1050  {
1052  }
1053 
1060  {
1061  return (Int)position(finder_);
1062  }
1063 
1064  private:
1065  typedef typename FuzzyACPattern::KeyWordLengthType KeyWordLengthType;
1066 
1067  // member
1068  ::seqan::Finder<seqan::AAString> finder_;
1071  }; // class FuzzyAC
1072 
1073 } // namespace OpenMS
1074 
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i....
Definition: AhoCorasickAmbiguous.h:973
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:975
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1026
::seqan::Finder< seqan::AAString > finder_
locate the next peptide hit in protein
Definition: AhoCorasickAmbiguous.h:1068
AhoCorasickAmbiguous(const String &protein_sequence)
Prepare to start searching for hits in a new protein sequence.
Definition: AhoCorasickAmbiguous.h:1015
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1059
::seqan::PatternAuxData< PeptideDB > dh_
auxiliary data to hold a state after searching
Definition: AhoCorasickAmbiguous.h:1070
FuzzyACPattern::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:1065
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:993
AhoCorasickAmbiguous()
Default Ctor; call setProtein() before using findNext().
Definition: AhoCorasickAmbiguous.h:1002
::seqan::AAString protein_
the protein sequence - we need to store it since the finder only keeps a pointer to protein when cons...
Definition: AhoCorasickAmbiguous.h:1069
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1039
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1049
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:976
Invalid value exception.
Definition: Exception.h:329
A more convenient string class.
Definition: String.h:61
Definition: AhoCorasickAmbiguous.h:285
String< TVert > parentMap
allows to find parent of each node
Definition: AhoCorasickAmbiguous.h:316
void init(TNeedle const &ndl, KeyWordLengthType max_AAA=3, KeyWordLengthType max_mm=0)
Pattern Ctor with vector of needles, i.e. keywords/peptides.
Definition: AhoCorasickAmbiguous.h:326
Pattern()
Definition: AhoCorasickAmbiguous.h:290
Holder< TNeedle > data_host
holds needles, i.e. Peptides
Definition: AhoCorasickAmbiguous.h:308
TGraph data_graph
regular trie data
Definition: AhoCorasickAmbiguous.h:309
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:300
String< String< TSize > > data_map_outputNodes
regular trie data – plus: this gets augmented with all suffix traversals which are output nodes
Definition: AhoCorasickAmbiguous.h:310
Pattern(Pattern const &other)
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:299
KeyWordLengthType max_ambAA
ambiguity; default: 3
Definition: AhoCorasickAmbiguous.h:312
const TVert nilVal
NULL pointer for trie; e.g. returned when no successor is found.
Definition: AhoCorasickAmbiguous.h:305
KeyWordLengthType max_mmAA
mismatches; default: 0
Definition: AhoCorasickAmbiguous.h:313
uint32_t TSize
Definition: AhoCorasickAmbiguous.h:297
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:301
~Pattern()
Definition: AhoCorasickAmbiguous.h:354
String< KeyWordLengthType > data_node_depth
depths of each graph node
Definition: AhoCorasickAmbiguous.h:311
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:298
Pattern const & operator=(Pattern const &other)
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:302
int Int
Signed integer type.
Definition: Types.h:102
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
const double c
Definition: Constants.h:209
void append(const T &i, String &target)
Definition: StringUtils.h:119
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Definition: AhoCorasickAmbiguous.h:52
TNeedle Type
Definition: AhoCorasickAmbiguous.h:369
bool _spawnConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:749
AAcid unknownValueImpl(AAcid *)
Definition: AhoCorasickAmbiguous.h:174
void addHits(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const Spawn< TNeedle > &spawn)
Definition: AhoCorasickAmbiguous.h:650
TNeedle const Type
Definition: AhoCorasickAmbiguous.h:375
void _reportHit(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:569
bool _consumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, typename Pattern< TNeedle, FuzzyAC >::TVert &current, const AAcid c)
Universal UN-ambiguous char consumer. Works for Master and Spawns.
Definition: AhoCorasickAmbiguous.h:693
void _getSpawnRange(const AAcid c, T &idxFirst, T &idxLast)
given an ambAA c, return a range of AA's (including idxLast) which need to be spawned.
Definition: AhoCorasickAmbiguous.h:388
Tag< FuzzyAC_ > FuzzyAC
Definition: AhoCorasickAmbiguous.h:211
void _createAcTrie(Pattern< TNeedle, FuzzyAC > &me)
Definition: AhoCorasickAmbiguous.h:412
bool find(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:886
void _masterConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const AAcid c)
Definition: AhoCorasickAmbiguous.h:818
void setHost(Pattern< TNeedle, FuzzyAC > &me, TNeedle2 const &needle)
Definition: AhoCorasickAmbiguous.h:545
std::string getPath(const Pattern< TNeedle, FuzzyAC > &me, typename Pattern< TNeedle, FuzzyAC >::TVert current_state)
for debug only
Definition: AhoCorasickAmbiguous.h:638
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:580
AAcid Type
Definition: AhoCorasickAmbiguous.h:187
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
SimpleType< unsigned char, AAcid_ > AAcid
Definition: AhoCorasickAmbiguous.h:160
bool _createSecondarySpawns(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:601
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:563
void assign(char &c_target, AAcid const &source)
Definition: AhoCorasickAmbiguous.h:179
Iterator< const AAString, Rooted >::Type AAStringIterator
Definition: AhoCorasickAmbiguous.h:207
Definition: AhoCorasickAmbiguous.h:159
uint8_t Type
Definition: AhoCorasickAmbiguous.h:170
Definition: AhoCorasickAmbiguous.h:58
static char const VALUE[27]
Definition: AhoCorasickAmbiguous.h:59
Definition: AhoCorasickAmbiguous.h:132
static char const VALUE[256]
Definition: AhoCorasickAmbiguous.h:133
Definition: AhoCorasickAmbiguous.h:96
static char const VALUE[256]
Definition: AhoCorasickAmbiguous.h:97
Definition: AhoCorasickAmbiguous.h:245
std::list< Spawn< TNeedle > >::const_iterator SpawnCIt
Definition: AhoCorasickAmbiguous.h:272
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:267
PatternAuxData()
Definition: AhoCorasickAmbiguous.h:247
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:266
TSize data_needleLength
Last length of needle to reposition finder.
Definition: AhoCorasickAmbiguous.h:280
std::list< Spawn< TNeedle > >::iterator SpawnIt
Definition: AhoCorasickAmbiguous.h:271
std::list< Spawn< TNeedle > > Spawns
Definition: AhoCorasickAmbiguous.h:270
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:268
void reset()
Definition: AhoCorasickAmbiguous.h:255
TSize data_keywordIndex
Current keyword that produced a hit.
Definition: AhoCorasickAmbiguous.h:279
TVert data_lastState
Last state of master instance in the trie.
Definition: AhoCorasickAmbiguous.h:276
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:265
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:269
String< TSize > hits_endPositions
All remaining keyword indices.
Definition: AhoCorasickAmbiguous.h:278
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:264
Spawns spawns
spawn instances currently walking the tree
Definition: AhoCorasickAmbiguous.h:275
state of an AC spawn, operating on a trie
Definition: AhoCorasickAmbiguous.h:217
TVert current_state
Definition: AhoCorasickAmbiguous.h:224
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:221
KeyWordLengthType ambAA_seen
number of ambAA's which the spawn has seen
Definition: AhoCorasickAmbiguous.h:226
Spawn(const Spawn &other)=default
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:220
Pattern< TNeedle, FuzzyAC >::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:223
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:222
KeyWordLengthType max_depth_decrease
maximum loss in depths of traversed nodes (both while reporting hits and changing its own state)
Definition: AhoCorasickAmbiguous.h:225
Spawn(TVert init_state, KeyWordLengthType current_depth, KeyWordLengthType aaa_seen, KeyWordLengthType mm_seen)
Definition: AhoCorasickAmbiguous.h:229
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:219
Spawn & operator=(const Spawn &)
KeyWordLengthType mismatches_seen
number of mismatches the spawn has seen
Definition: AhoCorasickAmbiguous.h:227
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:218
uint8_t Type
Definition: AhoCorasickAmbiguous.h:164