OpenMS  2.5.0
AhoCorasickAmbiguous.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow, Tobias Rausch $
33 // --------------------------------------------------------------------------
34 
35 
36 #pragma once
37 
41 #include <OpenMS/CONCEPT/Macros.h>
42 
43 #ifdef NDEBUG
44 #define DEBUG_ONLY if (false)
45 #else
46 #define DEBUG_ONLY if (false)
47 #endif
48 
49 // the SeqAn implementation comes first. To use the OpenMS interface, see below.
50 
51 namespace seqan
52 {
53 
54  // we will use AAcid Tables from SeqAn 2.x here, since they are complete
55 
56  template <typename T = void>
58  {
59  static char const VALUE[27];
60  };
61  template <typename T>
63  { // B = D|N, Z = E|Q, J = I|L
64  // we re-order the aminoacids such that ambiguous AA's are consecutive (which saves effort during their enumeration)
65  'A', // 00 Ala Alanine
66  'Y', // 23 Tyr Tyrosine ! 1(Y)
67  'C', // 02 Cys Cysteine
68  'D', // 03 Asp Aspartic Acid // B
69  'N', // 13 Asn Asparagine // B ! 4(N)
70  'F', // 05 Phe Phenylalanine
71  'G', // 06 Gly Glycine
72  'H', // 07 His Histidine
73  'I', // 08 Ile Isoleucine // J
74  'L', // 11 Leu Leucine // J ! 9(L)
75  'K', // 10 Lys Lysine ! 10(K)
76  'W', // 22 Trp Tryptophan ! 11(W)
77  'M', // 12 Met Methionine
78  'O', // 14 Pyl Pyrrolysine ! 13(O)
79  'P', // 15 Pro Proline ! 14(P)
80  'E', // 04 Glu Glutamic Acid // Z ! 15(E)
81  'Q', // 16 Gln Glutamine // Z
82  'R', // 17 Arg Arginine
83  'S', // 18 Ser Serine
84  'T', // 19 Thr Threonine
85  'U', // 20 Selenocysteine
86  'V', // 21 Val Valine
87  'B', // 01 Aspartic Acid, Asparagine $ 22(B) // the AmbAA's need to be consecutive (B,J,Z,X)
88  'J', // 09 Leucine, Isoleucine $ 23(J)
89  'Z', // 24 Glutamic Acid, Glutamine $
90  'X', // 25 Unknown (25)
91  '*' // 26 Terminator
92  };
93 
94  template <typename T = void>
96  {
97  static char const VALUE[256];
98  };
99 
100  template <typename T>
101  char const OMS_TranslateTableCharToAA_<T>::VALUE[256] =
102  { // char --> amino acid (unsigned char with values from 0..26)
103  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //0
104  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //1
105  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 25, //2
106  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //3
107 
108  // , A, B, C, D, E, F, G, H, I, J, K, L, M, N, O,
109  25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13, //4
110 
111  // P, Q, R, S, T, U, V, W, X, Y, Z, , , , , ,
112  14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25, 25, //5
113 
114  // , a, b, c, d, e, f, g, h, i, j, k, l, m, n, o,
115  25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13, //6
116 
117  // p, q, r, s, t, u, v, w, x, y, z, , , , , ,
118  14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25, //7
119 
120  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //8
121  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //9
122  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //10
123  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //11
124  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //12
125  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //13
126  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //14
127  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 //15
128  };
129 
130  template <typename T = void>
132  {
133  static char const VALUE[256];
134  };
135 
136  template <typename T>
137  char const OMS_TranslateTableByteToAA_<T>::VALUE[256] =
138  {
139  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, //0
140  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 25, 25, 25, 25, 25, //1
141  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //2
142  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //3
143  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //4
144  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //5
145  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //6
146  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //7
147  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //8
148  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //9
149  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //10
150  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //11
151  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //12
152  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //13
153  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, //14
154  25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25 //15
155  };
156  // --------------------------------------------------------------------------
157  // Amino Acid
158  // --------------------------------------------------------------------------
159  struct AAcid_ {};
160  typedef SimpleType<unsigned char, AAcid_> AAcid;
161 
162  template <> struct ValueSize<AAcid>
163  {
164  typedef uint8_t Type;
165  static const Type VALUE = 27;
166  };
167 
168  template <> struct BitsPerValue<AAcid>
169  {
170  typedef uint8_t Type;
171  static const Type VALUE = 5;
172  };
173 
175  {
176  static const AAcid _result = AAcid('X');
177  return _result;
178  }
179  inline void assign(char & c_target, AAcid const & source)
180  {
181  c_target = OMS_TranslateTableAAToChar_<>::VALUE[source.value];
182  }
183 
184  template <>
185  struct CompareTypeImpl<AAcid, uint8_t>
186  {
187  typedef AAcid Type;
188  };
189 
190  inline void assign(AAcid & target, uint8_t c_source)
191  {
192  target.value = OMS_TranslateTableByteToAA_<>::VALUE[c_source];
193  }
194 
195  template <>
196  struct CompareTypeImpl<AAcid, char>
197  {
198  typedef AAcid Type;
199  };
200 
201  inline void assign(AAcid & target, char c_source)
202  {
203  target.value = OMS_TranslateTableCharToAA_<>::VALUE[(unsigned char)c_source];
204  }
205 
206  typedef String<AAcid, Alloc<void> > AAString; // identical to seqan::Peptide, but without the misnomer (or how would you encode proteins? as seqan::Peptide?)
208 
210 
211  struct FuzzyAC_;
212  typedef Tag<FuzzyAC_> FuzzyAC;
213 
215  template <typename TNeedle>
216  struct Spawn
217  {
218  typedef typename Size<TNeedle>::Type TSize;
219  typedef typename Value<TNeedle>::Type TKeyword;
221  typedef Graph<Automaton<TAlphabet> > TGraph;
222  typedef typename VertexDescriptor<TGraph>::Type TVert;
228 
229  Spawn(TVert init_state, KeyWordLengthType current_depth, KeyWordLengthType aaa_seen, KeyWordLengthType mm_seen) :
230  current_state(init_state),
231  max_depth_decrease(current_depth),
232  ambAA_seen(aaa_seen),
233  mismatches_seen(mm_seen)
234  {}
235 
236  private:
237  Spawn();
238  Spawn& operator=(const Spawn&);
239  };
240 
241  template <typename TNeedle>
243  {
244 
246  : spawns(),
247  data_lastState(0), // a bit of cheating, but we know that root==0
251  {}
252 
253  void reset()
254  {
255  spawns.clear();
256  data_lastState = 0; // a bit of cheating, but we know that root==0
257  clear(hits_endPositions);
258  data_keywordIndex = 0;
259  data_needleLength = 0;
260  }
261 
262  typedef typename Size<TNeedle>::Type TSize;
263  typedef typename Value<TNeedle>::Type TKeyword;
265  typedef Graph<Automaton<TAlphabet> > TGraph;
266  typedef typename VertexDescriptor<TGraph>::Type TVert;
267  typedef __uint8 KeyWordLengthType;
268  typedef typename std::list<Spawn<TNeedle> > Spawns;
269  typedef typename std::list<Spawn<TNeedle> >::iterator SpawnIt;
270  typedef typename std::list<Spawn<TNeedle> >::const_iterator SpawnCIt;
271 
272  // "working" set; changes with every hit
275  // for hit reporting:
276  String<TSize> hits_endPositions;
279  };
280 
281  template <typename TNeedle>
282  class Pattern<TNeedle, FuzzyAC>
283  {
284  private:
285  Pattern(Pattern const& other);
286  Pattern const& operator=(Pattern const & other);
287  public:
289  : nilVal(getNil<TVert>()),
290  max_ambAA(0),
291  max_mmAA(0)
292  {}
293 
294  //typedef typename Size<TNeedle>::Type TSize; // defaults to uint64, but uint32 is enough...
295  typedef uint32_t TSize; // max. number of peptides allowed (4 billion; checked in init())
296  typedef typename Value<TNeedle>::Type TKeyword;
298  typedef Graph<Automaton<TAlphabet> > TGraph;
299  typedef typename VertexDescriptor<TGraph>::Type TVert;
300  typedef __uint8 KeyWordLengthType;
301 
302  // constant after C'tor;
303  const TVert nilVal;
304 
305  // "constant" data, after construction of trie
306  Holder<TNeedle> data_host;
308  String<String<TSize> > data_map_outputNodes;
309  String<KeyWordLengthType> data_node_depth;
312 
313 #ifndef NDEBUG
314  String<TVert> parentMap;
315 #endif
316 
317 
324  void init(TNeedle const& ndl, KeyWordLengthType max_AAA = 3, KeyWordLengthType max_mm = 0)
325  {
326  SEQAN_CHECKPOINT
327  max_ambAA = max_AAA;
328  max_mmAA = max_mm;
329  if (std::numeric_limits<TSize>::max() < length(ndl)) // check 4 billion peptide limit; use length(ndl) directly, since TSize might be too small
330  {
331  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input contains more than 2^32 peptides. Cannot create trie.").c_str(), OpenMS::String(length(ndl)));
332  }
333  TSize ln = (TSize)length(ndl);
334  for (TSize i = 0; i < ln; ++i)
335  {
336  if (length(ndl[i]) > std::numeric_limits<KeyWordLengthType>::max())
337  {
338  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input peptide to FuzzyAC must NOT be longer than 255 chars!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
339  }
340  TSize lni = (TSize)length(ndl[i]);
341  for (TSize j = 0; j < lni; ++j)
342  {
343  if (isAmbiguous(ndl[i][j])) // this check is important -- find() code below relies on no ambiguous chars being present!
344  {
345  throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__, "Pattern<FuzzyAC>(PeptideSet)", std::string("Input peptide to FuzzyAC must NOT contain ambiguous amino acids (B/J/Z/X)!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
346  }
347  }
348  }
349  setHost(*this, ndl);
350  }
351 
353  {
354  SEQAN_CHECKPOINT
355  }
356  //____________________________________________________________________________
357  };
358 
359 
361  // Host Metafunctions
363 
364  template <typename TNeedle>
365  struct Host< Pattern<TNeedle, FuzzyAC> >
366  {
367  typedef TNeedle Type;
368  };
369 
370  template <typename TNeedle>
371  struct Host< Pattern<TNeedle, FuzzyAC> const>
372  {
373  typedef TNeedle const Type;
374  };
375 
376 
378  // Functions
380 
385  template<typename T>
386  inline void _getSpawnRange(const AAcid c, T& idxFirst, T& idxLast)
387  {
388  // jump table: // start of spawns // end of spawns (including)
389  static const T jump[4][2] = { { ordValue(AAcid('D')), ordValue(AAcid('N')) }, // B = D,N
390  { ordValue(AAcid('I')), ordValue(AAcid('L')) }, // J = I,L
391  { ordValue(AAcid('E')), ordValue(AAcid('Q')) }, // Z = E,Q
392  { 0 , 21 } }; // X = A..V
393  static const T ord_b = ordValue(AAcid('B'));
394 #ifndef NDEBUG
395  assert(ordValue(AAcid('N')) - ordValue(AAcid('D')) == 1); // must be neighbours
396  assert(ordValue(AAcid('Q')) - ordValue(AAcid('E')) == 1); // must be neighbours
397  assert(ordValue(AAcid('V')) == 21); // make sure the table is ordered as we expect
398  // row of jump table:
399  assert(ord_b == 22); // make sure the table is ordered as we expect
400  assert(ordValue(AAcid('J')) == 23); // make sure the table is ordered as we expect
401  assert(ordValue(AAcid('Z')) == 24); // make sure the table is ordered as we expect
402  assert(ordValue(AAcid('X')) == 25); // make sure the table is ordered as we expect
403 #endif
404  idxFirst = jump[ordValue(c) - ord_b][0];
405  idxLast = jump[ordValue(c) - ord_b][1];
406  }
407 
408 
409  template <typename TNeedle>
411  {
412  //OpenMS::StopWatch sw;
413  //sw.start();
414  typedef typename Position<TNeedle>::Type TPosition;
415  typedef typename Value<TNeedle>::Type TKeyword;
416  typedef typename Value<TKeyword>::Type TAlphabet;
417  typedef Graph<Automaton<TAlphabet> > TGraph;
418  typedef typename VertexDescriptor<TGraph>::Type TVert;
419  TVert nilVal = getNil<TVert>();
420 
421  // Create regular trie
422  createTrie(me.data_graph, me.data_map_outputNodes, host(me));
423 
424  // Create parent map
425  String<TVert> parentMap;
426  String<TAlphabet> parentCharMap;
427  resizeVertexMap(me.data_graph, parentMap);
428  resizeVertexMap(me.data_graph, parentCharMap);
429  // init all nodes to Nil
430  for (TPosition i = 0; i < length(parentMap); ++i) {
431  assignProperty(parentMap, i, nilVal);
432  }
433  typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
434  TEdgeIterator itEd(me.data_graph);
435  //int count_prop = 0;
436  for (; !atEnd(itEd); goNext(itEd)) {
437  // property, vertex , value
438  assignProperty(parentMap, targetVertex(itEd), sourceVertex(itEd));
439  assignProperty(parentCharMap, targetVertex(itEd), label(itEd));
440  //++count_prop;
441  }
442  //std::cout << "\nassigned " << count_prop << " props to " << length(parentMap) << " values\n";
443 #ifndef NDEBUG
444  me.parentMap = parentMap;
445 #endif
446 
447  // Build AC
448  TVert root = getRoot(me.data_graph);
449  // properties....
450  String<TVert> data_map_failurelink;
451  resizeVertexMap(me.data_graph, data_map_failurelink);
452  assignProperty(data_map_failurelink, root, nilVal);
453 
454  resizeVertexMap(me.data_graph, me.data_node_depth);
455  assignProperty(me.data_node_depth, root, 0);
456 
457  // Bfs Traversal
458  typedef typename Iterator<TGraph, BfsIterator>::Type TBfsIterator;
459  TBfsIterator it(me.data_graph, root);
460  typedef typename ValueSize<AAcid>::Type TSize;
461  TSize idxAAFirst, idxAALast;
462  _getSpawnRange('X', idxAAFirst, idxAALast);
463  // create nextMove function for root (point to itself)
464  for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
465  {
466  if (getSuccessor(me.data_graph, root, AAcid(idx)) == nilVal) addEdge(me.data_graph, root, root, AAcid(idx));
467  }
468 
469  goNext(it); // skip root
470 
471  // connectivity statistics
472  //std::map<int, int> connectivity;
473 
474  for (; !atEnd(it); goNext(it))
475  {
476  const typename GetValue<TBfsIterator>::Type itval = *it; // dereferencing *it will give an index into the property array!
477 
478  //++connectivity[outDegree(me.data_graph, itval)];
479 
480  // set depth of current node using: depths(parent) + 1
481  TVert parent = getProperty(parentMap, itval);
482  assignProperty(me.data_node_depth, itval, getProperty(me.data_node_depth, parent) + 1);
483 
484 
488  // sigma: edge label
489  TAlphabet sigma = getProperty(parentCharMap, itval);
490  // take suffix link of parent and try to go down with sigma
491  TVert down = getProperty(data_map_failurelink, parent);
492  while ((down != nilVal) &&
493  (getSuccessor(me.data_graph, down, sigma) == nilVal))
494  {
495  down = getProperty(data_map_failurelink, down);
496  }
497  if (down != nilVal)
498  { // we found an edge to follow down
499  assignProperty(data_map_failurelink, itval, getSuccessor(me.data_graph, down, sigma));
500  // output function
501  String<TPosition> endPositions = getProperty(me.data_map_outputNodes, getProperty(data_map_failurelink, itval));
502  if (!empty(endPositions))
503  {
504  // get current end positions (full path) ...
505  String<TPosition> endPositionsCurrent = getProperty(me.data_map_outputNodes, itval);
506  typedef typename Iterator<String<TPosition>, Rooted >::Type TStringIterator;
507  // .. and append all patterns which are a suffix
508  TStringIterator sit = begin(endPositions);
509  for (;!atEnd(sit); goNext(sit))
510  {
511  appendValue(endPositionsCurrent, *sit);
512  }
513  assignProperty(me.data_map_outputNodes, itval, endPositionsCurrent);
514  }
515  }
516  else { // no suffix exists: point suffix link of current node to root
517  assignProperty(data_map_failurelink, itval, root);
518  }
519 
520  // create nextMove function
521  for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
522  {
523  if (getSuccessor(me.data_graph, itval, AAcid(idx)) == nilVal)
524  { // no child:
525  const TVert& target = getSuccessor(me.data_graph, getProperty(data_map_failurelink, itval) , AAcid(idx));
526  addEdge(me.data_graph, itval, target, AAcid(idx));
527  }
528 
529  }
530  }
531 
532  //sw.stop();
533  //std::cout << OpenMS::String(" Trie build time: ") + sw.getClockTime() + " s (wall), " + sw.getCPUTime() + " s (CPU)).\n\n";
534 
535  /*for (std::map<int,int>::const_iterator it = connectivity.begin(); it != connectivity.end(); ++it)
536  {
537  std::cout << " conn: " << it->first << " : " << it->second << "x\n";
538  }*/
539 
540  }
541 
542  template <typename TNeedle, typename TNeedle2>
543  void setHost(Pattern<TNeedle, FuzzyAC> & me, TNeedle2 const & needle)
544  {
545  SEQAN_CHECKPOINT;
546  SEQAN_ASSERT_NOT(empty(needle));
547  setValue(me.data_host, needle);
548  clear(me.data_graph);
549  clear(me.data_map_outputNodes);
550  _createAcTrie(me);
551  }
552  template <typename TNeedle, typename TNeedle2>
553  inline void setHost(Pattern<TNeedle, FuzzyAC> & me, TNeedle2 & needle)
554  {
555  setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
556  }
557  //____________________________________________________________________________
558 
559  //____________________________________________________________________________
560  template <typename TNeedle>
561  inline typename Size<TNeedle>::Type position(const PatternAuxData<TNeedle>& dh)
562  {
563  return dh.data_keywordIndex;
564  }
565 
566  template <typename TFinder, typename TNeedle>
567  inline void _reportHit(TFinder& finder, const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh) {
568  size_t idx_endPosVec = length(dh.hits_endPositions) - 1;
569  dh.data_keywordIndex = dh.hits_endPositions[idx_endPosVec];
570  resize(dh.hits_endPositions, idx_endPosVec); // pop last hit
571  dh.data_needleLength = length(value(host(me), dh.data_keywordIndex)) - 1;
572  finder -= dh.data_needleLength; // position finder at beginning of hit
573  _setFinderLength(finder, dh.data_needleLength + 1);
574  _setFinderEnd(finder, position(finder) + length(finder)); // end of match within haystack
575  return;
576  }
577 
578  inline bool isAmbiguous(AAcid c)
579  { // relies on the fact that ambiguous AA's are occurring in a block
580  static const typename ValueSize<AAcid>::Type vB = ordValue(AAcid('B')); // D,N
581  static const typename ValueSize<AAcid>::Type vX = ordValue(AAcid('X')); // all
582  return (vB <= ordValue(c) && ordValue(c) <= vX);
583  }
584 
585  inline bool isAmbiguous(const AAString& s)
586  {
587  for (AAStringIterator it = begin(s); it != end(s); ++it)
588  {
589  if (isAmbiguous(*it)) return true;
590  }
591  return false;
592  }
593 
594 
595 
596  // called by spawns only
597  // returns false if it reached the 'root', true otherwise
598  template <typename TNeedle>
601  Spawn<TNeedle>& spawn,
602  const AAcid c) // ALWAYS ambiguous!!!!
603  {
604  assert(isAmbiguous(c));
605 
606  DEBUG_ONLY std::cout << "\n\ntrying to spawn from spawn on AAA: " << c << " with path: " << getPath(me, spawn.current_state) << "\n";
607  typedef typename ValueSize<AAcid>::Type TSize;
608  TSize idxFirst, idxLast;
609  _getSpawnRange(c, idxFirst, idxLast);
610  for (TSize idx = idxFirst + 1; idx <= idxLast; ++idx) // first iteration is left for the parent
611  {
612  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
613  //std::cout << "spawn aa is " << AAcid(idx) << "\n";
614  if (_consumeChar(me, dh, spawn2, AAcid(idx)))
615  {
616  // Spawn2 inherits the depths from its parent
617  dh.spawns.push_front(spawn2);
618  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " AA-seen: " << int(spawn2.ambAA_seen) << "\n";
619  }
620  }
621  bool r = _consumeChar(me, dh, spawn, AAcid(idxFirst));
622  if (r)
623  {
624  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn.current_state) << "' created at d: " << int(spawn.max_depth_decrease) << " AA-seen: " << int(spawn.ambAA_seen) << "\n";
625  }
626  return r;
627  }
628 
629 #ifdef NDEBUG
630  template<class TNeedle> inline std::string getPath(const Pattern<TNeedle, FuzzyAC>& /*me*/, typename Pattern<TNeedle, FuzzyAC>::TVert /*current_state*/)
631  {
632  return "";
633  }
634 #else
635  template<class TNeedle> inline std::string getPath(const Pattern<TNeedle, FuzzyAC>& me, typename Pattern<TNeedle, FuzzyAC>::TVert current_state)
637  {
638  if (getRoot(me.data_graph) == current_state) return "";
639 
640  typename Pattern<TNeedle, FuzzyAC>::TVert suffix_node = getProperty(me.parentMap, current_state);
641  typename Iterator<typename Pattern<TNeedle, FuzzyAC>::TGraph const, OutEdgeIterator>::Type it(me.data_graph, suffix_node);
642  while(targetVertex(it) != current_state) ++it;
643  char c = (label(it));
644  return getPath(me, suffix_node) + c;
645  }
646 #endif
647 
648  template<class TNeedle> inline void addHits(const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh, const Spawn<TNeedle>& spawn)
649  {
650  typedef typename Pattern<TNeedle, FuzzyAC>::TSize TSize;
651  const String<TSize>& needle_hits = getProperty(me.data_map_outputNodes, spawn.current_state);
652  //DEBUG_ONLY std::cout << "spawn at path: " << getPath(me, spawn.current_state) << "\n";
653  if (length(needle_hits))
654  {
655  int path_length = getProperty(me.data_node_depth, spawn.current_state); // == length of current path to spawn
656  int unambiguous_suffix_length = path_length - spawn.max_depth_decrease; // == length of suffix peptide which does not contain AAA
657  DEBUG_ONLY std::cout << " spawn adding hits which are at least " << unambiguous_suffix_length << " chars long (thus contain the AAA).\n";
658 
659  // but only report those which contain the AAA
660  for (auto it = begin(needle_hits); it != end(needle_hits); ++it)
661  {
662  int hit_length = (int)length(value(host(me), *it));
663  if (hit_length < unambiguous_suffix_length) break; // assumption: terminalStateMap is sorted by length of hits! ... uiuiui...
664  DEBUG_ONLY std::cout << " add spawn hit: needle #" << *it << " as " << (value(host(me), *it)) << "\n";
665  append(dh.hits_endPositions, *it); // append hits which still contain the AAA
666  }
667  }
668  }
669  template<class TNeedle> inline void addHits(const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh, const typename Pattern<TNeedle, FuzzyAC>::TVert& current_state)
670  {
671  typedef typename Pattern<TNeedle, FuzzyAC>::TSize TSize;
672  //DEBUG_ONLY std::cout << "master at path: " << getPath(me, current_state) << "\n";
673  const String<TSize>& needle_hits = getProperty(me.data_map_outputNodes, current_state);
674  if (length(needle_hits))
675  {
676  DEBUG_ONLY std::cout << "master's new hits: total " << length(needle_hits) << " hits\n";
677  append(dh.hits_endPositions, getProperty(me.data_map_outputNodes, current_state)); // indices into TNeedle!
678  }
679  }
680 
681 
690  template <typename TNeedle>
693  typename Pattern<TNeedle, FuzzyAC>::TVert& current,
694  const AAcid c)
695  {
696  //DEBUG_ONLY std::cout << "consuming real char " << c << " ";
697  current = getSuccessor(me.data_graph, current, c);
698  assert(current != me.nilVal);
699  if (current == getRoot(me.data_graph)) {
700  return false;
701  }
702  addHits(me, dh, current);
703  return true;
704  }
705 
713  template <typename TNeedle>
716  Spawn<TNeedle>& spawn,
717  const AAcid c)
718  {
719  //DEBUG_ONLY std::cout << "consuming real char " << c << " ";
720  typename Pattern<TNeedle, FuzzyAC>::TVert successor = getSuccessor(me.data_graph, spawn.current_state, c);
721  assert(successor != me.nilVal);
722  // check if prefix was lost
723  if (successor == getRoot(me.data_graph)) return false;
724 
725  if (getProperty(me.data_node_depth, spawn.current_state) >= getProperty(me.data_node_depth, successor))
726  { // went at least one level up (and maybe one down again, hence equality)
727  typedef typename Pattern<TNeedle, FuzzyAC>::KeyWordLengthType KeyWordLengthType;
728  const KeyWordLengthType up_count = 1 + getProperty(me.data_node_depth, spawn.current_state) - getProperty(me.data_node_depth, successor);
729  // the +1 is not valid if we end up at root (where the spawn is dead anyway - see above)
730  if (up_count > spawn.max_depth_decrease)
731  {
732  //DEBUG_ONLY std::cout << "spawn died while going up (AAA out of scope)\n";
733  //spawn.current_state = getRoot(me.data_graph); // reset to root -- not required
734  return false; // this spawn just threw away its reason of existence (i.e. the AAA). Die!
735  }
736  spawn.max_depth_decrease -= up_count;
737  }
738  spawn.current_state = successor;
739  addHits(me, dh, spawn); // use spawn version for length checking!
740  return true;
741  }
742 
743  //
744  // This is called by Spawns only!
745  // Returns false if it reached the 'root', true otherwise
746  template <typename TNeedle>
749  Spawn<TNeedle>& spawn,
750  const AAcid c)
751  {
752  typedef typename ValueSize<AAcid>::Type TSize;
753 
754  bool try_ambAA = spawn.ambAA_seen < me.max_ambAA;
755 
756  if (spawn.mismatches_seen < me.max_mmAA)
757  { // try all AA's
758  TSize idx_first(-1), idx_last(-1);
759  _getSpawnRange(AAcid('X'), idx_first, idx_last);
760  // .. except c or its ambiguities
761  TSize idx_AAfirst(ordValue(c)), idx_AAlast(ordValue(c));
762  if (try_ambAA && isAmbiguous(c))
763  { // if ambAA below will cover it, we will NOT use mismatches (this would be a duplicate)
764  _getSpawnRange(c, idx_AAfirst, idx_AAlast);
765  }
766  for (; idx_first <= idx_last; ++idx_first)
767  {
768  if (idx_first == idx_AAfirst)
769  { // ignore this range
770  idx_first = idx_AAlast;
771  continue;
772  }
773  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
774  //std::cout << "spawn mm is " << AAcid(idx_first) << "\n";
775  if (_consumeChar(me, dh, spawn2, AAcid(idx_first)))
776  { // Spawn2 inherits the depths from its parent
777  ++spawn2.mismatches_seen;
778  dh.spawns.push_front(spawn2);
779  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " MM-seen: " << int(spawn2.mismatches_seen) << "\n";
780  }
781  }
782  }
783  // see if the Spawn can take another ambAA...
784  if (isAmbiguous(c))
785  {
786  if (!try_ambAA) return false; // we are at max and cannot consume more ambAA's; do not even try to create sub-spawns
787  ++spawn.ambAA_seen; // increase ambAA count -- also for sub-Spawns which will follow from here...
788  DEBUG_ONLY std::cout << "\n\ntrying to spawn from spawn on AAA: " << c << " with path: " << getPath(me, spawn.current_state) << "\n";
789  TSize idxFirst, idxLast;
790  _getSpawnRange(c, idxFirst, idxLast);
791  for (; idxFirst < idxLast; ++idxFirst) // last iteration is left for the parent
792  {
793  Spawn<TNeedle> spawn2 = spawn; // a potential spawn
794  //std::cout << "spawn aa is " << AAcid(idx) << "\n";
795  if (_consumeChar(me, dh, spawn2, AAcid(idxFirst)))
796  {
797  // Spawn2 inherits the depths from its parent
798  dh.spawns.push_front(spawn2);
799  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn2.current_state) << "' created at d: " << int(spawn2.max_depth_decrease) << " AA-seen: " << int(spawn2.ambAA_seen) << "\n";
800  }
801  }
802  bool r = _consumeChar(me, dh, spawn, AAcid(idxLast));
803  if (r)
804  {
805  DEBUG_ONLY std::cout << " Spawn from Spawn '" << getPath(me, spawn.current_state) << "' created at d: " << int(spawn.max_depth_decrease) << " AA-seen: " << int(spawn.ambAA_seen) << "\n";
806  }
807  return r;
808  }
809  // UNambiguous
810  return _consumeChar(me, dh, spawn, c);
811  }
812 
813  // This is called by the master thread only!
814  // returns false if it reached the 'root', true otherwise
815  template <typename TNeedle>
818  const AAcid c)
819  {
820  typedef typename ValueSize<AAcid>::Type TSize;
821  typedef typename Pattern<TNeedle, FuzzyAC>::TVert TVert;
822 
823  bool consider_ambAA = me.max_ambAA > 0;
824 
825  if (me.max_mmAA > 0)
826  { // try all AA's
827  TSize idx_first(-1), idx_last(-1);
828  _getSpawnRange(AAcid('X'), idx_first, idx_last);
829  // .. except c or its ambiguities
830  TSize idx_AAfirst(ordValue(c)), idx_AAlast(ordValue(c));
831  if (consider_ambAA && isAmbiguous(c))
832  {
833  _getSpawnRange(c, idx_AAfirst, idx_AAlast);
834  }
835  for (; idx_first <= idx_last; ++idx_first)
836  {
837  if (idx_first == idx_AAfirst)
838  { // ignore this range
839  idx_first = idx_AAlast;
840  continue;
841  }
842  TVert node_spawn = dh.data_lastState; // last state of master
843  if (_consumeChar(me, dh, node_spawn, AAcid(idx_first))) // call this using master's _consumeChar(), since it might pass through root (which is allowed), but should not die.
844  { // spawn from current position; push front to flag as 'processed' for the current input char
845  // depths is 'current_depth - 1' (must be computed here!); mmAA-count: fixed to 1 (since spawned from master)
846  dh.spawns.push_front(Spawn<TNeedle>(node_spawn, getProperty(me.data_node_depth, node_spawn) - 1, 0, 1));
847  DEBUG_ONLY std::cout << " Init Spawn from Master consuming '" << AAcid(idx_first) << "\n";
848  }
849  }
850  }
851  if (isAmbiguous(c))
852  {
853  if (consider_ambAA)
854  {
855  DEBUG_ONLY std::cout << "found AAA: " << c << "\n";
856 
857  TSize idx_first, idx_last;
858  _getSpawnRange(c, idx_first, idx_last);
859  for (; idx_first <= idx_last; ++idx_first)
860  {
861  TVert node_spawn = dh.data_lastState; // last state of master
862  if (_consumeChar(me, dh, node_spawn, AAcid(idx_first))) // call this using master's _consumeChar(), since it might pass through root (which is allowed), but should not die.
863  { // spawn from current position; push front to flag as 'processed' for the current input char
864  // depths is 'current_depth - 1' (must be computed here!); ambAA-count: fixed to 1 (first AAA, since spawned from master)
865  dh.spawns.push_front(Spawn<TNeedle>(node_spawn, getProperty(me.data_node_depth, node_spawn) - 1, 1, 0));
866  DEBUG_ONLY std::cout << " Init Spawn from Master consuming '" << AAcid(idx_first) << "\n";
867  }
868  }
869  }
870  else
871  { // do not spawn anything
872  DEBUG_ONLY std::cout << " --> Main found AAA, but none allowed. Resetting to root. No spawns created.\n";
873  }
874  dh.data_lastState = getRoot(me.data_graph); // reset Master to root
875  return;
876  } // end: ambiguous
877 
878  // 'c' is UN-ambiguous
879  _consumeChar(me, dh, dh.data_lastState, c); // adds hits as well
880  }
881 
882 
883  template <typename TFinder, typename TNeedle>
884  inline bool find(TFinder& finder, const Pattern<TNeedle, FuzzyAC>& me, PatternAuxData<TNeedle>& dh)
885  {
886  if (empty(finder))
887  {
888  _finderSetNonEmpty(finder);
889  }
890  else
891  {
892  finder += dh.data_needleLength; // restore last consumed position in haystack
893  if (!empty(dh.hits_endPositions))
894  { // Process left-over hits
895  _reportHit(finder, me, dh);
896  return true;
897  }
898  // nothing to report at this point
899  ++finder; // advance to next position
900  }
901 
902  while (!atEnd(finder))
903  {
904  const AAcid c = *finder;
905  DEBUG_ONLY std::cout << "\n\n-- consuming " << c << " ---\n";
906  // spawns; do them first, since we might add new (but settled) spawns in main-thread & sub-spawns
907  if (!dh.spawns.empty())
908  {
909  typename PatternAuxData<TNeedle>::SpawnIt it = dh.spawns.begin();
910  //DEBUG_ONLY std::cout << " --> Spawns (" << dh.spawns.size() << " alive):\n";
911  while (it != dh.spawns.end())
912  {
913  if (!_spawnConsumeChar(me, dh, *it, c)) // might create new spawns
914  { // spawn reached root --> kill it
915  it = dh.spawns.erase(it); // remove and advance
916  //DEBUG_ONLY std::cout << " Killed spawn (" << dh.spawns.size() << " alive):\n";
917  }
918  else
919  { // next spawn
920  ++it;
921  }
922  }
923  }
924  // main thread
925  DEBUG_ONLY std::cout << " --> Main; d: " << int(getProperty(me.data_node_depth, dh.data_lastState)) << ")\n";
926  _masterConsumeChar(me, dh, c); // might create new spawns
927  DEBUG_ONLY std::cout << " <-- Main end; d: " << int(getProperty(me.data_node_depth, dh.data_lastState)) << ")\n";
928 
929  // print current states
930  DEBUG_ONLY std::cout << " --> POST: Main state: " << getPath(me, dh.data_lastState) << "\n";
931  for (typename PatternAuxData<TNeedle>::SpawnIt it = dh.spawns.begin(); it != dh.spawns.end(); ++it)
932  {
933  DEBUG_ONLY std::cout << " --> POST: Spawn state: " << getPath(me, it->current_state) << "\n";
934  }
935 
936  if (!empty(dh.hits_endPositions))
937  {
938  _reportHit(finder, me, dh);
939  return true;
940  }
941 
942  ++finder;
943  }
944  return false;
945  }
946 
947 }// namespace SEQAN_NAMESPACE_MAIN
948 
949 namespace OpenMS
950 {
951 
953  // FuzzyAC
955 
971  {
972  public:
973  typedef typename ::seqan::StringSet<::seqan::AAString> PeptideDB;
974  typedef typename ::seqan::Pattern<PeptideDB, ::seqan::FuzzyAC> FuzzyACPattern;
975 
991  static void initPattern(const PeptideDB& pep_db, const int aaa_max, const int mm_max, FuzzyACPattern& pattern)
992  {
993  pattern.init(pep_db, KeyWordLengthType(aaa_max), KeyWordLengthType(mm_max));
994  }
995 
1001  : finder_(),
1002  protein_(),
1003  dh_()
1004  {
1005  }
1013  AhoCorasickAmbiguous(const String& protein_sequence)
1014  : finder_(),
1015  protein_(),
1016  dh_()
1017  {
1018  setProtein(protein_sequence);
1019  }
1020 
1024  void setProtein(const String& protein_sequence)
1025  {
1026  protein_ = protein_sequence.c_str(); // we need an internal copy, since finder_ only keeps a pointer
1027  finder_ = ::seqan::Finder<seqan::AAString>(protein_);
1028  dh_.reset();
1029  }
1030 
1037  bool findNext(const FuzzyACPattern& pattern)
1038  {
1039  return ::seqan::find(finder_, pattern, dh_);
1040  }
1041 
1048  {
1050  }
1051 
1058  {
1059  return (Int)position(finder_);
1060  }
1061 
1062  private:
1063  typedef typename FuzzyACPattern::KeyWordLengthType KeyWordLengthType;
1064 
1065  // member
1066  ::seqan::Finder<seqan::AAString> finder_;
1069  }; // class FuzzyAC
1070 
1071 } // namespace OpenMS
1072 
seqan::AAcid_
Definition: AhoCorasickAmbiguous.h:159
OpenMS::AhoCorasickAmbiguous
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i....
Definition: AhoCorasickAmbiguous.h:970
DEBUG_ONLY
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
seqan::PatternAuxData::spawns
Spawns spawns
spawn instances currently walking the tree
Definition: AhoCorasickAmbiguous.h:273
seqan::CompareTypeImpl< AAcid, uint8_t >::Type
AAcid Type
Definition: AhoCorasickAmbiguous.h:187
seqan::Spawn::max_depth_decrease
KeyWordLengthType max_depth_decrease
maximum loss in depths of traversed nodes (both while reporting hits and changing its own state)
Definition: AhoCorasickAmbiguous.h:225
seqan::_createSecondarySpawns
bool _createSecondarySpawns(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:599
seqan::Host< Pattern< TNeedle, FuzzyAC > const >::Type
const TNeedle Type
Definition: AhoCorasickAmbiguous.h:373
seqan::_getSpawnRange
void _getSpawnRange(const AAcid c, T &idxFirst, T &idxLast)
given an ambAA c, return a range of AA's (including idxLast) which need to be spawned.
Definition: AhoCorasickAmbiguous.h:386
seqan::OMS_TranslateTableCharToAA_
Definition: AhoCorasickAmbiguous.h:95
OpenMS::Size
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::AhoCorasickAmbiguous::findNext
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
seqan::PatternAuxData::reset
void reset()
Definition: AhoCorasickAmbiguous.h:253
seqan::Spawn::current_state
TVert current_state
Definition: AhoCorasickAmbiguous.h:224
seqan::Spawn::mismatches_seen
KeyWordLengthType mismatches_seen
number of mismatches the spawn has seen
Definition: AhoCorasickAmbiguous.h:227
seqan::Pattern< TNeedle, FuzzyAC >::data_host
Holder< TNeedle > data_host
holds needles, i.e. Peptides
Definition: AhoCorasickAmbiguous.h:306
OpenMS::AhoCorasickAmbiguous::AhoCorasickAmbiguous
AhoCorasickAmbiguous(const String &protein_sequence)
Prepare to start searching for hits in a new protein sequence.
Definition: AhoCorasickAmbiguous.h:1013
seqan::AAcid
SimpleType< unsigned char, AAcid_ > AAcid
Definition: AhoCorasickAmbiguous.h:160
Exception.h
OpenMS::AhoCorasickAmbiguous::FuzzyACPattern
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
seqan::ValueSize< AAcid >::Type
uint8_t Type
Definition: AhoCorasickAmbiguous.h:164
OpenMS::AhoCorasickAmbiguous::KeyWordLengthType
FuzzyACPattern::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:1063
OpenMS::AhoCorasickAmbiguous::dh_
::seqan::PatternAuxData< PeptideDB > dh_
auxiliary data to hold a state after searching
Definition: AhoCorasickAmbiguous.h:1068
seqan::Pattern< TNeedle, FuzzyAC >::KeyWordLengthType
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:300
seqan::PatternAuxData::data_lastState
TVert data_lastState
Last state of master instance in the trie.
Definition: AhoCorasickAmbiguous.h:274
seqan::PatternAuxData::TAlphabet
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:264
seqan::Pattern< TNeedle, FuzzyAC >::TAlphabet
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:297
seqan::Pattern< TNeedle, FuzzyAC >::data_graph
TGraph data_graph
regular trie data
Definition: AhoCorasickAmbiguous.h:307
seqan::BitsPerValue< AAcid >::Type
uint8_t Type
Definition: AhoCorasickAmbiguous.h:170
seqan::unknownValueImpl
AAcid unknownValueImpl(AAcid *)
Definition: AhoCorasickAmbiguous.h:174
seqan::Spawn::TKeyword
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:219
seqan::PatternAuxData::TGraph
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:265
OpenMS::AhoCorasickAmbiguous::finder_
::seqan::Finder< seqan::AAString > finder_
locate the next peptide hit in protein
Definition: AhoCorasickAmbiguous.h:1066
seqan::Pattern< TNeedle, FuzzyAC >::parentMap
String< TVert > parentMap
allows to find parent of each node
Definition: AhoCorasickAmbiguous.h:314
seqan::Pattern< TNeedle, FuzzyAC >::TVert
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:299
seqan::Spawn
state of an AC spawn, operating on a trie
Definition: AhoCorasickAmbiguous.h:216
seqan::OMS_TranslateTableByteToAA_::VALUE
static const char VALUE[256]
Definition: AhoCorasickAmbiguous.h:133
seqan::OMS_TranslateTableAAToChar_::VALUE
static const char VALUE[27]
Definition: AhoCorasickAmbiguous.h:59
Value
int
seqan::Pattern< TNeedle, FuzzyAC >::data_map_outputNodes
String< String< TSize > > data_map_outputNodes
regular trie data – plus: this gets augmented with all suffix traversals which are output nodes
Definition: AhoCorasickAmbiguous.h:308
seqan::Pattern< TNeedle, FuzzyAC >::TSize
uint32_t TSize
Definition: AhoCorasickAmbiguous.h:295
seqan::PatternAuxData::SpawnIt
std::list< Spawn< TNeedle > >::iterator SpawnIt
Definition: AhoCorasickAmbiguous.h:269
seqan::Spawn::KeyWordLengthType
Pattern< TNeedle, FuzzyAC >::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:223
SeqanIncludeWrapper.h
Iterator
seqan::Spawn::TSize
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:218
seqan::Pattern< TNeedle, FuzzyAC >
Definition: AhoCorasickAmbiguous.h:282
seqan::Pattern< TNeedle, FuzzyAC >::max_mmAA
KeyWordLengthType max_mmAA
mismatches; default: 0
Definition: AhoCorasickAmbiguous.h:311
seqan::AAString
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
OpenMS::AhoCorasickAmbiguous::protein_
::seqan::AAString protein_
the protein sequence - we need to store it since the finder only keeps a pointer to protein when cons...
Definition: AhoCorasickAmbiguous.h:1067
OpenMS::AhoCorasickAmbiguous::AhoCorasickAmbiguous
AhoCorasickAmbiguous()
Default Ctor; call setProtein() before using findNext().
Definition: AhoCorasickAmbiguous.h:1000
OpenMS::AhoCorasickAmbiguous::PeptideDB
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
seqan::PatternAuxData::TVert
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:266
seqan::Host< Pattern< TNeedle, FuzzyAC > >::Type
TNeedle Type
Definition: AhoCorasickAmbiguous.h:367
seqan::PatternAuxData::TSize
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:262
seqan::Spawn::Spawn
Spawn()
seqan::Pattern< TNeedle, FuzzyAC >::TKeyword
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:296
seqan::Pattern< TNeedle, FuzzyAC >::~Pattern
~Pattern()
Definition: AhoCorasickAmbiguous.h:352
seqan::addHits
void addHits(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const Spawn< TNeedle > &spawn)
Definition: AhoCorasickAmbiguous.h:648
seqan::Spawn::ambAA_seen
KeyWordLengthType ambAA_seen
number of ambAA's which the spawn has seen
Definition: AhoCorasickAmbiguous.h:226
seqan::PatternAuxData::data_needleLength
TSize data_needleLength
Last length of needle to reposition finder.
Definition: AhoCorasickAmbiguous.h:278
seqan::assign
void assign(char &c_target, AAcid const &source)
Definition: AhoCorasickAmbiguous.h:179
seqan::OMS_TranslateTableAAToChar_
Definition: AhoCorasickAmbiguous.h:57
seqan::Spawn::TGraph
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:221
seqan::PatternAuxData::Spawns
std::list< Spawn< TNeedle > > Spawns
Definition: AhoCorasickAmbiguous.h:268
seqan::isAmbiguous
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
seqan::setHost
void setHost(Pattern< TNeedle, FuzzyAC > &me, TNeedle2 const &needle)
Definition: AhoCorasickAmbiguous.h:543
seqan::getPath
std::string getPath(const Pattern< TNeedle, FuzzyAC > &me, typename Pattern< TNeedle, FuzzyAC >::TVert current_state)
for debug only
Definition: AhoCorasickAmbiguous.h:636
seqan::FuzzyAC
Tag< FuzzyAC_ > FuzzyAC
Definition: AhoCorasickAmbiguous.h:211
OpenMS::AhoCorasickAmbiguous::setProtein
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
seqan::OMS_TranslateTableCharToAA_::VALUE
static const char VALUE[256]
Definition: AhoCorasickAmbiguous.h:97
seqan::position
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
seqan::_masterConsumeChar
void _masterConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const AAcid c)
Definition: AhoCorasickAmbiguous.h:816
seqan::Pattern< TNeedle, FuzzyAC >::max_ambAA
KeyWordLengthType max_ambAA
ambiguity; default: 3
Definition: AhoCorasickAmbiguous.h:310
seqan::find
bool find(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:884
seqan::PatternAuxData::KeyWordLengthType
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:267
seqan::PatternAuxData
Definition: AhoCorasickAmbiguous.h:242
OpenMS::Constants::c
const double c
seqan::PatternAuxData::SpawnCIt
std::list< Spawn< TNeedle > >::const_iterator SpawnCIt
Definition: AhoCorasickAmbiguous.h:270
OpenMS::String
A more convenient string class.
Definition: String.h:58
seqan::Spawn::Spawn
Spawn(TVert init_state, KeyWordLengthType current_depth, KeyWordLengthType aaa_seen, KeyWordLengthType mm_seen)
Definition: AhoCorasickAmbiguous.h:229
seqan::_spawnConsumeChar
bool _spawnConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:747
OpenMS::AhoCorasickAmbiguous::getHitDBIndex
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
seqan
Definition: AhoCorasickAmbiguous.h:51
seqan::Pattern< TNeedle, FuzzyAC >::data_node_depth
String< KeyWordLengthType > data_node_depth
depths of each graph node
Definition: AhoCorasickAmbiguous.h:309
seqan::Pattern< TNeedle, FuzzyAC >::init
void init(TNeedle const &ndl, KeyWordLengthType max_AAA=3, KeyWordLengthType max_mm=0)
Pattern Ctor with vector of needles, i.e. keywords/peptides.
Definition: AhoCorasickAmbiguous.h:324
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::Exception::InvalidValue
Invalid value exception.
Definition: Exception.h:335
OpenMS::StringConversions::append
void append(const T &i, String &target)
Definition: StringUtils.h:77
seqan::Spawn::TVert
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:222
Macros.h
OpenMS::AhoCorasickAmbiguous::initPattern
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
seqan::Pattern< TNeedle, FuzzyAC >::nilVal
const TVert nilVal
NULL pointer for trie; e.g. returned when no successor is found.
Definition: AhoCorasickAmbiguous.h:303
seqan::Spawn::TAlphabet
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:220
OpenMS::AhoCorasickAmbiguous::getHitProteinPosition
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
seqan::PatternAuxData::data_keywordIndex
TSize data_keywordIndex
Current keyword that produced a hit.
Definition: AhoCorasickAmbiguous.h:277
String.h
seqan::Spawn::operator=
Spawn & operator=(const Spawn &)
seqan::PatternAuxData::PatternAuxData
PatternAuxData()
Definition: AhoCorasickAmbiguous.h:245
seqan::_reportHit
void _reportHit(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:567
seqan::_consumeChar
bool _consumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, typename Pattern< TNeedle, FuzzyAC >::TVert &current, const AAcid c)
Universal UN-ambiguous char consumer. Works for Master and Spawns.
Definition: AhoCorasickAmbiguous.h:691
seqan::OMS_TranslateTableByteToAA_
Definition: AhoCorasickAmbiguous.h:131
seqan::Pattern< TNeedle, FuzzyAC >::TGraph
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:298
seqan::Pattern< TNeedle, FuzzyAC >::Pattern
Pattern()
Definition: AhoCorasickAmbiguous.h:288
seqan::PatternAuxData::TKeyword
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:263
seqan::PatternAuxData::hits_endPositions
String< TSize > hits_endPositions
All remaining keyword indices.
Definition: AhoCorasickAmbiguous.h:276
seqan::AAStringIterator
Iterator< const AAString, Rooted >::Type AAStringIterator
Definition: AhoCorasickAmbiguous.h:207
seqan::_createAcTrie
void _createAcTrie(Pattern< TNeedle, FuzzyAC > &me)
Definition: AhoCorasickAmbiguous.h:410