44 #define DEBUG_ONLY if (false) 46 #define DEBUG_ONLY if (false) 56 template <
typename T =
void>
94 template <
typename T =
void>
100 template <
typename T>
103 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
104 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
105 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 25, 25, 25, 25, 25,
106 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
109 25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13,
112 14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25, 25,
115 25, 0, 22, 2, 3, 15, 5, 6, 7, 8, 23, 10, 9, 12, 4, 13,
118 14, 16, 17, 18, 19, 20, 21, 11, 25, 1, 24, 25, 25, 25, 25,
120 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
121 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
122 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
123 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
124 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
125 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
126 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
127 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
130 template <
typename T =
void>
136 template <
typename T>
139 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
140 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 25, 25, 25, 25, 25,
141 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
142 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
143 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
144 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
145 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
146 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
147 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
148 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
149 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
150 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
151 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
152 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
153 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
154 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25
160 typedef SimpleType<unsigned char, AAcid_>
AAcid;
168 template <>
struct BitsPerValue<
AAcid>
185 struct CompareTypeImpl<
AAcid, uint8_t>
215 template <
typename TNeedle>
218 typedef typename Size<TNeedle>::Type
TSize;
221 typedef Graph<Automaton<TAlphabet> >
TGraph;
222 typedef typename VertexDescriptor<TGraph>::Type
TVert;
241 template <
typename TNeedle>
262 typedef typename Size<TNeedle>::Type
TSize;
265 typedef Graph<Automaton<TAlphabet> >
TGraph;
266 typedef typename VertexDescriptor<TGraph>::Type
TVert;
268 typedef typename std::list<Spawn<TNeedle> >
Spawns;
269 typedef typename std::list<Spawn<TNeedle> >::iterator
SpawnIt;
270 typedef typename std::list<Spawn<TNeedle> >::const_iterator
SpawnCIt;
281 template <
typename TNeedle>
285 Pattern(Pattern
const& other);
286 Pattern
const& operator=(Pattern
const & other);
289 : nilVal(getNil<
TVert>()),
298 typedef Graph<Automaton<TAlphabet> >
TGraph;
299 typedef typename VertexDescriptor<TGraph>::Type
TVert;
329 if (std::numeric_limits<TSize>::max() < length(ndl))
334 for (
TSize i = 0; i < ln; ++i)
336 if (length(ndl[i]) > std::numeric_limits<KeyWordLengthType>::max())
338 throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__,
"Pattern<FuzzyAC>(PeptideSet)", std::string(
"Input peptide to FuzzyAC must NOT be longer than 255 chars!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
341 for (
TSize j = 0; j < lni; ++j)
345 throw OpenMS::Exception::InvalidValue(__FILE__, __LINE__,
"Pattern<FuzzyAC>(PeptideSet)", std::string(
"Input peptide to FuzzyAC must NOT contain ambiguous amino acids (B/J/Z/X)!").c_str(), std::string(begin(ndl[i]), end(ndl[i])));
364 template <
typename TNeedle>
370 template <
typename TNeedle>
389 static const T jump[4][2] = { { ordValue(
AAcid(
'D')), ordValue(
AAcid(
'N')) },
390 { ordValue(
AAcid(
'I')), ordValue(
AAcid(
'L')) },
391 { ordValue(
AAcid(
'E')), ordValue(
AAcid(
'Q')) },
393 static const T ord_b = ordValue(
AAcid(
'B'));
395 assert(ordValue(
AAcid(
'N')) - ordValue(
AAcid(
'D')) == 1);
396 assert(ordValue(
AAcid(
'Q')) - ordValue(
AAcid(
'E')) == 1);
397 assert(ordValue(
AAcid(
'V')) == 21);
400 assert(ordValue(
AAcid(
'J')) == 23);
401 assert(ordValue(
AAcid(
'Z')) == 24);
402 assert(ordValue(
AAcid(
'X')) == 25);
404 idxFirst = jump[ordValue(
c) - ord_b][0];
405 idxLast = jump[ordValue(
c) - ord_b][1];
409 template <
typename TNeedle>
414 typedef typename Position<TNeedle>::Type TPosition;
417 typedef Graph<Automaton<TAlphabet> > TGraph;
418 typedef typename VertexDescriptor<TGraph>::Type TVert;
419 TVert nilVal = getNil<TVert>();
425 String<TVert> parentMap;
426 String<TAlphabet> parentCharMap;
428 resizeVertexMap(me.
data_graph, parentCharMap);
430 for (TPosition i = 0; i < length(parentMap); ++i) {
431 assignProperty(parentMap, i, nilVal);
433 typedef typename Iterator<TGraph, EdgeIterator>::Type TEdgeIterator;
436 for (; !atEnd(itEd); goNext(itEd)) {
438 assignProperty(parentMap, targetVertex(itEd), sourceVertex(itEd));
439 assignProperty(parentCharMap, targetVertex(itEd), label(itEd));
450 String<TVert> data_map_failurelink;
451 resizeVertexMap(me.
data_graph, data_map_failurelink);
452 assignProperty(data_map_failurelink, root, nilVal);
458 typedef typename Iterator<TGraph, BfsIterator>::Type TBfsIterator;
461 TSize idxAAFirst, idxAALast;
464 for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
474 for (; !atEnd(it); goNext(it))
476 const typename GetValue<TBfsIterator>::Type itval = *it;
481 TVert parent = getProperty(parentMap, itval);
489 TAlphabet sigma = getProperty(parentCharMap, itval);
491 TVert down = getProperty(data_map_failurelink, parent);
492 while ((down != nilVal) &&
493 (getSuccessor(me.
data_graph, down, sigma) == nilVal))
495 down = getProperty(data_map_failurelink, down);
499 assignProperty(data_map_failurelink, itval, getSuccessor(me.
data_graph, down, sigma));
501 String<TPosition> endPositions = getProperty(me.
data_map_outputNodes, getProperty(data_map_failurelink, itval));
502 if (!empty(endPositions))
506 typedef typename Iterator<String<TPosition>, Rooted >::Type TStringIterator;
508 TStringIterator sit = begin(endPositions);
509 for (;!atEnd(sit); goNext(sit))
511 appendValue(endPositionsCurrent, *sit);
517 assignProperty(data_map_failurelink, itval, root);
521 for (TSize idx = idxAAFirst; idx <= idxAALast; ++idx)
525 const TVert& target = getSuccessor(me.
data_graph, getProperty(data_map_failurelink, itval) ,
AAcid(idx));
542 template <
typename TNeedle,
typename TNeedle2>
546 SEQAN_ASSERT_NOT(empty(needle));
552 template <
typename TNeedle,
typename TNeedle2>
555 setHost(me, reinterpret_cast<TNeedle2 const &>(needle));
560 template <
typename TNeedle>
566 template <
typename TFinder,
typename TNeedle>
574 _setFinderEnd(finder,
position(finder) + length(finder));
582 return (vB <= ordValue(
c) && ordValue(
c) <= vX);
598 template <
typename TNeedle>
608 TSize idxFirst, idxLast;
610 for (TSize idx = idxFirst + 1; idx <= idxLast; ++idx)
617 dh.
spawns.push_front(spawn2);
638 if (getRoot(me.data_graph) == current_state)
return "";
641 typename Iterator<typename Pattern<TNeedle, FuzzyAC>::TGraph
const, OutEdgeIterator>::Type it(me.data_graph, suffix_node);
642 while(targetVertex(it) != current_state) ++it;
643 char c = (label(it));
644 return getPath(me, suffix_node) +
c;
653 if (length(needle_hits))
657 DEBUG_ONLY std::cout <<
" spawn adding hits which are at least " << unambiguous_suffix_length <<
" chars long (thus contain the AAA).\n";
660 for (
auto it = begin(needle_hits); it != end(needle_hits); ++it)
662 int hit_length = (int)length(value(host(me), *it));
663 if (hit_length < unambiguous_suffix_length)
break;
664 DEBUG_ONLY std::cout <<
" add spawn hit: needle #" << *it <<
" as " << (value(host(me), *it)) <<
"\n";
674 if (length(needle_hits))
676 DEBUG_ONLY std::cout <<
"master's new hits: total " << length(needle_hits) <<
" hits\n";
690 template <
typename TNeedle>
698 assert(current != me.
nilVal);
713 template <
typename TNeedle>
721 assert(successor != me.
nilVal);
723 if (successor == getRoot(me.
data_graph))
return false;
746 template <
typename TNeedle>
758 TSize idx_first(-1), idx_last(-1);
761 TSize idx_AAfirst(ordValue(
c)), idx_AAlast(ordValue(
c));
766 for (; idx_first <= idx_last; ++idx_first)
768 if (idx_first == idx_AAfirst)
770 idx_first = idx_AAlast;
778 dh.
spawns.push_front(spawn2);
786 if (!try_ambAA)
return false;
789 TSize idxFirst, idxLast;
791 for (; idxFirst < idxLast; ++idxFirst)
798 dh.
spawns.push_front(spawn2);
815 template <
typename TNeedle>
827 TSize idx_first(-1), idx_last(-1);
830 TSize idx_AAfirst(ordValue(
c)), idx_AAlast(ordValue(
c));
835 for (; idx_first <= idx_last; ++idx_first)
837 if (idx_first == idx_AAfirst)
839 idx_first = idx_AAlast;
847 DEBUG_ONLY std::cout <<
" Init Spawn from Master consuming '" <<
AAcid(idx_first) <<
"\n";
855 DEBUG_ONLY std::cout <<
"found AAA: " <<
c <<
"\n";
857 TSize idx_first, idx_last;
859 for (; idx_first <= idx_last; ++idx_first)
866 DEBUG_ONLY std::cout <<
" Init Spawn from Master consuming '" <<
AAcid(idx_first) <<
"\n";
872 DEBUG_ONLY std::cout <<
" --> Main found AAA, but none allowed. Resetting to root. No spawns created.\n";
883 template <
typename TFinder,
typename TNeedle>
888 _finderSetNonEmpty(finder);
902 while (!atEnd(finder))
905 DEBUG_ONLY std::cout <<
"\n\n-- consuming " <<
c <<
" ---\n";
911 while (it != dh.
spawns.end())
933 DEBUG_ONLY std::cout <<
" --> POST: Spawn state: " <<
getPath(me, it->current_state) <<
"\n";
973 typedef typename ::seqan::StringSet<::seqan::AAString>
PeptideDB;
1026 protein_ = protein_sequence.c_str();
Spawn(TVert init_state, KeyWordLengthType current_depth, KeyWordLengthType aaa_seen, KeyWordLengthType mm_seen)
Definition: AhoCorasickAmbiguous.h:229
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
AAcid unknownValueImpl(AAcid *)
Definition: AhoCorasickAmbiguous.h:174
bool _consumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, typename Pattern< TNeedle, FuzzyAC >::TVert ¤t, const AAcid c)
Universal UN-ambiguous char consumer. Works for Master and Spawns.
Definition: AhoCorasickAmbiguous.h:691
uint32_t TSize
Definition: AhoCorasickAmbiguous.h:295
TSize data_needleLength
Definition: AhoCorasickAmbiguous.h:278
SimpleType< unsigned char, AAcid_ > AAcid
Definition: AhoCorasickAmbiguous.h:160
uint8_t Type
Definition: AhoCorasickAmbiguous.h:164
Definition: AhoCorasickAmbiguous.h:51
KeyWordLengthType max_mmAA
Definition: AhoCorasickAmbiguous.h:311
String< TVert > parentMap
allows to find parent of each node
Definition: AhoCorasickAmbiguous.h:314
::seqan::PatternAuxData< PeptideDB > dh_
auxiliary data to hold a state after searching
Definition: AhoCorasickAmbiguous.h:1068
AAcid Type
Definition: AhoCorasickAmbiguous.h:187
String< TSize > hits_endPositions
Definition: AhoCorasickAmbiguous.h:276
Spawn & operator=(const Spawn &)
state of an AC spawn, operating on a trie
Definition: AhoCorasickAmbiguous.h:216
std::list< Spawn< TNeedle > > Spawns
Definition: AhoCorasickAmbiguous.h:268
TVert current_state
Definition: AhoCorasickAmbiguous.h:224
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:222
::seqan::Finder< seqan::AAString > finder_
locate the next peptide hit in protein
Definition: AhoCorasickAmbiguous.h:1066
const TVert nilVal
Definition: AhoCorasickAmbiguous.h:303
Iterator< const AAString, Rooted >::Type AAStringIterator
Definition: AhoCorasickAmbiguous.h:207
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:298
Definition: AhoCorasickAmbiguous.h:57
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:263
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i...
Definition: AhoCorasickAmbiguous.h:970
Pattern()
Definition: AhoCorasickAmbiguous.h:288
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:297
TVert data_lastState
Definition: AhoCorasickAmbiguous.h:274
void init(TNeedle const &ndl, KeyWordLengthType max_AAA=3, KeyWordLengthType max_mm=0)
Pattern Ctor with vector of needles, i.e. keywords/peptides.
Definition: AhoCorasickAmbiguous.h:324
A more convenient string class.
Definition: String.h:58
FuzzyACPattern::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:1063
Definition: AhoCorasickAmbiguous.h:131
TGraph data_graph
Definition: AhoCorasickAmbiguous.h:307
TNeedle const Type
Definition: AhoCorasickAmbiguous.h:373
void setHost(Pattern< TNeedle, FuzzyAC > &me, TNeedle2 const &needle)
Definition: AhoCorasickAmbiguous.h:543
std::string getPath(const Pattern< TNeedle, FuzzyAC > &me, typename Pattern< TNeedle, FuzzyAC >::TVert current_state)
for debug only
Definition: AhoCorasickAmbiguous.h:636
uint8_t Type
Definition: AhoCorasickAmbiguous.h:170
void _getSpawnRange(const AAcid c, T &idxFirst, T &idxLast)
given an ambAA c, return a range of AA's (including idxLast) which need to be spawned.
Definition: AhoCorasickAmbiguous.h:386
static char const VALUE[256]
Definition: AhoCorasickAmbiguous.h:97
Definition: AhoCorasickAmbiguous.h:282
KeyWordLengthType max_depth_decrease
Definition: AhoCorasickAmbiguous.h:225
Holder< TNeedle > data_host
Definition: AhoCorasickAmbiguous.h:306
KeyWordLengthType ambAA_seen
Definition: AhoCorasickAmbiguous.h:226
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:299
void addHits(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const Spawn< TNeedle > &spawn)
Definition: AhoCorasickAmbiguous.h:648
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:218
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
Definition: AhoCorasickAmbiguous.h:95
String< KeyWordLengthType > data_node_depth
Definition: AhoCorasickAmbiguous.h:309
TSize data_keywordIndex
Definition: AhoCorasickAmbiguous.h:277
String< String< TSize > > data_map_outputNodes
Definition: AhoCorasickAmbiguous.h:308
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:267
static char const VALUE[27]
Definition: AhoCorasickAmbiguous.h:59
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
TNeedle Type
Definition: AhoCorasickAmbiguous.h:367
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:220
Definition: AhoCorasickAmbiguous.h:242
Size< TNeedle >::Type TSize
Definition: AhoCorasickAmbiguous.h:262
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:265
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:219
Spawns spawns
Definition: AhoCorasickAmbiguous.h:273
Tag< FuzzyAC_ > FuzzyAC
Definition: AhoCorasickAmbiguous.h:211
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
KeyWordLengthType max_ambAA
Definition: AhoCorasickAmbiguous.h:310
void assign(char &c_target, AAcid const &source)
Definition: AhoCorasickAmbiguous.h:179
Definition: AhoCorasickAmbiguous.h:159
std::list< Spawn< TNeedle > >::iterator SpawnIt
Definition: AhoCorasickAmbiguous.h:269
int Int
Signed integer type.
Definition: Types.h:102
void _createAcTrie(Pattern< TNeedle, FuzzyAC > &me)
Definition: AhoCorasickAmbiguous.h:410
void reset()
Definition: AhoCorasickAmbiguous.h:253
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
static char const VALUE[256]
Definition: AhoCorasickAmbiguous.h:133
AhoCorasickAmbiguous(const String &protein_sequence)
Prepare to start searching for hits in a new protein sequence.
Definition: AhoCorasickAmbiguous.h:1013
~Pattern()
Definition: AhoCorasickAmbiguous.h:352
Pattern< TNeedle, FuzzyAC >::KeyWordLengthType KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:223
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
VertexDescriptor< TGraph >::Type TVert
Definition: AhoCorasickAmbiguous.h:266
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
void _reportHit(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:567
bool find(TFinder &finder, const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:884
KeyWordLengthType mismatches_seen
Definition: AhoCorasickAmbiguous.h:227
PatternAuxData()
Definition: AhoCorasickAmbiguous.h:245
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
__uint8 KeyWordLengthType
Definition: AhoCorasickAmbiguous.h:300
Value< TNeedle >::Type TKeyword
Definition: AhoCorasickAmbiguous.h:296
void _masterConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, const AAcid c)
Definition: AhoCorasickAmbiguous.h:816
::seqan::AAString protein_
the protein sequence - we need to store it since the finder only keeps a pointer to protein when cons...
Definition: AhoCorasickAmbiguous.h:1067
bool _spawnConsumeChar(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:747
bool _createSecondarySpawns(const Pattern< TNeedle, FuzzyAC > &me, PatternAuxData< TNeedle > &dh, Spawn< TNeedle > &spawn, const AAcid c)
Definition: AhoCorasickAmbiguous.h:599
Invalid value exception.
Definition: Exception.h:335
AhoCorasickAmbiguous()
Default Ctor; call setProtein() before using findNext().
Definition: AhoCorasickAmbiguous.h:1000
Value< TKeyword >::Type TAlphabet
Definition: AhoCorasickAmbiguous.h:264
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
std::list< Spawn< TNeedle > >::const_iterator SpawnCIt
Definition: AhoCorasickAmbiguous.h:270
Graph< Automaton< TAlphabet > > TGraph
Definition: AhoCorasickAmbiguous.h:221