148 inline ExitCodes run(std::vector<FASTAFile::FASTAEntry>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
151 return run<TFI_Vector>(protein_container, prot_ids, pep_ids);
193 if (decoy_string_.empty())
195 bool is_decoy_string_auto_successful = findDecoyString_(proteins);
197 if (!is_decoy_string_auto_successful && contains_decoys_)
199 return DECOYSTRING_EMPTY;
201 else if (!is_decoy_string_auto_successful && !contains_decoys_)
203 LOG_WARN <<
"Unable to determine decoy string automatically, not enough decoys were detected! Using default " << (prefix_ ?
"prefix" :
"suffix") <<
" decoy string '" << decoy_string_ <<
"\n" 204 <<
"If you think that this is false, please provide a decoy_string and its position manually!" << std::endl;
209 LOG_INFO <<
"Using " << (prefix_ ?
"prefix" :
"suffix") <<
" decoy string '" << decoy_string_ <<
"'" << std::endl;
222 bool xtandem_fix_parameters =
true, msgfplus_fix_parameters =
true;
228 xtandem_fix_parameters =
false;
232 if (enzyme.
getEnzymeName() ==
"Trypsin/P") { msgfplus_fix_parameters =
false; }
235 for (
const auto& prot_id : prot_ids)
237 if (!msgfplus_fix_parameters && !xtandem_fix_parameters) {
break; }
238 String se = prot_id.getSearchEngine();
240 if (search_engine !=
"XTANDEM") { xtandem_fix_parameters =
false; }
241 if (search_engine !=
"MSGFPLUS" ||
"MS-GF+") { msgfplus_fix_parameters =
false; }
245 if (msgfplus_fix_parameters && enzyme.
getEnzymeName() ==
"Trypsin")
247 LOG_WARN <<
"MSGFPlus detected but enzyme cutting rules were set to Trypsin. Correcting to Trypsin/P to copy with special cutting rule in MSGFPlus." << std::endl;
255 const size_t PROTEIN_CACHE_SIZE = 4e5;
257 this->startProgress(0, 1,
"Load first chunk");
258 proteins.cacheChunk(PROTEIN_CACHE_SIZE);
261 if (proteins.empty())
263 LOG_ERROR <<
"Error: An empty database was provided. Mapping makes no sense. Aborting..." << std::endl;
264 return DATABASE_EMPTY;
269 LOG_WARN <<
"Warning: An empty set of peptide identifications was provided. Output will be empty as well." << std::endl;
270 if (!keep_unreferenced_proteins_)
273 for (std::vector<ProteinIdentification>::iterator it = prot_ids.begin();
274 it != prot_ids.end(); ++it)
276 it->getHits().clear();
279 return PEPTIDE_IDS_EMPTY;
284 std::vector<bool> protein_is_decoy;
285 std::vector<std::string> protein_accessions;
287 bool invalid_protein_sequence =
false;
294 bool has_illegal_AAs(
false);
296 for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
299 const std::vector<PeptideHit>& hits = it1->getHits();
300 for (std::vector<PeptideHit>::const_iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
306 String seq = it2->getSequence().toUnmodifiedString().
remove(
'*');
309 LOG_ERROR <<
"Peptide sequence '" << it2->getSequence() <<
"' contains one or more ambiguous amino acids (B|J|Z|X).\n";
310 has_illegal_AAs =
true;
316 appendValue(pep_DB, seq.c_str());
321 LOG_ERROR <<
"One or more peptides contained illegal amino acids. This is not allowed!" 322 <<
"\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
325 LOG_INFO <<
"Mapping " << length(pep_DB) <<
" peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ?
"? (unknown number of)" :
String(proteins.size())) <<
" proteins." << std::endl;
327 if (length(pep_DB) == 0)
329 LOG_WARN <<
"Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
330 return PEPTIDE_IDS_EMPTY;
336 LOG_INFO <<
"Searching with up to " << aaa_max_ <<
" ambiguous amino acid(s) and " << mm_max_ <<
" mismatch(es)!" << std::endl;
347 uint16_t count_j_proteins(0);
348 bool has_active_data =
true;
349 const std::string jumpX(aaa_max_ + mm_max_ + 1,
'X');
351 this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(),
"Aho-Corasick");
352 std::atomic<int> progress_prots(0);
364 #pragma omp barrier // all threads need to be here, since we are about to swap protein data 367 DEBUG_ONLY std::cerr <<
" activating cache ...\n";
368 has_active_data = proteins.activateCache();
369 protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
372 if (!has_active_data)
break;
377 DEBUG_ONLY std::cerr <<
"Filling Protein Cache ...";
378 proteins.cacheChunk(PROTEIN_CACHE_SIZE);
379 protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
382 const String& seq = proteins.chunkAt(i).identifier;
383 protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.
hasPrefix(decoy_string_) : seq.
hasSuffix(decoy_string_));
387 DEBUG_ONLY std::cerr <<
" starting for loop \n";
389 #pragma omp for schedule(dynamic, 100) nowait 393 if (omp_get_thread_num() == 0)
395 this->setProgress(progress_prots);
398 prot = proteins.chunkAt(i).sequence;
402 if (prot.
has(
'[') || prot.
has(
'('))
404 invalid_protein_sequence =
true;
423 Size prot_idx = i + proteins.getChunkOffset();
432 size_t offset = -1, start = 0;
433 while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
436 addHits_(fuzzyAC, pattern, pep_DB, prot.
substr(start, offset + jumpX.size() - start), prot, prot_idx, (
int)start, func_threads);
438 while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] ==
'X') ++offset;
443 if (start < prot.size())
445 addHits_(fuzzyAC, pattern, pep_DB, prot.
substr(start), prot, prot_idx, (int)start, func_threads);
450 addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
455 protein_accessions[prot_idx] = proteins.chunkAt(i).identifier;
456 acc_to_prot_thread[protein_accessions[prot_idx]] = prot_idx;
463 #pragma omp critical(PeptideIndexer_joinAC) 468 func.
merge(func_threads);
470 acc_to_prot.insert(acc_to_prot_thread.begin(), acc_to_prot_thread.end());
471 acc_to_prot_thread.clear();
477 std::cout <<
"Merge took: " << s.
toString() <<
"\n";
479 std::cout << mu.
delta(
"Aho-Corasick") <<
"\n\n";
485 <<
" ... rejected by enzyme filter: " << func.
filter_rejected << std::endl;
487 if (count_j_proteins)
489 LOG_WARN <<
"PeptideIndexer found " << count_j_proteins <<
" protein sequences in your database containing the amino acid 'J'." 490 <<
"To match 'J' in a protein, an ambiguous amino acid placeholder for I/L will be used.\n" 491 <<
"This costs runtime and eats into the 'aaa_max' limit, leaving less opportunity for B/Z/X matches.\n" 492 <<
"If you want 'J' to be treated as unambiguous, enable '-IL_equivalent'!" << std::endl;
502 for (
Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
504 runid_to_runidx[prot_ids[run_idx].getIdentifier()] = run_idx;
508 Size stats_matched_unique(0);
509 Size stats_matched_multi(0);
510 Size stats_unmatched(0);
511 Size stats_count_m_t(0);
512 Size stats_count_m_d(0);
513 Size stats_count_m_td(0);
518 for (std::vector<PeptideIdentification>::iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
521 Size run_idx = runid_to_runidx[it1->getIdentifier()];
523 std::vector<PeptideHit>& hits = it1->getHits();
525 for (std::vector<PeptideHit>::iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
528 it2->setPeptideEvidences(std::vector<PeptideEvidence>());
533 bool matches_target(
false);
534 bool matches_decoy(
false);
536 std::set<Size> prot_indices;
538 for (std::set<PeptideProteinMatchInformation>::const_iterator it_i = func.
pep_to_prot[pep_idx].begin();
541 prot_indices.insert(it_i->protein_index);
542 const String& accession = protein_accessions[it_i->protein_index];
543 PeptideEvidence pe(accession, it_i->position, it_i->position + (
int)it2->getSequence().size() - 1, it_i->AABefore, it_i->AAAfter);
544 it2->addPeptideEvidence(pe);
546 runidx_to_protidx[run_idx].insert(it_i->protein_index);
548 if (protein_is_decoy[it_i->protein_index])
550 matches_decoy =
true;
554 matches_target =
true;
558 if (matches_decoy && matches_target)
560 it2->setMetaValue(
"target_decoy",
"target+decoy");
563 else if (matches_target)
565 it2->setMetaValue(
"target_decoy",
"target");
568 else if (matches_decoy)
570 it2->setMetaValue(
"target_decoy",
"decoy");
575 if (prot_indices.size() == 1)
577 it2->setMetaValue(
"protein_references",
"unique");
578 ++stats_matched_unique;
580 else if (prot_indices.size() > 1)
582 it2->setMetaValue(
"protein_references",
"non-unique");
583 ++stats_matched_multi;
587 it2->setMetaValue(
"protein_references",
"unmatched");
589 if (stats_unmatched < 15)
LOG_INFO <<
"Unmatched peptide: " << it2->getSequence() <<
"\n";
590 else if (stats_unmatched == 15)
LOG_INFO <<
"Unmatched peptide: ...\n";
598 Size total_peptides = stats_count_m_t + stats_count_m_d + stats_count_m_td + stats_unmatched;
599 LOG_INFO <<
"-----------------------------------\n";
602 LOG_INFO <<
" unmatched : " << stats_unmatched <<
" (" << stats_unmatched * 100 / total_peptides <<
" %)\n";
604 LOG_INFO <<
" match to target DB only: " << stats_count_m_t <<
" (" << stats_count_m_t * 100 / total_peptides <<
" %)\n";
605 LOG_INFO <<
" match to decoy DB only : " << stats_count_m_d <<
" (" << stats_count_m_d * 100 / total_peptides <<
" %)\n";
606 LOG_INFO <<
" match to both : " << stats_count_m_td <<
" (" << stats_count_m_td * 100 / total_peptides <<
" %)\n";
608 LOG_INFO <<
" mapping to proteins:\n";
609 LOG_INFO <<
" no match (to 0 protein) : " << stats_unmatched <<
"\n";
610 LOG_INFO <<
" unique match (to 1 protein) : " << stats_matched_unique <<
"\n";
611 LOG_INFO <<
" non-unique match (to >1 protein): " << stats_matched_multi << std::endl;
614 Size stats_matched_proteins(0), stats_matched_new_proteins(0), stats_orphaned_proteins(0), stats_proteins_target(0), stats_proteins_decoy(0);
617 for (
Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
619 std::set<Size> masterset = runidx_to_protidx[run_idx];
621 std::vector<ProteinHit>& phits = prot_ids[run_idx].getHits();
624 std::vector<ProteinHit> orphaned_hits;
625 for (std::vector<ProteinHit>::iterator p_hit = phits.begin(); p_hit != phits.end(); ++p_hit)
627 const String& acc = p_hit->getAccession();
628 if (!acc_to_prot.
has(acc))
630 ++stats_orphaned_proteins;
631 if (keep_unreferenced_proteins_)
633 p_hit->setMetaValue(
"target_decoy",
"");
634 orphaned_hits.push_back(*p_hit);
639 phits = orphaned_hits;
644 phits.reserve(phits.size() + masterset.size());
645 for (std::set<Size>::const_iterator it = masterset.begin(); it != masterset.end(); ++it)
650 if (write_protein_sequence_ || write_protein_description_)
652 proteins.readAt(fe, *it);
653 if (write_protein_sequence_)
657 if (write_protein_description_)
662 if (protein_is_decoy[*it])
665 ++stats_proteins_decoy;
670 ++stats_proteins_target;
672 phits.push_back(hit);
673 ++stats_matched_new_proteins;
675 stats_matched_proteins += phits.size();
679 LOG_INFO <<
"-----------------------------------\n";
682 LOG_INFO <<
" total proteins searched: " << proteins.size() <<
"\n";
683 LOG_INFO <<
" matched proteins : " << stats_matched_proteins <<
" (" << stats_matched_new_proteins <<
" new)\n";
684 if (stats_matched_proteins)
686 LOG_INFO <<
" matched target proteins: " << stats_proteins_target <<
" (" << stats_proteins_target * 100 / stats_matched_proteins <<
" %)\n";
687 LOG_INFO <<
" matched decoy proteins : " << stats_proteins_decoy <<
" (" << stats_proteins_decoy * 100 / stats_matched_proteins <<
" %)\n";
689 LOG_INFO <<
" orphaned proteins : " << stats_orphaned_proteins << (keep_unreferenced_proteins_ ?
" (all kept)" :
" (all removed)\n");
690 LOG_INFO <<
"-----------------------------------" << std::endl;
694 bool has_error =
false;
696 if (invalid_protein_sequence)
698 LOG_ERROR <<
"Error: One or more protein sequences contained the characters '[' or '(', which are illegal in protein sequences." 699 <<
"\nPeptide hits might be masked by these characters (which usually indicate presence of modifications).\n";
703 if ((stats_count_m_d + stats_count_m_td) == 0)
705 String msg(
"No peptides were matched to the decoy portion of the database! Did you provide the correct concatenated database? Are your 'decoy_string' (=" +
String(decoy_string_) +
") and 'decoy_string_position' (=" +
String(param_.getValue(
"decoy_string_position")) +
") settings correct?");
706 if (missing_decoy_action_ ==
"error")
708 LOG_ERROR <<
"Error: " << msg <<
"\nSet 'missing_decoy_action' to 'warn' if you are sure this is ok!\nAborting ..." << std::endl;
711 else if (missing_decoy_action_ ==
"warn")
713 LOG_WARN <<
"Warn: " << msg <<
"\nSet 'missing_decoy_action' to 'error' if you want to elevate this to an error!" << std::endl;
720 if ((!allow_unmatched_) && (stats_unmatched > 0))
722 LOG_ERROR <<
"PeptideIndexer found unmatched peptides, which could not be associated to a protein.\n" 723 <<
"Potential solutions:\n" 724 <<
" - check your FASTA database for completeness\n" 725 <<
" - set 'enzyme:specificity' to match the identification parameters of the search engine\n" 726 <<
" - some engines (e.g. X! Tandem) employ loose cutting rules generating non-tryptic peptides;\n" 727 <<
" if you trust them, disable enzyme specificity\n" 728 <<
" - increase 'aaa_max' to allow more ambiguous amino acids\n" 729 <<
" - as a last resort: use the 'allow_unmatched' option to accept unmatched peptides\n" 730 <<
" (note that unmatched peptides cannot be used for FDR calculation or quantification)\n";
736 LOG_ERROR <<
"Result files will be written, but PeptideIndexer will exit with an error code." << std::endl;
737 return UNEXPECTED_RESULT;
742 const String& getDecoyString()
const;
744 bool isPrefix()
const;
756 std::vector<std::string> affixes = {
"decoy",
"dec",
"reverse",
"rev",
"__id_decoy",
"xxx",
"shuffled",
"shuffle",
"pseudo",
"random"};
764 contains_decoys_ =
true;
767 const std::string regexstr_prefix = std::string(
"^(") + ListUtils::concatenate<std::string>(affixes,
"_*|") +
"_*)";
768 const std::string regexstr_suffix = std::string(
"(") + ListUtils::concatenate<std::string>(affixes,
"_*|") +
"_*)$";
771 const boost::regex pattern_prefix(regexstr_prefix);
772 const boost::regex pattern_suffix(regexstr_suffix);
774 int all_prefix_occur(0), all_suffix_occur(0), all_proteins_count(0);
776 const size_t PROTEIN_CACHE_SIZE = 4e5;
780 proteins.cacheChunk(PROTEIN_CACHE_SIZE);
781 if (!proteins.activateCache())
break;
783 auto prot_count = (
SignedSize) proteins.chunkSize();
784 all_proteins_count += prot_count;
789 String seq = proteins.chunkAt(i).identifier;
796 bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
799 std::string match = sm[0];
803 decoy_count[match].first++;
807 decoy_case_sensitive[match] = seq_decoy;
811 bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
814 std::string match = sm[0];
818 decoy_count[match].second++;
822 decoy_case_sensitive[match] = seq_decoy;
829 for (
auto &a : decoy_count)
LOG_DEBUG << a.first <<
"\t" << a.second.first <<
"\t" << a.second.second << std::endl;
833 if (all_prefix_occur + all_suffix_occur < 0.4 * all_proteins_count) {
834 decoy_string_ =
"DECOY_";
837 contains_decoys_ =
false;
841 if (all_prefix_occur == all_suffix_occur)
843 LOG_ERROR <<
"Unable to determine decoy string!" << std::endl;
848 for (
const auto& pair : decoy_count)
850 const std::string & case_insensitive_decoy_string = pair.first;
851 const std::pair<int, int>& prefix_suffix_counts = pair.second;
852 double freq_prefix =
static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_prefix_occur);
853 double freq_prefix_in_proteins =
static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_proteins_count);
855 if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
858 decoy_string_ = decoy_case_sensitive[case_insensitive_decoy_string];
860 if (prefix_suffix_counts.first != all_prefix_occur)
862 LOG_WARN <<
"More than one decoy prefix observed!" << std::endl;
863 LOG_WARN <<
"Using most frequent decoy prefix (" << (int) (freq_prefix * 100) <<
"%)" << std::endl;
871 for (
const auto& pair : decoy_count)
873 const std::string& case_insensitive_decoy_string = pair.first;
874 const std::pair<int, int>& prefix_suffix_counts = pair.second;
875 double freq_suffix =
static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_suffix_occur);
876 double freq_suffix_in_proteins =
static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_proteins_count);
878 if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
881 decoy_string_ = decoy_case_sensitive[case_insensitive_decoy_string];
883 if (prefix_suffix_counts.second != all_suffix_occur)
885 LOG_WARN <<
"More than one decoy suffix observed!" << std::endl;
886 LOG_WARN <<
"Using most frequent decoy suffix (" << (int) (freq_suffix * 100) <<
"%)" << std::endl;
893 LOG_ERROR <<
"Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
922 else if (AABefore != other.
AABefore)
926 else if (AAAfter != other.
AAAfter)
928 return AAAfter < other.
AAAfter;
945 typedef std::map<OpenMS::Size, std::set<PeptideProteinMatchInformation> >
MapType;
964 pep_to_prot(), filter_passed(0), filter_rejected(0), enzyme_(enzyme), xtandem_(xtandem)
970 if (pep_to_prot.empty())
976 for (FoundProteinFunctor::MapType::const_iterator it = other.
pep_to_prot.begin(); it != other.
pep_to_prot.end(); ++it)
978 this->pep_to_prot[it->first].insert(other.
pep_to_prot[it->first].begin(), other.
pep_to_prot[it->first].end());
1002 pep_to_prot[idx_pep].insert(match);
1020 const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.
getHitDBIndex()];
1026 void updateMembers_()
override;
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
MapType pep_to_prot
peptide index –> protein indices
Definition: PeptideIndexing.h:948
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
String delta(const String &event="delta")
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
String description
Definition: FASTAFile.h:79
static String suffix(const String &this_s, size_t length)
Definition: StringUtils.h:269
Definition: PeptideIndexing.h:136
String & toLower()
Converts the string to lowercase.
std::map< OpenMS::Size, std::set< PeptideProteinMatchInformation > > MapType
Definition: PeptideIndexing.h:945
String sequence
Definition: FASTAFile.h:80
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:452
void addHit(const OpenMS::Size idx_pep, const OpenMS::Size idx_prot, const OpenMS::Size len_pep, const OpenMS::String &seq_prot, OpenMS::Int position)
Definition: PeptideIndexing.h:989
#define LOG_INFO
Macro to be used if information, e.g. a status, should be reported.
Definition: LogStream.h:456
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:130
bool has(Byte byte) const
true if String contains the byte, false otherwise
bool contains_decoys_
Definition: PeptideIndexing.h:749
OpenMS::Size filter_passed
number of accepted hits (passing addHit() constraints)
Definition: PeptideIndexing.h:951
#define LOG_ERROR
Macro to be used if non-fatal errors are reported (processing continues)
Definition: LogStream.h:448
OpenMS::Size protein_index
index of the protein the peptide is contained in
Definition: PeptideIndexing.h:901
Definition: PeptideIndexing.h:132
bool operator==(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:933
std::map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: PeptideIndexing.h:748
bool keep_unreferenced_proteins_
Definition: PeptideIndexing.h:1036
void setDescription(const String &description)
sets the description of the protein
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
Int mm_max_
Definition: PeptideIndexing.h:1041
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i...
Definition: AhoCorasickAmbiguous.h:970
double getClockTime() const
String & substitute(char from, char to)
Replaces all occurrences of the character from by the character to.
bool xtandem_
are we checking xtandem cleavage rules?
Definition: PeptideIndexing.h:960
Specificity getSpecificity() const
Returns the specificity for the digestion.
A more convenient string class.
Definition: String.h:58
bool IL_equivalent_
Definition: PeptideIndexing.h:1038
bool write_protein_sequence_
Definition: PeptideIndexing.h:1034
StopWatch Class.
Definition: StopWatch.h:59
static const char N_TERMINAL_AA
Definition: PeptideEvidence.h:60
FASTAContainer<TFI_Vector> simply takes an existing vector of FASTAEntries and provides the same inte...
Definition: FASTAContainer.h:237
bool findDecoyString_(FASTAContainer< T > &proteins)
Definition: PeptideIndexing.h:752
Definition: PeptideIndexing.h:898
void addHits_(AhoCorasickAmbiguous &fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern &pattern, const AhoCorasickAmbiguous::PeptideDB &pep_DB, const String &prot, const String &full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor &func_threads) const
Definition: PeptideIndexing.h:1015
String getEnzymeName() const
Returns the enzyme for the digestion.
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:76
bool write_protein_description_
Definition: PeptideIndexing.h:1035
std::map< std::string, std::pair< int, int > > DecoyStringToAffixCount
Definition: PeptideIndexing.h:747
String enzyme_name_
Definition: PeptideIndexing.h:1031
String decoy_string_
Definition: PeptideIndexing.h:1028
Representation of a protein hit.
Definition: ProteinHit.h:57
A convenience class to report either absolute or delta (between two timepoints) RAM usage...
Definition: SysInfo.h:83
Int aaa_max_
Definition: PeptideIndexing.h:1040
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
OpenMS::Int position
the position of the peptide in the protein
Definition: PeptideIndexing.h:904
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:54
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:124
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
ExitCodes run(FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/deco...
Definition: PeptideIndexing.h:190
#define LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:460
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
char AAAfter
the amino acid after the peptide in the protein
Definition: PeptideIndexing.h:910
bool operator<(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:912
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for n-term protein cleavage and random D...
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
semi specific, i.e., one of the two cleavage sites must fulfill requirements
Definition: EnzymaticDigestion.h:69
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:148
bool hasSuffix(const String &string) const
true if String ends with string, false otherwise
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
String missing_decoy_action_
Definition: PeptideIndexing.h:1030
void merge(FoundProteinFunctor &other)
Definition: PeptideIndexing.h:968
FoundProteinFunctor(const ProteaseDigestion &enzyme, bool xtandem)
Definition: PeptideIndexing.h:963
OpenMS::Size filter_rejected
number of rejected hits (not passing addHit())
Definition: PeptideIndexing.h:954
void setAccession(const String &accession)
sets the accession of the protein
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:91
static Specificity getSpecificityByName(const String &name)
Definition: PeptideIndexing.h:942
String substr(size_t pos=0, size_t n=npos) const
Wrapper for the STL substr() method. Returns a String object with its contents initialized to a subst...
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
void setSequence(const String &sequence)
sets the protein sequence
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
String enzyme_specificity_
Definition: PeptideIndexing.h:1032
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
ProteaseDigestion enzyme_
Definition: PeptideIndexing.h:957
int Int
Signed integer type.
Definition: Types.h:102
Definition: PeptideIndexing.h:137
Definition: PeptideIndexing.h:135
no requirements on start / end
Definition: EnzymaticDigestion.h:70
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
bool allow_unmatched_
Definition: PeptideIndexing.h:1037
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
Definition: PeptideIndexing.h:133
static String prefix(const String &this_s, size_t length)
Definition: StringUtils.h:260
Definition: PeptideIndexing.h:134
Representation of a peptide evidence.
Definition: PeptideEvidence.h:50
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:50
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
static const char C_TERMINAL_AA
Definition: PeptideEvidence.h:61
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
void after()
record data for the second timepoint
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
String toString() const
get a compact representation of the current time status.
bool has(const Key &key) const
Test whether the map contains the given key.
Definition: Map.h:108
static String & toUpper(String &this_s)
Definition: StringUtils.h:732
String & remove(char what)
Remove all occurrences of the character what.
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
bool prefix_
Definition: PeptideIndexing.h:1029
char AABefore
the amino acid before the peptide in the protein
Definition: PeptideIndexing.h:907