OpenMS  2.4.0
PeptideIndexing.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 
54 #include <OpenMS/SYSTEM/SysInfo.h>
55 
56 #include <atomic>
57 #include <algorithm>
58 #include <fstream>
59 
60 
61 namespace OpenMS
62 {
63 
124  class OPENMS_DLLAPI PeptideIndexing :
125  public DefaultParamHandler, public ProgressLogger
126  {
127 public:
128 
131  {
138  };
139 
141  PeptideIndexing();
142 
144  ~PeptideIndexing() override;
145 
146 
148  inline ExitCodes run(std::vector<FASTAFile::FASTAEntry>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
149  {
150  FASTAContainer<TFI_Vector> protein_container(proteins);
151  return run<TFI_Vector>(protein_container, prot_ids, pep_ids);
152  }
153 
189  template<typename T>
190  ExitCodes run(FASTAContainer<T>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
191  {
192  // no decoy string provided? try to deduce from data
193  if (decoy_string_.empty())
194  {
195  bool is_decoy_string_auto_successful = findDecoyString_(proteins);
196 
197  if (!is_decoy_string_auto_successful && contains_decoys_)
198  {
199  return DECOYSTRING_EMPTY;
200  }
201  else if (!is_decoy_string_auto_successful && !contains_decoys_)
202  {
203  LOG_WARN << "Unable to determine decoy string automatically, not enough decoys were detected! Using default " << (prefix_ ? "prefix" : "suffix") << " decoy string '" << decoy_string_ << "\n"
204  << "If you think that this is false, please provide a decoy_string and its position manually!" << std::endl;
205  }
206  else
207  {
208  // decoy string and position was extracted successfully
209  LOG_INFO << "Using " << (prefix_ ? "prefix" : "suffix") << " decoy string '" << decoy_string_ << "'" << std::endl;
210  }
211 
212  proteins.reset();
213  }
214 
215  //---------------------------------------------------------------
216  // parsing parameters, correcting xtandem and MSGFPlus parameters
217  //---------------------------------------------------------------
218  ProteaseDigestion enzyme;
219  enzyme.setEnzyme(enzyme_name_);
220  enzyme.setSpecificity(enzyme.getSpecificityByName(enzyme_specificity_));
221 
222  bool xtandem_fix_parameters = true, msgfplus_fix_parameters = true;
223 
224  // specificity is none or semi? don't automate xtandem
227  {
228  xtandem_fix_parameters = false;
229  }
230 
231  // enzyme is already Trypsin/P? don't automate MSGFPlus
232  if (enzyme.getEnzymeName() == "Trypsin/P") { msgfplus_fix_parameters = false; }
233 
234  // determine if search engine is solely xtandem or MSGFPlus
235  for (const auto& prot_id : prot_ids)
236  {
237  if (!msgfplus_fix_parameters && !xtandem_fix_parameters) { break; }
238  String se = prot_id.getSearchEngine();
239  std::string search_engine = StringUtils::toUpper(se);
240  if (search_engine != "XTANDEM") { xtandem_fix_parameters = false; }
241  if (search_engine != "MSGFPLUS" || "MS-GF+") { msgfplus_fix_parameters = false; }
242  }
243 
244  // solely MSGFPlus -> Trypsin P as enzyme
245  if (msgfplus_fix_parameters && enzyme.getEnzymeName() == "Trypsin")
246  {
247  LOG_WARN << "MSGFPlus detected but enzyme cutting rules were set to Trypsin. Correcting to Trypsin/P to copy with special cutting rule in MSGFPlus." << std::endl;
248  enzyme.setEnzyme("Trypsin/P");
249  }
250 
251  //-------------------------------------------------------------
252  // calculations
253  //-------------------------------------------------------------
254  // cache the first proteins
255  const size_t PROTEIN_CACHE_SIZE = 4e5; // 400k should be enough for most DB's and is not too hard on memory either (~200 MB FASTA)
256 
257  this->startProgress(0, 1, "Load first chunk");
258  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
259  this->endProgress();
260 
261  if (proteins.empty()) // we do not allow an empty database
262  {
263  LOG_ERROR << "Error: An empty database was provided. Mapping makes no sense. Aborting..." << std::endl;
264  return DATABASE_EMPTY;
265  }
266 
267  if (pep_ids.empty()) // Aho-Corasick requires non-empty input; but we allow this case, since the TOPP tool should not crash when encountering a bad raw file (with no PSMs)
268  {
269  LOG_WARN << "Warning: An empty set of peptide identifications was provided. Output will be empty as well." << std::endl;
270  if (!keep_unreferenced_proteins_)
271  {
272  // delete only protein hits, not whole ID runs incl. meta data:
273  for (std::vector<ProteinIdentification>::iterator it = prot_ids.begin();
274  it != prot_ids.end(); ++it)
275  {
276  it->getHits().clear();
277  }
278  }
279  return PEPTIDE_IDS_EMPTY;
280  }
281 
282  FoundProteinFunctor func(enzyme, xtandem_fix_parameters); // store the matches
283  Map<String, Size> acc_to_prot; // map: accessions --> FASTA protein index
284  std::vector<bool> protein_is_decoy; // protein index -> is decoy?
285  std::vector<std::string> protein_accessions; // protein index -> accession
286 
287  bool invalid_protein_sequence = false; // check for proteins with modifications, i.e. '[' or '(', and throw an exception
288 
289  { // new scope - forget data after search
290 
291  /*
292  BUILD Peptide DB
293  */
294  bool has_illegal_AAs(false);
296  for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
297  {
298  //String run_id = it1->getIdentifier();
299  const std::vector<PeptideHit>& hits = it1->getHits();
300  for (std::vector<PeptideHit>::const_iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
301  {
302  //
303  // Warning:
304  // do not skip over peptides here, since the results are iterated in the same way
305  //
306  String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence!
307  if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
308  { // do not quit here, to show the user all sequences .. only quit after loop
309  LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n";
310  has_illegal_AAs = true;
311  }
312  if (IL_equivalent_) // convert L to I;
313  {
314  seq.substitute('L', 'I');
315  }
316  appendValue(pep_DB, seq.c_str());
317  }
318  }
319  if (has_illegal_AAs)
320  {
321  LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!"
322  << "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
323  }
324 
325  LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;
326 
327  if (length(pep_DB) == 0)
328  { // Aho-Corasick will crash if given empty needles as input
329  LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
330  return PEPTIDE_IDS_EMPTY;
331  }
332 
333  /*
334  Aho Corasick (fast)
335  */
336  LOG_INFO << "Searching with up to " << aaa_max_ << " ambiguous amino acid(s) and " << mm_max_ << " mismatch(es)!" << std::endl;
338  LOG_INFO << "Building trie ...";
339  StopWatch s;
340  s.start();
342  AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
343  s.stop();
344  LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl;
345  s.reset();
346 
347  uint16_t count_j_proteins(0);
348  bool has_active_data = true; // becomes false if end of FASTA file is reached
349  const std::string jumpX(aaa_max_ + mm_max_ + 1, 'X'); // jump over stretches of 'X' which cost a lot of time; +1 because AXXA is a valid hit for aaa_max == 2 (cannot split it)
350  // use very large target value for progress if DB size is unknown (did not fit into first chunk)
351  this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(), "Aho-Corasick");
352  std::atomic<int> progress_prots(0);
353 #ifdef _OPENMP
354 #pragma omp parallel
355 #endif
356  {
357  FoundProteinFunctor func_threads(enzyme, xtandem_fix_parameters);
358  Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index
359  AhoCorasickAmbiguous fuzzyAC;
360  String prot;
361 
362  while (true)
363  {
364  #pragma omp barrier // all threads need to be here, since we are about to swap protein data
365  #pragma omp single
366  {
367  DEBUG_ONLY std::cerr << " activating cache ...\n";
368  has_active_data = proteins.activateCache(); // swap in last cache
369  protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
370  } // implicit barrier here
371 
372  if (!has_active_data) break; // leave while-loop
373  SignedSize prot_count = (SignedSize)proteins.chunkSize();
374 
375  #pragma omp master
376  {
377  DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
378  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
379  protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
380  for (SignedSize i = 0; i < prot_count; ++i)
381  { // do this in master only, to avoid false sharing
382  const String& seq = proteins.chunkAt(i).identifier;
383  protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_));
384  }
385  DEBUG_ONLY std::cerr << " done" << std::endl;
386  }
387  DEBUG_ONLY std::cerr << " starting for loop \n";
388  // search all peptides in each protein
389  #pragma omp for schedule(dynamic, 100) nowait
390  for (SignedSize i = 0; i < prot_count; ++i)
391  {
392  ++progress_prots; // atomic
393  if (omp_get_thread_num() == 0)
394  {
395  this->setProgress(progress_prots);
396  }
397 
398  prot = proteins.chunkAt(i).sequence;
399  prot.remove('*');
400 
401  // check for invalid sequences with modifications
402  if (prot.has('[') || prot.has('('))
403  {
404  invalid_protein_sequence = true; // not omp-critical because its write-only
405  // we cannot throw an exception here, since we'd need to catch it within the parallel region
406  }
407 
408  // convert L/J to I; also replace 'J' in proteins
409  if (IL_equivalent_)
410  {
411  prot.substitute('L', 'I');
412  prot.substitute('J', 'I');
413  }
414  else
415  { // warn if 'J' is found (it eats into aaa_max)
416  if (prot.has('J'))
417  {
418  #pragma omp atomic
419  ++count_j_proteins;
420  }
421  }
422 
423  Size prot_idx = i + proteins.getChunkOffset();
424 
425  // test if protein was a hit
426  Size hits_total = func_threads.filter_passed + func_threads.filter_rejected;
427 
428  // check if there are stretches of 'X'
429  if (prot.has('X'))
430  {
431  // create chunks of the protein (splitting it at stretches of 'X..X') and feed them to AC one by one
432  size_t offset = -1, start = 0;
433  while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
434  {
435  //std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n";
436  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
437  // skip ahead while we encounter more X...
438  while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset;
439  start = offset;
440  //std::cout << " new start: " << start << "\n";
441  }
442  // last chunk
443  if (start < prot.size())
444  {
445  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
446  }
447  }
448  else
449  {
450  addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
451  }
452  // was protein found?
453  if (hits_total < func_threads.filter_passed + func_threads.filter_rejected)
454  {
455  protein_accessions[prot_idx] = proteins.chunkAt(i).identifier;
456  acc_to_prot_thread[protein_accessions[prot_idx]] = prot_idx;
457  }
458  } // end parallel FOR
459 
460  // join results again
461  DEBUG_ONLY std::cerr << " critical now \n";
462  #ifdef _OPENMP
463  #pragma omp critical(PeptideIndexer_joinAC)
464  #endif
465  {
466  s.start();
467  // hits
468  func.merge(func_threads);
469  // accession -> index
470  acc_to_prot.insert(acc_to_prot_thread.begin(), acc_to_prot_thread.end());
471  acc_to_prot_thread.clear();
472  s.stop();
473  } // OMP end critical
474  } // end readChunk
475  } // OMP end parallel
476  this->endProgress();
477  std::cout << "Merge took: " << s.toString() << "\n";
478  mu.after();
479  std::cout << mu.delta("Aho-Corasick") << "\n\n";
480 
481  LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n";
482 
483  // write some stats
484  LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n"
485  << " ... rejected by enzyme filter: " << func.filter_rejected << std::endl;
486 
487  if (count_j_proteins)
488  {
489  LOG_WARN << "PeptideIndexer found " << count_j_proteins << " protein sequences in your database containing the amino acid 'J'."
490  << "To match 'J' in a protein, an ambiguous amino acid placeholder for I/L will be used.\n"
491  << "This costs runtime and eats into the 'aaa_max' limit, leaving less opportunity for B/Z/X matches.\n"
492  << "If you want 'J' to be treated as unambiguous, enable '-IL_equivalent'!" << std::endl;
493  }
494 
495  } // end local scope
496 
497  //
498  // do mapping
499  //
500  // index existing proteins
501  Map<String, Size> runid_to_runidx; // identifier to index
502  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
503  {
504  runid_to_runidx[prot_ids[run_idx].getIdentifier()] = run_idx;
505  }
506 
507  // for peptides --> proteins
508  Size stats_matched_unique(0);
509  Size stats_matched_multi(0);
510  Size stats_unmatched(0); // no match to DB
511  Size stats_count_m_t(0); // match to Target DB
512  Size stats_count_m_d(0); // match to Decoy DB
513  Size stats_count_m_td(0); // match to T+D DB
514 
515  Map<Size, std::set<Size> > runidx_to_protidx; // in which protID do appear which proteins (according to mapped peptides)
516 
517  Size pep_idx(0);
518  for (std::vector<PeptideIdentification>::iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
519  {
520  // which ProteinIdentification does the peptide belong to?
521  Size run_idx = runid_to_runidx[it1->getIdentifier()];
522 
523  std::vector<PeptideHit>& hits = it1->getHits();
524 
525  for (std::vector<PeptideHit>::iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
526  {
527  // clear protein accessions
528  it2->setPeptideEvidences(std::vector<PeptideEvidence>());
529 
530  //
531  // is this a decoy hit?
532  //
533  bool matches_target(false);
534  bool matches_decoy(false);
535 
536  std::set<Size> prot_indices;
537  // add new protein references
538  for (std::set<PeptideProteinMatchInformation>::const_iterator it_i = func.pep_to_prot[pep_idx].begin();
539  it_i != func.pep_to_prot[pep_idx].end(); ++it_i)
540  {
541  prot_indices.insert(it_i->protein_index);
542  const String& accession = protein_accessions[it_i->protein_index];
543  PeptideEvidence pe(accession, it_i->position, it_i->position + (int)it2->getSequence().size() - 1, it_i->AABefore, it_i->AAAfter);
544  it2->addPeptideEvidence(pe);
545 
546  runidx_to_protidx[run_idx].insert(it_i->protein_index); // fill protein hits
547 
548  if (protein_is_decoy[it_i->protein_index])
549  {
550  matches_decoy = true;
551  }
552  else
553  {
554  matches_target = true;
555  }
556  }
557 
558  if (matches_decoy && matches_target)
559  {
560  it2->setMetaValue("target_decoy", "target+decoy");
561  ++stats_count_m_td;
562  }
563  else if (matches_target)
564  {
565  it2->setMetaValue("target_decoy", "target");
566  ++stats_count_m_t;
567  }
568  else if (matches_decoy)
569  {
570  it2->setMetaValue("target_decoy", "decoy");
571  ++stats_count_m_d;
572  } // else: could match to no protein (i.e. both are false)
573  //else ... // not required (handled below; see stats_unmatched);
574 
575  if (prot_indices.size() == 1)
576  {
577  it2->setMetaValue("protein_references", "unique");
578  ++stats_matched_unique;
579  }
580  else if (prot_indices.size() > 1)
581  {
582  it2->setMetaValue("protein_references", "non-unique");
583  ++stats_matched_multi;
584  }
585  else
586  {
587  it2->setMetaValue("protein_references", "unmatched");
588  ++stats_unmatched;
589  if (stats_unmatched < 15) LOG_INFO << "Unmatched peptide: " << it2->getSequence() << "\n";
590  else if (stats_unmatched == 15) LOG_INFO << "Unmatched peptide: ...\n";
591  }
592 
593  ++pep_idx; // next hit
594  }
595 
596  }
597 
598  Size total_peptides = stats_count_m_t + stats_count_m_d + stats_count_m_td + stats_unmatched;
599  LOG_INFO << "-----------------------------------\n";
600  LOG_INFO << "Peptide statistics\n";
601  LOG_INFO << "\n";
602  LOG_INFO << " unmatched : " << stats_unmatched << " (" << stats_unmatched * 100 / total_peptides << " %)\n";
603  LOG_INFO << " target/decoy:\n";
604  LOG_INFO << " match to target DB only: " << stats_count_m_t << " (" << stats_count_m_t * 100 / total_peptides << " %)\n";
605  LOG_INFO << " match to decoy DB only : " << stats_count_m_d << " (" << stats_count_m_d * 100 / total_peptides << " %)\n";
606  LOG_INFO << " match to both : " << stats_count_m_td << " (" << stats_count_m_td * 100 / total_peptides << " %)\n";
607  LOG_INFO << "\n";
608  LOG_INFO << " mapping to proteins:\n";
609  LOG_INFO << " no match (to 0 protein) : " << stats_unmatched << "\n";
610  LOG_INFO << " unique match (to 1 protein) : " << stats_matched_unique << "\n";
611  LOG_INFO << " non-unique match (to >1 protein): " << stats_matched_multi << std::endl;
612 
614  Size stats_matched_proteins(0), stats_matched_new_proteins(0), stats_orphaned_proteins(0), stats_proteins_target(0), stats_proteins_decoy(0);
615 
616  // all peptides contain the correct protein hit references, now update the protein hits
617  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
618  {
619  std::set<Size> masterset = runidx_to_protidx[run_idx]; // all protein matches from above
620 
621  std::vector<ProteinHit>& phits = prot_ids[run_idx].getHits();
622  {
623  // go through existing protein hits and count orphaned proteins (with no peptide hits)
624  std::vector<ProteinHit> orphaned_hits;
625  for (std::vector<ProteinHit>::iterator p_hit = phits.begin(); p_hit != phits.end(); ++p_hit)
626  {
627  const String& acc = p_hit->getAccession();
628  if (!acc_to_prot.has(acc)) // acc_to_prot only contains found proteins from current run
629  { // old hit is orphaned
630  ++stats_orphaned_proteins;
631  if (keep_unreferenced_proteins_)
632  {
633  p_hit->setMetaValue("target_decoy", "");
634  orphaned_hits.push_back(*p_hit);
635  }
636  }
637  }
638  // only keep orphaned hits (if any)
639  phits = orphaned_hits;
640  }
641 
642  // add new protein hits
644  phits.reserve(phits.size() + masterset.size());
645  for (std::set<Size>::const_iterator it = masterset.begin(); it != masterset.end(); ++it)
646  {
647  ProteinHit hit;
648  hit.setAccession(protein_accessions[*it]);
649 
650  if (write_protein_sequence_ || write_protein_description_)
651  {
652  proteins.readAt(fe, *it);
653  if (write_protein_sequence_)
654  {
655  hit.setSequence(fe.sequence);
656  } // no else, since sequence is empty by default
657  if (write_protein_description_)
658  {
659  hit.setDescription(fe.description);
660  } // no else, since description is empty by default
661  }
662  if (protein_is_decoy[*it])
663  {
664  hit.setMetaValue("target_decoy", "decoy");
665  ++stats_proteins_decoy;
666  }
667  else
668  {
669  hit.setMetaValue("target_decoy", "target");
670  ++stats_proteins_target;
671  }
672  phits.push_back(hit);
673  ++stats_matched_new_proteins;
674  }
675  stats_matched_proteins += phits.size();
676  }
677 
678 
679  LOG_INFO << "-----------------------------------\n";
680  LOG_INFO << "Protein statistics\n";
681  LOG_INFO << "\n";
682  LOG_INFO << " total proteins searched: " << proteins.size() << "\n";
683  LOG_INFO << " matched proteins : " << stats_matched_proteins << " (" << stats_matched_new_proteins << " new)\n";
684  if (stats_matched_proteins)
685  { // prevent Division-by-0 Exception
686  LOG_INFO << " matched target proteins: " << stats_proteins_target << " (" << stats_proteins_target * 100 / stats_matched_proteins << " %)\n";
687  LOG_INFO << " matched decoy proteins : " << stats_proteins_decoy << " (" << stats_proteins_decoy * 100 / stats_matched_proteins << " %)\n";
688  }
689  LOG_INFO << " orphaned proteins : " << stats_orphaned_proteins << (keep_unreferenced_proteins_ ? " (all kept)" : " (all removed)\n");
690  LOG_INFO << "-----------------------------------" << std::endl;
691 
692 
694  bool has_error = false;
695 
696  if (invalid_protein_sequence)
697  {
698  LOG_ERROR << "Error: One or more protein sequences contained the characters '[' or '(', which are illegal in protein sequences."
699  << "\nPeptide hits might be masked by these characters (which usually indicate presence of modifications).\n";
700  has_error = true;
701  }
702 
703  if ((stats_count_m_d + stats_count_m_td) == 0)
704  {
705  String msg("No peptides were matched to the decoy portion of the database! Did you provide the correct concatenated database? Are your 'decoy_string' (=" + String(decoy_string_) + ") and 'decoy_string_position' (=" + String(param_.getValue("decoy_string_position")) + ") settings correct?");
706  if (missing_decoy_action_ == "error")
707  {
708  LOG_ERROR << "Error: " << msg << "\nSet 'missing_decoy_action' to 'warn' if you are sure this is ok!\nAborting ..." << std::endl;
709  has_error = true;
710  }
711  else if (missing_decoy_action_ == "warn")
712  {
713  LOG_WARN << "Warn: " << msg << "\nSet 'missing_decoy_action' to 'error' if you want to elevate this to an error!" << std::endl;
714  }
715  else // silent
716  {
717  }
718  }
719 
720  if ((!allow_unmatched_) && (stats_unmatched > 0))
721  {
722  LOG_ERROR << "PeptideIndexer found unmatched peptides, which could not be associated to a protein.\n"
723  << "Potential solutions:\n"
724  << " - check your FASTA database for completeness\n"
725  << " - set 'enzyme:specificity' to match the identification parameters of the search engine\n"
726  << " - some engines (e.g. X! Tandem) employ loose cutting rules generating non-tryptic peptides;\n"
727  << " if you trust them, disable enzyme specificity\n"
728  << " - increase 'aaa_max' to allow more ambiguous amino acids\n"
729  << " - as a last resort: use the 'allow_unmatched' option to accept unmatched peptides\n"
730  << " (note that unmatched peptides cannot be used for FDR calculation or quantification)\n";
731  has_error = true;
732  }
733 
734  if (has_error)
735  {
736  LOG_ERROR << "Result files will be written, but PeptideIndexer will exit with an error code." << std::endl;
737  return UNEXPECTED_RESULT;
738  }
739  return EXECUTION_OK;
740  }
741 
742  const String& getDecoyString() const;
743 
744  bool isPrefix() const;
745 
746  protected:
747  using DecoyStringToAffixCount = std::map<std::string, std::pair<int, int>>;
748  using CaseInsensitiveToCaseSensitiveDecoy = std::map<std::string, std::string>;
750 
751  template<typename T>
753  {
754  // common decoy strings in FASTA files
755  // note: decoy prefixes/suffices must be provided in lower case
756  std::vector<std::string> affixes = {"decoy", "dec", "reverse", "rev", "__id_decoy", "xxx", "shuffled", "shuffle", "pseudo", "random"};
757 
758  // map decoys to counts of occurrences as prefix/suffix
759  DecoyStringToAffixCount decoy_count;
760  // map case insensitive strings back to original case (as used in fasta)
761  CaseInsensitiveToCaseSensitiveDecoy decoy_case_sensitive;
762 
763  // assume that it contains decoys for now
764  contains_decoys_ = true;
765 
766  // setup prefix- and suffix regex strings
767  const std::string regexstr_prefix = std::string("^(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)";
768  const std::string regexstr_suffix = std::string("(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)$";
769 
770  // setup regexes
771  const boost::regex pattern_prefix(regexstr_prefix);
772  const boost::regex pattern_suffix(regexstr_suffix);
773 
774  int all_prefix_occur(0), all_suffix_occur(0), all_proteins_count(0);
775 
776  const size_t PROTEIN_CACHE_SIZE = 4e5;
777 
778  while (true)
779  {
780  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
781  if (!proteins.activateCache()) break;
782 
783  auto prot_count = (SignedSize) proteins.chunkSize();
784  all_proteins_count += prot_count;
785 
786  {
787  for (SignedSize i = 0; i < prot_count; ++i)
788  {
789  String seq = proteins.chunkAt(i).identifier;
790 
791  String seq_lower = seq;
792  seq_lower.toLower();
793 
794  boost::smatch sm;
795  // search for prefix
796  bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
797  if (found_prefix)
798  {
799  std::string match = sm[0];
800  all_prefix_occur++;
801 
802  // increase count of observed prefix
803  decoy_count[match].first++;
804 
805  // store observed (case sensitive and with special characters)
806  std::string seq_decoy = StringUtils::prefix(seq, match.length());
807  decoy_case_sensitive[match] = seq_decoy;
808  }
809 
810  // search for suffix
811  bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
812  if (found_suffix)
813  {
814  std::string match = sm[0];
815  all_suffix_occur++;
816 
817  // increase count of observed suffix
818  decoy_count[match].second++;
819 
820  // store observed (case sensitive and with special characters)
821  std::string seq_decoy = StringUtils::suffix(seq, match.length());
822  decoy_case_sensitive[match] = seq_decoy;
823  }
824  }
825  }
826  }
827 
828  // DEBUG ONLY: print counts of found decoys
829  for (auto &a : decoy_count) LOG_DEBUG << a.first << "\t" << a.second.first << "\t" << a.second.second << std::endl;
830 
831  // less than 40% of proteins are decoys -> won't be able to determine a decoy string and its position
832  // return default values
833  if (all_prefix_occur + all_suffix_occur < 0.4 * all_proteins_count) {
834  decoy_string_ = "DECOY_";
835  prefix_ = true;
836 
837  contains_decoys_ = false;
838  return false;
839  }
840 
841  if (all_prefix_occur == all_suffix_occur)
842  {
843  LOG_ERROR << "Unable to determine decoy string!" << std::endl;
844  return false;
845  }
846 
847  // Decoy prefix occurred at least 80% of all prefixes + observed in at least 40% of all proteins -> set it as prefix decoy
848  for (const auto& pair : decoy_count)
849  {
850  const std::string & case_insensitive_decoy_string = pair.first;
851  const std::pair<int, int>& prefix_suffix_counts = pair.second;
852  double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_prefix_occur);
853  double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_proteins_count);
854 
855  if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
856  {
857  prefix_ = true;
858  decoy_string_ = decoy_case_sensitive[case_insensitive_decoy_string];
859 
860  if (prefix_suffix_counts.first != all_prefix_occur)
861  {
862  LOG_WARN << "More than one decoy prefix observed!" << std::endl;
863  LOG_WARN << "Using most frequent decoy prefix (" << (int) (freq_prefix * 100) <<"%)" << std::endl;
864  }
865 
866  return true;
867  }
868  }
869 
870  // Decoy suffix occurred at least 80% of all suffixes + observed in at least 40% of all proteins -> set it as suffix decoy
871  for (const auto& pair : decoy_count)
872  {
873  const std::string& case_insensitive_decoy_string = pair.first;
874  const std::pair<int, int>& prefix_suffix_counts = pair.second;
875  double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_suffix_occur);
876  double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_proteins_count);
877 
878  if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
879  {
880  prefix_ = false;
881  decoy_string_ = decoy_case_sensitive[case_insensitive_decoy_string];
882 
883  if (prefix_suffix_counts.second != all_suffix_occur)
884  {
885  LOG_WARN << "More than one decoy suffix observed!" << std::endl;
886  LOG_WARN << "Using most frequent decoy suffix (" << (int) (freq_suffix * 100) <<"%)" << std::endl;
887  }
888 
889  return true;
890  }
891  }
892 
893  LOG_ERROR << "Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
894  return false;
895  }
896 
897 
899  {
902 
905 
907  char AABefore;
908 
910  char AAAfter;
911 
912  bool operator<(const PeptideProteinMatchInformation& other) const
913  {
914  if (protein_index != other.protein_index)
915  {
916  return protein_index < other.protein_index;
917  }
918  else if (position != other.position)
919  {
920  return position < other.position;
921  }
922  else if (AABefore != other.AABefore)
923  {
924  return AABefore < other.AABefore;
925  }
926  else if (AAAfter != other.AAAfter)
927  {
928  return AAAfter < other.AAAfter;
929  }
930  return false;
931  }
932 
934  {
935  return protein_index == other.protein_index &&
936  position == other.position &&
937  AABefore == other.AABefore &&
938  AAAfter == other.AAAfter;
939  }
940 
941  };
943  {
944  public:
945  typedef std::map<OpenMS::Size, std::set<PeptideProteinMatchInformation> > MapType;
946 
949 
952 
955 
956  private:
958 
960  bool xtandem_;
961 
962  public:
963  explicit FoundProteinFunctor(const ProteaseDigestion& enzyme, bool xtandem) :
964  pep_to_prot(), filter_passed(0), filter_rejected(0), enzyme_(enzyme), xtandem_(xtandem)
965  {
966  }
967 
969  {
970  if (pep_to_prot.empty())
971  { // first merge is easy
972  pep_to_prot.swap(other.pep_to_prot);
973  }
974  else
975  {
976  for (FoundProteinFunctor::MapType::const_iterator it = other.pep_to_prot.begin(); it != other.pep_to_prot.end(); ++it)
977  { // augment set
978  this->pep_to_prot[it->first].insert(other.pep_to_prot[it->first].begin(), other.pep_to_prot[it->first].end());
979  }
980  other.pep_to_prot.clear();
981  }
982  // cheap members
983  this->filter_passed += other.filter_passed;
984  other.filter_passed = 0;
985  this->filter_rejected += other.filter_rejected;
986  other.filter_rejected = 0;
987  }
988 
989  void addHit(const OpenMS::Size idx_pep,
990  const OpenMS::Size idx_prot,
991  const OpenMS::Size len_pep,
992  const OpenMS::String& seq_prot,
994  {
995  if (enzyme_.isValidProduct(seq_prot, position, len_pep, true, true, xtandem_))
996  {
998  match.protein_index = idx_prot;
999  match.position = position;
1000  match.AABefore = (position == 0) ? PeptideEvidence::N_TERMINAL_AA : seq_prot[position - 1];
1001  match.AAAfter = (position + len_pep >= seq_prot.size()) ? PeptideEvidence::C_TERMINAL_AA : seq_prot[position + len_pep];
1002  pep_to_prot[idx_pep].insert(match);
1003  ++filter_passed;
1004  }
1005  else
1006  {
1007  //std::cerr << "REJECTED Peptide " << seq_pep << " with hit to protein "
1008  // << seq_prot << " at position " << position << std::endl;
1009  ++filter_rejected;
1010  }
1011  }
1012 
1013  };
1014 
1015  inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
1016  {
1017  fuzzyAC.setProtein(prot);
1018  while (fuzzyAC.findNext(pattern))
1019  {
1020  const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()];
1021  func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset);
1022  }
1023 
1024  }
1025 
1026  void updateMembers_() override;
1027 
1029  bool prefix_;
1033 
1039 
1042 
1043  };
1044 }
1045 
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
MapType pep_to_prot
peptide index –> protein indices
Definition: PeptideIndexing.h:948
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
String delta(const String &event="delta")
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
String description
Definition: FASTAFile.h:79
static String suffix(const String &this_s, size_t length)
Definition: StringUtils.h:269
Definition: PeptideIndexing.h:136
String & toLower()
Converts the string to lowercase.
std::map< OpenMS::Size, std::set< PeptideProteinMatchInformation > > MapType
Definition: PeptideIndexing.h:945
String sequence
Definition: FASTAFile.h:80
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:452
void addHit(const OpenMS::Size idx_pep, const OpenMS::Size idx_prot, const OpenMS::Size len_pep, const OpenMS::String &seq_prot, OpenMS::Int position)
Definition: PeptideIndexing.h:989
#define LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:456
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:130
bool has(Byte byte) const
true if String contains the byte, false otherwise
bool contains_decoys_
Definition: PeptideIndexing.h:749
OpenMS::Size filter_passed
number of accepted hits (passing addHit() constraints)
Definition: PeptideIndexing.h:951
#define LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:448
OpenMS::Size protein_index
index of the protein the peptide is contained in
Definition: PeptideIndexing.h:901
Definition: PeptideIndexing.h:132
bool operator==(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:933
std::map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: PeptideIndexing.h:748
bool keep_unreferenced_proteins_
Definition: PeptideIndexing.h:1036
void setDescription(const String &description)
sets the description of the protein
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
Int mm_max_
Definition: PeptideIndexing.h:1041
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i...
Definition: AhoCorasickAmbiguous.h:970
double getClockTime() const
String & substitute(char from, char to)
Replaces all occurrences of the character from by the character to.
bool xtandem_
are we checking xtandem cleavage rules?
Definition: PeptideIndexing.h:960
Specificity getSpecificity() const
Returns the specificity for the digestion.
A more convenient string class.
Definition: String.h:58
bool IL_equivalent_
Definition: PeptideIndexing.h:1038
bool write_protein_sequence_
Definition: PeptideIndexing.h:1034
StopWatch Class.
Definition: StopWatch.h:59
static const char N_TERMINAL_AA
Definition: PeptideEvidence.h:60
FASTAContainer<TFI_Vector> simply takes an existing vector of FASTAEntries and provides the same inte...
Definition: FASTAContainer.h:237
bool findDecoyString_(FASTAContainer< T > &proteins)
Definition: PeptideIndexing.h:752
void addHits_(AhoCorasickAmbiguous &fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern &pattern, const AhoCorasickAmbiguous::PeptideDB &pep_DB, const String &prot, const String &full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor &func_threads) const
Definition: PeptideIndexing.h:1015
String getEnzymeName() const
Returns the enzyme for the digestion.
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:76
bool write_protein_description_
Definition: PeptideIndexing.h:1035
std::map< std::string, std::pair< int, int > > DecoyStringToAffixCount
Definition: PeptideIndexing.h:747
String enzyme_name_
Definition: PeptideIndexing.h:1031
String decoy_string_
Definition: PeptideIndexing.h:1028
Representation of a protein hit.
Definition: ProteinHit.h:57
A convenience class to report either absolute or delta (between two timepoints) RAM usage...
Definition: SysInfo.h:83
Int aaa_max_
Definition: PeptideIndexing.h:1040
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
OpenMS::Int position
the position of the peptide in the protein
Definition: PeptideIndexing.h:904
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:54
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:124
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
ExitCodes run(FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/deco...
Definition: PeptideIndexing.h:190
#define LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:460
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
char AAAfter
the amino acid before the peptide in the protein
Definition: PeptideIndexing.h:910
bool operator<(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:912
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for n-term protein cleavage and random D...
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
semi specific, i.e., one of the two cleavage sites must fulfill requirements
Definition: EnzymaticDigestion.h:69
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:148
bool hasSuffix(const String &string) const
true if String ends with string, false otherwise
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
String missing_decoy_action_
Definition: PeptideIndexing.h:1030
void merge(FoundProteinFunctor &other)
Definition: PeptideIndexing.h:968
FoundProteinFunctor(const ProteaseDigestion &enzyme, bool xtandem)
Definition: PeptideIndexing.h:963
OpenMS::Size filter_rejected
number of rejected hits (not passing addHit())
Definition: PeptideIndexing.h:954
void setAccession(const String &accession)
sets the accession of the protein
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:91
static Specificity getSpecificityByName(const String &name)
Definition: PeptideIndexing.h:942
String substr(size_t pos=0, size_t n=npos) const
Wrapper for the STL substr() method. Returns a String object with its contents initialized to a subst...
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
void setSequence(const String &sequence)
sets the protein sequence
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
String enzyme_specificity_
Definition: PeptideIndexing.h:1032
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
ProteaseDigestion enzyme_
Definition: PeptideIndexing.h:957
int Int
Signed integer type.
Definition: Types.h:102
Definition: PeptideIndexing.h:137
Definition: PeptideIndexing.h:135
no requirements on start / end
Definition: EnzymaticDigestion.h:70
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
bool allow_unmatched_
Definition: PeptideIndexing.h:1037
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
Definition: PeptideIndexing.h:133
static String prefix(const String &this_s, size_t length)
Definition: StringUtils.h:260
Definition: PeptideIndexing.h:134
Representation of a peptide evidence.
Definition: PeptideEvidence.h:50
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:50
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
static const char C_TERMINAL_AA
Definition: PeptideEvidence.h:61
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
void after()
record data for the second timepoint
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
String toString() const
get a compact representation of the current time status.
bool has(const Key &key) const
Test whether the map contains the given key.
Definition: Map.h:108
static String & toUpper(String &this_s)
Definition: StringUtils.h:732
String & remove(char what)
Remove all occurrences of the character what.
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
bool prefix_
Definition: PeptideIndexing.h:1029
char AABefore
the amino acid after the peptide in the protein
Definition: PeptideIndexing.h:907