Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
PeptideIndexing.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2017.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Andreas Bertsch, Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 
53 #include <OpenMS/SYSTEM/SysInfo.h>
54 
55 #include <atomic>
56 #include <algorithm>
57 #include <fstream>
58 
59 namespace OpenMS
60 {
61 
118  class OPENMS_DLLAPI PeptideIndexing :
119  public DefaultParamHandler, public ProgressLogger
120  {
121 public:
122 
125  {
131  UNEXPECTED_RESULT
132  };
133 
135  PeptideIndexing();
136 
138  ~PeptideIndexing() override;
139 
141  inline ExitCodes run(std::vector<FASTAFile::FASTAEntry>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
142  {
143  FASTAContainer<TFI_Vector> protein_container(proteins);
144  return run<TFI_Vector>(protein_container, prot_ids, pep_ids);
145  }
146 
182  template<typename T>
183  ExitCodes run(FASTAContainer<T>& proteins, std::vector<ProteinIdentification>& prot_ids, std::vector<PeptideIdentification>& pep_ids)
184  {
185  //-------------------------------------------------------------
186  // parsing parameters
187  //-------------------------------------------------------------
188  ProteaseDigestion enzyme;
189  enzyme.setEnzyme(enzyme_name_);
190  enzyme.setSpecificity(enzyme.getSpecificityByName(enzyme_specificity_));
191 
192  const size_t PROTEIN_CACHE_SIZE = 4e5; // 400k should be enough for most DB's and is not too hard on memory either (~200 MB FASTA)
193 
194  //-------------------------------------------------------------
195  // calculations
196  //-------------------------------------------------------------
197  // cache the first proteins
198  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
199 
200  if (proteins.empty()) // we do not allow an empty database
201  {
202  LOG_ERROR << "Error: An empty database was provided. Mapping makes no sense. Aborting..." << std::endl;
203  return DATABASE_EMPTY;
204  }
205 
206  if (pep_ids.empty()) // Aho-Corasick requires non-empty input; but we allow this case, since the TOPP tool should not crash when encountering a bad raw file (with no PSMs)
207  {
208  LOG_WARN << "Warning: An empty set of peptide identifications was provided. Output will be empty as well." << std::endl;
209  if (!keep_unreferenced_proteins_)
210  {
211  // delete only protein hits, not whole ID runs incl. meta data:
212  for (std::vector<ProteinIdentification>::iterator it = prot_ids.begin();
213  it != prot_ids.end(); ++it)
214  {
215  it->getHits().clear();
216  }
217  }
218  return PEPTIDE_IDS_EMPTY;
219  }
220 
221  FoundProteinFunctor func(enzyme); // store the matches
222  Map<String, Size> acc_to_prot; // map: accessions --> FASTA protein index
223  std::vector<bool> protein_is_decoy; // protein index -> is decoy?
224  std::vector<std::string> protein_accessions; // protein index -> accession
225 
226  bool invalid_protein_sequence = false; // check for proteins with modifications, i.e. '[' or '(', and throw an exception
227 
228  { // new scope - forget data after search
229 
230  /*
231  BUILD Peptide DB
232  */
233  bool has_illegal_AAs(false);
235  for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
236  {
237  //String run_id = it1->getIdentifier();
238  const std::vector<PeptideHit>& hits = it1->getHits();
239  for (std::vector<PeptideHit>::const_iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
240  {
241  //
242  // Warning:
243  // do not skip over peptides here, since the results are iterated in the same way
244  //
245  String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence!
246  if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
247  { // do not quit here, to show the user all sequences .. only quit after loop
248  LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n";
249  has_illegal_AAs = true;
250  }
251  if (IL_equivalent_) // convert L to I;
252  {
253  seq.substitute('L', 'I');
254  }
255  appendValue(pep_DB, seq.c_str());
256  }
257  }
258  if (has_illegal_AAs)
259  {
260  LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!"
261  << "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
262  }
263 
264  LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;
265 
266  if (length(pep_DB) == 0)
267  { // Aho-Corasick will crash if given empty needles as input
268  LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
269  return PEPTIDE_IDS_EMPTY;
270  }
271 
272  /*
273  Aho Corasick (fast)
274  */
275  LOG_INFO << "Searching with up to " << aaa_max_ << " ambiguous amino acid(s) and " << mm_max_ << " mismatch(es)!" << std::endl;
277  LOG_INFO << "Building trie ...";
278  StopWatch s;
279  s.start();
281  AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
282  s.stop();
283  LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl;
284  s.reset();
285 
286  uint16_t count_j_proteins(0);
287  bool has_active_data = true; // becomes false if end of FASTA file is reached
288  const std::string jumpX(aaa_max_ + mm_max_ + 1, 'X'); // jump over stretches of 'X' which cost a lot of time; +1 because AXXA is a valid hit for aaa_max == 2 (cannot split it)
289  this->startProgress(0, proteins.size(), "Aho-Corasick");
290  std::atomic<int> progress_prots(0);
291 #ifdef _OPENMP
292 #pragma omp parallel
293 #endif
294  {
295  FoundProteinFunctor func_threads(enzyme);
296  Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index
297  AhoCorasickAmbiguous fuzzyAC;
298  String prot;
299 
300  while (true)
301  {
302  #pragma omp barrier // all threads need to be here, since we are about to swap protein data
303  #pragma omp single
304  {
305  DEBUG_ONLY std::cerr << " activating cache ...\n";
306  has_active_data = proteins.activateCache(); // swap in last cache
307  protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
308  } // implicit barrier here
309 
310  if (!has_active_data) break; // leave while-loop
311  SignedSize prot_count = (SignedSize)proteins.chunkSize();
312 
313  #pragma omp master
314  {
315  DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
316  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
317  protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
318  for (SignedSize i = 0; i < prot_count; ++i)
319  { // do this in master only, to avoid false sharing
320  const String& seq = proteins.chunkAt(i).identifier;
321  protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_));
322  }
323  DEBUG_ONLY std::cerr << " done" << std::endl;
324  }
325  DEBUG_ONLY std::cerr << " starting for loop \n";
326  // search all peptides in each protein
327  #pragma omp for schedule(dynamic, 100) nowait
328  for (SignedSize i = 0; i < prot_count; ++i)
329  {
330  ++progress_prots; // atomic
331  if (omp_get_thread_num() == 0)
332  {
333  this->setProgress(progress_prots);
334  }
335 
336  prot = proteins.chunkAt(i).sequence;
337  prot.remove('*');
338 
339  // check for invalid sequences with modifications
340  if (prot.has('[') || prot.has('('))
341  {
342  invalid_protein_sequence = true; // not omp-critical because its write-only
343  // we cannot throw an exception here, since we'd need to catch it within the parallel region
344  }
345 
346  // convert L/J to I; also replace 'J' in proteins
347  if (IL_equivalent_)
348  {
349  prot.substitute('L', 'I');
350  prot.substitute('J', 'I');
351  }
352  else
353  { // warn if 'J' is found (it eats into aaa_max)
354  if (prot.has('J'))
355  {
356  #pragma omp atomic
357  ++count_j_proteins;
358  }
359  }
360 
361  Size prot_idx = i + proteins.getChunkOffset();
362 
363  // test if protein was a hit
364  Size hits_total = func_threads.filter_passed + func_threads.filter_rejected;
365 
366  // check if there are stretches of 'X'
367  if (prot.has('X'))
368  {
369  // create chunks of the protein (splitting it at stretches of 'X..X') and feed them to AC one by one
370  size_t offset = -1, start = 0;
371  while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
372  {
373  //std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n";
374  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
375  // skip ahead while we encounter more X...
376  while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset;
377  start = offset;
378  //std::cout << " new start: " << start << "\n";
379  }
380  // last chunk
381  if (start < prot.size())
382  {
383  addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
384  }
385  }
386  else
387  {
388  addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
389  }
390  // was protein found?
391  if (hits_total < func_threads.filter_passed + func_threads.filter_rejected)
392  {
393  protein_accessions[prot_idx] = proteins.chunkAt(i).identifier;
394  acc_to_prot_thread[protein_accessions[prot_idx]] = prot_idx;
395  }
396  } // end parallel FOR
397 
398  // join results again
399  DEBUG_ONLY std::cerr << " critical now \n";
400  #ifdef _OPENMP
401  #pragma omp critical(PeptideIndexer_joinAC)
402  #endif
403  {
404  s.start();
405  // hits
406  func.merge(func_threads);
407  // accession -> index
408  acc_to_prot.insert(acc_to_prot_thread.begin(), acc_to_prot_thread.end());
409  acc_to_prot_thread.clear();
410  s.stop();
411  } // OMP end critical
412  } // end readChunk
413  } // OMP end parallel
414  this->endProgress();
415  std::cout << "Merge took: " << s.toString() << "\n";
416  mu.after();
417  std::cout << mu.delta("Aho-Corasick") << "\n\n";
418 
419  LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n";
420 
421  // write some stats
422  LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n"
423  << " ... rejected by enzyme filter: " << func.filter_rejected << std::endl;
424 
425  if (count_j_proteins)
426  {
427  LOG_WARN << "PeptideIndexer found " << count_j_proteins << " protein sequences in your database containing the amino acid 'J'."
428  << "To match 'J' in a protein, an ambiguous amino acid placeholder for I/L will be used.\n"
429  << "This costs runtime and eats into the 'aaa_max' limit, leaving less opportunity for B/Z/X matches.\n"
430  << "If you want 'J' to be treated as unambiguous, enable '-IL_equivalent'!" << std::endl;
431  }
432 
433  } // end local scope
434 
435  //
436  // do mapping
437  //
438  // index existing proteins
439  Map<String, Size> runid_to_runidx; // identifier to index
440  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
441  {
442  runid_to_runidx[prot_ids[run_idx].getIdentifier()] = run_idx;
443  }
444 
445  // for peptides --> proteins
446  Size stats_matched_unique(0);
447  Size stats_matched_multi(0);
448  Size stats_unmatched(0); // no match to DB
449  Size stats_count_m_t(0); // match to Target DB
450  Size stats_count_m_d(0); // match to Decoy DB
451  Size stats_count_m_td(0); // match to T+D DB
452 
453  Map<Size, std::set<Size> > runidx_to_protidx; // in which protID do appear which proteins (according to mapped peptides)
454 
455  Size pep_idx(0);
456  for (std::vector<PeptideIdentification>::iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
457  {
458  // which ProteinIdentification does the peptide belong to?
459  Size run_idx = runid_to_runidx[it1->getIdentifier()];
460 
461  std::vector<PeptideHit>& hits = it1->getHits();
462 
463  for (std::vector<PeptideHit>::iterator it2 = hits.begin(); it2 != hits.end(); ++it2)
464  {
465  // clear protein accessions
466  it2->setPeptideEvidences(std::vector<PeptideEvidence>());
467 
468  //
469  // is this a decoy hit?
470  //
471  bool matches_target(false);
472  bool matches_decoy(false);
473 
474  std::set<Size> prot_indices;
475  // add new protein references
476  for (std::set<PeptideProteinMatchInformation>::const_iterator it_i = func.pep_to_prot[pep_idx].begin();
477  it_i != func.pep_to_prot[pep_idx].end(); ++it_i)
478  {
479  prot_indices.insert(it_i->protein_index);
480  const String& accession = protein_accessions[it_i->protein_index];
481  PeptideEvidence pe(accession, it_i->position, it_i->position + (int)it2->getSequence().size() - 1, it_i->AABefore, it_i->AAAfter);
482  it2->addPeptideEvidence(pe);
483 
484  runidx_to_protidx[run_idx].insert(it_i->protein_index); // fill protein hits
485 
486  if (protein_is_decoy[it_i->protein_index])
487  {
488  matches_decoy = true;
489  }
490  else
491  {
492  matches_target = true;
493  }
494  }
495 
496  if (matches_decoy && matches_target)
497  {
498  it2->setMetaValue("target_decoy", "target+decoy");
499  ++stats_count_m_td;
500  }
501  else if (matches_target)
502  {
503  it2->setMetaValue("target_decoy", "target");
504  ++stats_count_m_t;
505  }
506  else if (matches_decoy)
507  {
508  it2->setMetaValue("target_decoy", "decoy");
509  ++stats_count_m_d;
510  } // else: could match to no protein (i.e. both are false)
511  //else ... // not required (handled below; see stats_unmatched);
512 
513  if (prot_indices.size() == 1)
514  {
515  it2->setMetaValue("protein_references", "unique");
516  ++stats_matched_unique;
517  }
518  else if (prot_indices.size() > 1)
519  {
520  it2->setMetaValue("protein_references", "non-unique");
521  ++stats_matched_multi;
522  }
523  else
524  {
525  it2->setMetaValue("protein_references", "unmatched");
526  ++stats_unmatched;
527  if (stats_unmatched < 15) LOG_INFO << "Unmatched peptide: " << it2->getSequence() << "\n";
528  else if (stats_unmatched == 15) LOG_INFO << "Unmatched peptide: ...\n";
529  }
530 
531  ++pep_idx; // next hit
532  }
533 
534  }
535 
536  Size total_peptides = stats_count_m_t + stats_count_m_d + stats_count_m_td + stats_unmatched;
537  LOG_INFO << "-----------------------------------\n";
538  LOG_INFO << "Peptide statistics\n";
539  LOG_INFO << "\n";
540  LOG_INFO << " unmatched : " << stats_unmatched << " (" << stats_unmatched * 100 / total_peptides << " %)\n";
541  LOG_INFO << " target/decoy:\n";
542  LOG_INFO << " match to target DB only: " << stats_count_m_t << " (" << stats_count_m_t * 100 / total_peptides << " %)\n";
543  LOG_INFO << " match to decoy DB only : " << stats_count_m_d << " (" << stats_count_m_d * 100 / total_peptides << " %)\n";
544  LOG_INFO << " match to both : " << stats_count_m_td << " (" << stats_count_m_td * 100 / total_peptides << " %)\n";
545  LOG_INFO << "\n";
546  LOG_INFO << " mapping to proteins:\n";
547  LOG_INFO << " no match (to 0 protein) : " << stats_unmatched << "\n";
548  LOG_INFO << " unique match (to 1 protein) : " << stats_matched_unique << "\n";
549  LOG_INFO << " non-unique match (to >1 protein): " << stats_matched_multi << std::endl;
550 
552  Size stats_matched_proteins(0), stats_matched_new_proteins(0), stats_orphaned_proteins(0), stats_proteins_target(0), stats_proteins_decoy(0);
553 
554  // all peptides contain the correct protein hit references, now update the protein hits
555  for (Size run_idx = 0; run_idx < prot_ids.size(); ++run_idx)
556  {
557  std::set<Size> masterset = runidx_to_protidx[run_idx]; // all protein matches from above
558 
559  std::vector<ProteinHit>& phits = prot_ids[run_idx].getHits();
560  {
561  // go through existing protein hits and count orphaned proteins (with no peptide hits)
562  std::vector<ProteinHit> orphaned_hits;
563  for (std::vector<ProteinHit>::iterator p_hit = phits.begin(); p_hit != phits.end(); ++p_hit)
564  {
565  const String& acc = p_hit->getAccession();
566  if (!acc_to_prot.has(acc)) // acc_to_prot only contains found proteins from current run
567  { // old hit is orphaned
568  ++stats_orphaned_proteins;
569  if (keep_unreferenced_proteins_)
570  {
571  p_hit->setMetaValue("target_decoy", "");
572  orphaned_hits.push_back(*p_hit);
573  }
574  }
575  }
576  // only keep orphaned hits (if any)
577  phits = orphaned_hits;
578  }
579 
580  // add new protein hits
582  phits.reserve(phits.size() + masterset.size());
583  for (std::set<Size>::const_iterator it = masterset.begin(); it != masterset.end(); ++it)
584  {
585  ProteinHit hit;
586  hit.setAccession(protein_accessions[*it]);
587 
588  if (write_protein_sequence_ || write_protein_description_)
589  {
590  proteins.readAt(fe, *it);
591  if (write_protein_sequence_)
592  {
593  hit.setSequence(fe.sequence);
594  } // no else, since sequence is empty by default
595  if (write_protein_description_)
596  {
597  hit.setDescription(fe.description);
598  } // no else, since description is empty by default
599  }
600  if (protein_is_decoy[*it])
601  {
602  hit.setMetaValue("target_decoy", "decoy");
603  ++stats_proteins_decoy;
604  }
605  else
606  {
607  hit.setMetaValue("target_decoy", "target");
608  ++stats_proteins_target;
609  }
610  phits.push_back(hit);
611  ++stats_matched_new_proteins;
612  }
613  stats_matched_proteins += phits.size();
614  }
615 
616 
617  LOG_INFO << "-----------------------------------\n";
618  LOG_INFO << "Protein statistics\n";
619  LOG_INFO << "\n";
620  LOG_INFO << " total proteins searched: " << proteins.size() << "\n";
621  LOG_INFO << " matched proteins : " << stats_matched_proteins << " (" << stats_matched_new_proteins << " new)\n";
622  if (stats_matched_proteins)
623  { // prevent Division-by-0 Exception
624  LOG_INFO << " matched target proteins: " << stats_proteins_target << " (" << stats_proteins_target * 100 / stats_matched_proteins << " %)\n";
625  LOG_INFO << " matched decoy proteins : " << stats_proteins_decoy << " (" << stats_proteins_decoy * 100 / stats_matched_proteins << " %)\n";
626  }
627  LOG_INFO << " orphaned proteins : " << stats_orphaned_proteins << (keep_unreferenced_proteins_ ? " (all kept)" : " (all removed)\n");
628  LOG_INFO << "-----------------------------------" << std::endl;
629 
630 
632  bool has_error = false;
633 
634  if (invalid_protein_sequence)
635  {
636  LOG_ERROR << "Error: One or more protein sequences contained the characters '[' or '(', which are illegal in protein sequences."
637  << "\nPeptide hits might be masked by these characters (which usually indicate presence of modifications).\n";
638  has_error = true;
639  }
640 
641  if ((stats_count_m_d + stats_count_m_td) == 0)
642  {
643  String msg("No peptides were matched to the decoy portion of the database! Did you provide the correct concatenated database? Are your 'decoy_string' (=" + String(decoy_string_) + ") and 'decoy_string_position' (=" + String(param_.getValue("decoy_string_position")) + ") settings correct?");
644  if (missing_decoy_action_ == "error")
645  {
646  LOG_ERROR << "Error: " << msg << "\nSet 'missing_decoy_action' to 'warn' if you are sure this is ok!\nAborting ..." << std::endl;
647  has_error = true;
648  }
649  else if (missing_decoy_action_ == "warn")
650  {
651  LOG_WARN << "Warn: " << msg << "\nSet 'missing_decoy_action' to 'error' if you want to elevate this to an error!" << std::endl;
652  }
653  else // silent
654  {
655  }
656  }
657 
658  if ((!allow_unmatched_) && (stats_unmatched > 0))
659  {
660  LOG_ERROR << "PeptideIndexer found unmatched peptides, which could not be associated to a protein.\n"
661  << "Potential solutions:\n"
662  << " - check your FASTA database for completeness\n"
663  << " - set 'enzyme:specificity' to match the identification parameters of the search engine\n"
664  << " - some engines (e.g. X! Tandem) employ loose cutting rules generating non-tryptic peptides;\n"
665  << " if you trust them, disable enzyme specificity\n"
666  << " - increase 'aaa_max' to allow more ambiguous amino acids\n"
667  << " - as a last resort: use the 'allow_unmatched' option to accept unmatched peptides\n"
668  << " (note that unmatched peptides cannot be used for FDR calculation or quantification)\n";
669  has_error = true;
670  }
671 
672  if (has_error)
673  {
674  LOG_ERROR << "Result files will be written, but PeptideIndexer will exit with an error code." << std::endl;
675  return UNEXPECTED_RESULT;
676  }
677  return EXECUTION_OK;
678  }
679 
680 protected:
682  {
685 
688 
690  char AABefore;
691 
693  char AAAfter;
694 
695  bool operator<(const PeptideProteinMatchInformation& other) const
696  {
697  if (protein_index != other.protein_index)
698  {
699  return protein_index < other.protein_index;
700  }
701  else if (position != other.position)
702  {
703  return position < other.position;
704  }
705  else if (AABefore != other.AABefore)
706  {
707  return AABefore < other.AABefore;
708  }
709  else if (AAAfter != other.AAAfter)
710  {
711  return AAAfter < other.AAAfter;
712  }
713  return false;
714  }
715 
717  {
718  return protein_index == other.protein_index &&
719  position == other.position &&
720  AABefore == other.AABefore &&
721  AAAfter == other.AAAfter;
722  }
723 
724  };
726  {
727  public:
728  typedef std::map<OpenMS::Size, std::set<PeptideProteinMatchInformation> > MapType;
729 
732 
735 
738 
739  private:
741 
742  public:
743  explicit FoundProteinFunctor(const ProteaseDigestion& enzyme) :
744  pep_to_prot(), filter_passed(0), filter_rejected(0), enzyme_(enzyme)
745  {
746  }
747 
749  {
750  if (pep_to_prot.empty())
751  { // first merge is easy
752  pep_to_prot.swap(other.pep_to_prot);
753  }
754  else
755  {
756  for (FoundProteinFunctor::MapType::const_iterator it = other.pep_to_prot.begin(); it != other.pep_to_prot.end(); ++it)
757  { // augment set
758  this->pep_to_prot[it->first].insert(other.pep_to_prot[it->first].begin(), other.pep_to_prot[it->first].end());
759  }
760  other.pep_to_prot.clear();
761  }
762  // cheap members
763  this->filter_passed += other.filter_passed;
764  other.filter_passed = 0;
765  this->filter_rejected += other.filter_rejected;
766  other.filter_rejected = 0;
767  }
768 
769  void addHit(const OpenMS::Size idx_pep,
770  const OpenMS::Size idx_prot,
771  const OpenMS::Size len_pep,
772  const OpenMS::String& seq_prot,
774  {
775  if (enzyme_.isValidProduct(seq_prot, position, len_pep, true, true))
776  {
778  match.protein_index = idx_prot;
779  match.position = position;
780  match.AABefore = (position == 0) ? PeptideEvidence::N_TERMINAL_AA : seq_prot[position - 1];
781  match.AAAfter = (position + len_pep >= seq_prot.size()) ? PeptideEvidence::C_TERMINAL_AA : seq_prot[position + len_pep];
782  pep_to_prot[idx_pep].insert(match);
783  ++filter_passed;
784  }
785  else
786  {
787  //std::cerr << "REJECTED Peptide " << seq_pep << " with hit to protein "
788  // << seq_prot << " at position " << position << std::endl;
789  ++filter_rejected;
790  }
791  }
792 
793  };
794 
795  inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
796  {
797  fuzzyAC.setProtein(prot);
798  while (fuzzyAC.findNext(pattern))
799  {
800  const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()];
801  func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset);
802  }
803 
804  }
805 
806  void updateMembers_() override;
807 
809  bool prefix_;
813 
819 
822 
823  };
824 }
825 
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
FoundProteinFunctor(const ProteaseDigestion &enzyme)
Definition: PeptideIndexing.h:743
bool has(Byte byte) const
true if String contains the byte, false otherwise
char AABefore
the amino acid after the peptide in the protein
Definition: PeptideIndexing.h:690
A more convenient string class.
Definition: String.h:57
void after()
record data for the second timepoint
ExitCodes run(FASTAContainer< T > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
Re-index peptide identifications honoring enzyme cutting rules, ambiguous amino acids and target/deco...
Definition: PeptideIndexing.h:183
#define LOG_INFO
Macro if a information, e.g. a status should be reported.
Definition: LogStream.h:454
OpenMS::Int position
the position of the peptide in the protein
Definition: PeptideIndexing.h:687
void setSequence(const String &sequence)
sets the protein sequence
Size< TNeedle >::Type position(const PatternAuxData< TNeedle > &dh)
Definition: AhoCorasickAmbiguous.h:561
bool operator==(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:716
StopWatch Class.
Definition: StopWatch.h:59
bool operator<(const PeptideProteinMatchInformation &other) const
Definition: PeptideIndexing.h:695
OpenMS::Size protein_index
index of the protein the peptide is contained in
Definition: PeptideIndexing.h:684
void setProtein(const String &protein_sequence)
Reset to new protein sequence. All previous data is forgotten.
Definition: AhoCorasickAmbiguous.h:1024
A convenience class to report either absolute or delta (between two timepoints) RAM usage...
Definition: SysInfo.h:83
::seqan::Pattern< PeptideDB, ::seqan::FuzzyAC > FuzzyACPattern
Definition: AhoCorasickAmbiguous.h:974
Definition: PeptideIndexing.h:725
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
Refreshes the protein references for all peptide hits in a vector of PeptideIdentifications and adds ...
Definition: PeptideIndexing.h:118
bool keep_unreferenced_proteins_
Definition: PeptideIndexing.h:816
String decoy_string_
Definition: PeptideIndexing.h:808
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
#define DEBUG_ONLY
Definition: AhoCorasickAmbiguous.h:46
Definition: PeptideIndexing.h:127
String missing_decoy_action_
Definition: PeptideIndexing.h:810
void setSpecificity(Specificity spec)
Sets the specificity for the digestion (default is SPEC_FULL).
String toString() const
get a compact representation of the current time status.
String & remove(char what)
Remove all occurrences of the character what.
ExitCodes
Exit codes.
Definition: PeptideIndexing.h:124
bool prefix_
Definition: PeptideIndexing.h:809
String enzyme_name_
Definition: PeptideIndexing.h:811
#define LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:446
Size getHitDBIndex()
Get index of hit into peptide database of the pattern.
Definition: AhoCorasickAmbiguous.h:1047
String substr(size_t pos=0, size_t n=npos) const
Wrapper for the STL substr() method. Returns a String object with its contents initialized to a subst...
#define LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged...
Definition: LogStream.h:450
std::map< OpenMS::Size, std::set< PeptideProteinMatchInformation > > MapType
Definition: PeptideIndexing.h:728
bool findNext(const FuzzyACPattern &pattern)
Enumerate hits.
Definition: AhoCorasickAmbiguous.h:1037
Int aaa_max_
Definition: PeptideIndexing.h:820
Int getHitProteinPosition()
Offset into protein sequence where hit was found.
Definition: AhoCorasickAmbiguous.h:1057
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:76
Extended Aho-Corasick algorithm capable of matching ambiguous amino acids in the pattern (i...
Definition: AhoCorasickAmbiguous.h:970
void merge(FoundProteinFunctor &other)
Definition: PeptideIndexing.h:748
static const char N_TERMINAL_AA
Definition: PeptideEvidence.h:60
bool write_protein_sequence_
Definition: PeptideIndexing.h:814
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
Definition: PeptideIndexing.h:128
bool isAmbiguous(AAcid c)
Definition: AhoCorasickAmbiguous.h:578
Definition: PeptideIndexing.h:126
MapType pep_to_prot
peptide index –> protein indices
Definition: PeptideIndexing.h:731
Definition: PeptideIndexing.h:130
void setDescription(const String &description)
sets the description of the protein
bool IL_equivalent_
Definition: PeptideIndexing.h:818
Representation of a peptide evidence.
Definition: PeptideEvidence.h:50
bool has(const Key &key) const
Test whether the map contains the given key.
Definition: Map.h:108
static void initPattern(const PeptideDB &pep_db, const int aaa_max, const int mm_max, FuzzyACPattern &pattern)
Construct a trie from a set of peptide sequences (which are to be found in a protein).
Definition: AhoCorasickAmbiguous.h:991
void setEnzyme(const String &name)
Sets the enzyme for the digestion (by name)
void setAccession(const String &accession)
sets the accession of the protein
Representation of a protein hit.
Definition: ProteinHit.h:53
String enzyme_specificity_
Definition: PeptideIndexing.h:812
static Specificity getSpecificityByName(const String &name)
String delta(const String &event="delta")
bool hasPrefix(const String &string) const
true if String begins with string, false otherwise
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
String & substitute(char from, char to)
Replaces all occurrences of the character from by the character to.
OpenMS::Size filter_rejected
number of rejected hits (not passing addHit())
Definition: PeptideIndexing.h:737
bool write_protein_description_
Definition: PeptideIndexing.h:815
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:54
Int mm_max_
Definition: PeptideIndexing.h:821
String sequence
Definition: FASTAFile.h:80
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:91
bool allow_unmatched_
Definition: PeptideIndexing.h:817
static const char C_TERMINAL_AA
Definition: PeptideEvidence.h:61
String< AAcid, Alloc< void > > AAString
Definition: AhoCorasickAmbiguous.h:206
int Int
Signed integer type.
Definition: Types.h:102
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for n-term protein cleavage and random D...
char AAAfter
the amino acid before the peptide in the protein
Definition: PeptideIndexing.h:693
Map class based on the STL map (containing several convenience functions)
Definition: Map.h:50
FASTAContainer<TFI_Vector> simply takes an existing vector of FASTAEntries and provides the same inte...
Definition: FASTAContainer.h:226
::seqan::StringSet<::seqan::AAString > PeptideDB
Definition: AhoCorasickAmbiguous.h:973
String description
Definition: FASTAFile.h:79
bool hasSuffix(const String &string) const
true if String ends with string, false otherwise
OpenMS::Size filter_passed
number of accepted hits (passing addHit() constraints)
Definition: PeptideIndexing.h:734
ExitCodes run(std::vector< FASTAFile::FASTAEntry > &proteins, std::vector< ProteinIdentification > &prot_ids, std::vector< PeptideIdentification > &pep_ids)
forward for old interface and pyOpenMS; use run<T>() for more control
Definition: PeptideIndexing.h:141
void addHit(const OpenMS::Size idx_pep, const OpenMS::Size idx_prot, const OpenMS::Size len_pep, const OpenMS::String &seq_prot, OpenMS::Int position)
Definition: PeptideIndexing.h:769
double getClockTime() const
ProteaseDigestion enzyme_
Definition: PeptideIndexing.h:740
void addHits_(AhoCorasickAmbiguous &fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern &pattern, const AhoCorasickAmbiguous::PeptideDB &pep_DB, const String &prot, const String &full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor &func_threads) const
Definition: PeptideIndexing.h:795

OpenMS / TOPP release 2.3.0 Documentation generated on Wed Apr 18 2018 19:29:07 using doxygen 1.8.14