OpenMS
Loading...
Searching...
No Matches
ProSEAlgorithm.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: $
6// $Authors: Raphael Förster $
7// --------------------------------------------------------------------------
8
9#pragma once
10
13
22
23#include <algorithm> // std::min (used by inline computeModMatchTolerance_)
24#include <vector>
25
26namespace OpenMS
27{
28
43class OPENMS_DLLAPI ProSEAlgorithm :
45 public ProgressLogger
46{
47 public:
49
// Exit codes. Returned by the search() overloads declared in this class and
// stored in the exit_code field of the search result struct.
51 enum class ExitCodes
52 {
53 EXECUTION_OK, // value the exit_code field is default-initialised to
54 INPUT_FILE_EMPTY, // presumably: the input yielded no data to search -- confirm in the implementation
55 UNEXPECTED_RESULT, // NOTE(review): exact trigger not visible in this header
56 UNKNOWN_ERROR, // NOTE(review): exact trigger not visible in this header
57 ILLEGAL_PARAMETERS // presumably: parameter validation failed -- confirm in the implementation
58 };
59
69 {
70 ExitCodes exit_code = ExitCodes::EXECUTION_OK;
71 std::vector<ProteinIdentification> protein_ids;
74 bool is_open_search = false;
75 };
76
102 {
103 std::vector<SearchResult> per_file;
105 };
106
116 {
117 std::vector<FASTAFile::FASTAEntry> db;
119 };
120
// Search spectra in a spectrum file (mzML or Bruker .d) against a protein database.
//   in_spectra  spectrum input (presumably a file path -- confirm)
//   in_db       protein database input (presumably a FASTA file path -- confirm)
//   prot_ids    non-const reference; presumably receives the protein identifications -- confirm
//   pep_ids     non-const reference; presumably receives the peptide identifications -- confirm
// Returns an ExitCodes value. Declared const.
142 ExitCodes search(const String& in_spectra,
143 const String& in_db,
144 std::vector<ProteinIdentification>& prot_ids,
145 PeptideIdentificationList& pep_ids) const;
146
193 const String& in_db,
194 const String& output_base_name = "") const;
195
214 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
215 std::vector<ProteinIdentification>& prot_ids,
216 PeptideIdentificationList& pep_ids) const;
217
// Build a SearchContext (decoy-augmented database + FragmentIndex) for reuse.
// The result is returned by value and can be passed to the search() overload
// that takes a SearchContext&.
//   fasta_db  in-memory protein database entries the context is built from
236 SearchContext prepareContext(const std::vector<FASTAFile::FASTAEntry>& fasta_db) const;
237
259 SearchContext& ctx,
260 std::vector<ProteinIdentification>& prot_ids,
261 PeptideIdentificationList& pep_ids) const;
262
274 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
275 const String& output_base_name = "") const;
276
304 const std::vector<String>& in_spectra_files,
305 const std::vector<FASTAFile::FASTAEntry>& fasta_db,
306 const std::vector<String>& output_base_names = {},
307 const String& aggregate_base_name = "") const;
308
320 const std::vector<String>& in_spectra_files,
321 const String& in_db,
322 const std::vector<String>& output_base_names = {},
323 const String& aggregate_base_name = "") const;
324
325 protected:
// Override of the base-class hook used to update extra member variables
// at the end of the setParameters() method.
326 void updateMembers_() override;
327
330 {
332 /*
333 StringView sequence;
334 SignedSize peptide_mod_index; ///< enumeration index of the non-RNA peptide modification
335 */
336 // Layout: doubles first, then floats, then int, then uint16_t — minimizes padding (40 bytes excluding AASequence)
337 double score = 0;
338 double delta_mass = 0.0;
339 float prefix_fraction = 0;
340 float suffix_fraction = 0;
341 float mean_error = 0.0f;
342 int isotope_error = 0;
343 uint16_t applied_charge = 0;
344 uint16_t matched_b_ions = 0;
345 uint16_t matched_y_ions = 0;
346
// Ordering predicate for hits: returns true when `a` should rank before `b`.
// A strictly higher score ranks first; hits with equal scores are ordered by
// comparing their `sequence` members with operator<.
347 static bool hasBetterScore(const AnnotatedHit_& a, const AnnotatedHit_& b)
348 {
349 if (a.score != b.score) return a.score > b.score; // different scores: larger score wins
350 return a.sequence < b.sequence; // equal scores: fall back to sequence order as tie-breaker
351 }
352 };
353
// Filter, deisotope, and decharge spectra.
//   exp                               spectra to process; taken by non-const reference (presumably modified in place -- confirm)
//   fragment_mass_tolerance           fragment mass tolerance value
//   fragment_mass_tolerance_unit_ppm  whether the tolerance is given in ppm (bool here, unlike the String of the same name in postProcessHits_)
355 static void preprocessSpectra_(PeakMap& exp, double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm);
356
// Filter and annotate search results.
//   exp             the searched spectra (read-only)
//   annotated_hits  candidate hits as a vector of vectors (presumably one inner vector per spectrum -- confirm); non-const
//   protein_ids     non-const reference; presumably filled with the protein-level output -- confirm
//   peptide_ids     non-const reference; presumably filled with the peptide-level output -- confirm
//   top_hits        presumably the number of best hits kept per spectrum -- confirm
// The remaining arguments pass search settings by value or const reference
// (presumably recorded as search metadata -- confirm).
// NOTE(review): precursor_mass_tolerance_unit_ppm and fragment_mass_tolerance_unit_ppm
// are Strings here, although the "_ppm" suffix reads like a boolean flag.
381 void postProcessHits_(const PeakMap& exp,
382 std::vector<std::vector<ProSEAlgorithm::AnnotatedHit_> >& annotated_hits,
383 std::vector<ProteinIdentification>& protein_ids,
384 PeptideIdentificationList& peptide_ids,
385 Size top_hits,
386 const StringList& modifications_fixed,
387 const StringList& modifications_variable,
388 Int peptide_missed_cleavages,
389 double precursor_mass_tolerance,
390 double fragment_mass_tolerance,
391 const String& precursor_mass_tolerance_unit_ppm,
392 const String& fragment_mass_tolerance_unit_ppm,
393 const Int precursor_min_charge,
394 const Int precursor_max_charge,
395 const String& enzyme,
396 const String& database_name) const;
397
// Precursor mass tolerance window (lower and upper bound, both defaulting to 20.0)
// and its unit string (defaulting to "ppm").
// The two bounds are `mutable`, so const member functions are able to change them
// (presumably used by the calibration pass -- confirm in the implementation).
401 mutable double precursor_mass_tolerance_lower_{20.0};
402 mutable double precursor_mass_tolerance_upper_{20.0};
403 String precursor_mass_tolerance_unit_{"ppm"}; // compared against "ppm" in isOpenSearchMode_()
404
407
409
411
413
415
417
419
421
424
425 double fdr_psm_{0.0};
426 double fdr_protein_{0.0};
427
429
433 EnzymaticDigestion::Specificity peptide_enzyme_specificity_{EnzymaticDigestion::SPEC_FULL};
434
436
438
439 bool calibration_enabled_{false};
440 double calibration_subset_ratio_{0.1};
441 Size calibration_min_psms_{50};
442
451 {
452 double precursor_shift{0};
453 double precursor_spread{0};
454 double cal_lower{0};
455 double cal_upper{0};
456 double fragment_tolerance{0};
457 double fragment_shift{0};
458 bool extreme_bias{false};
459 bool success{false};
460 };
461
466
474 mutable double last_mod_match_tolerance_used_{-1.0};
475
487 {
488 if (precursor_mass_tolerance_lower_ <= 0.0) return precursor_mass_tolerance_upper_;
489 if (precursor_mass_tolerance_upper_ <= 0.0) return precursor_mass_tolerance_lower_;
490 return std::min(precursor_mass_tolerance_lower_, precursor_mass_tolerance_upper_);
491 }
492
506 FragmentIndex& fragment_index,
507 const std::vector<FASTAFile::FASTAEntry>& db) const;
508
511 const String& output_base_name) const;
512
// Helper: log search summary statistics and per-run tolerance estimation.
// All three arguments are taken by const reference; the function is declared const.
514 void logSearchDiagnostics_(const PeakMap& spectra,
515 const std::vector<ProteinIdentification>& protein_ids,
516 const PeptideIdentificationList& peptide_ids) const;
517
// Helper to determine whether open search should be used, based on the precursor tolerance.
// Delegates the decision to FragmentIndex::isOpenSearchMode and returns its result.
519 bool isOpenSearchMode_() const
520 {
521 return FragmentIndex::isOpenSearchMode(precursor_mass_tolerance_lower_, // lower tolerance bound
522 precursor_mass_tolerance_upper_, // upper tolerance bound
523 precursor_mass_tolerance_unit_ == "ppm"); // true when the unit string is exactly "ppm"
524 }
525};
526
527} // namespace
Representation of a peptide/protein sequence.
Definition AASequence.h:88
A base class for all classes handling default parameters.
Definition DefaultParamHandler.h:66
Specificity
when querying for valid digestion products, this determines if the specificity of the two peptide end...
Definition EnzymaticDigestion.h:42
Generates from a set of Fasta files a 2D-datastructure which stores all theoretical masses of all b a...
Definition FragmentIndex.h:35
In-Memory representation of a mass spectrometry run.
Definition MSExperiment.h:49
Combined result of open search modification analysis.
Definition OpenSearchModificationAnalysis.h:104
Container for peptide identifications from multiple spectra.
Definition PeptideIdentificationList.h:66
Fragment-index-based peptide database search algorithm (experimental).
Definition ProSEAlgorithm.h:46
ExitCodes search(PeakMap &spectra, const std::vector< FASTAFile::FASTAEntry > &fasta_db, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
In-memory search: search spectra against a protein database without file I/O.
static void preprocessSpectra_(PeakMap &exp, double fragment_mass_tolerance, bool fragment_mass_tolerance_unit_ppm)
Filter, deisotope, and decharge spectra.
FragmentIndex fragment_index
Definition ProSEAlgorithm.h:118
void logSearchDiagnostics_(const PeakMap &spectra, const std::vector< ProteinIdentification > &protein_ids, const PeptideIdentificationList &peptide_ids) const
Helper: log search summary statistics and per-run tolerance estimation.
SearchResult searchWithModificationAnalysis(PeakMap &spectra, const std::vector< FASTAFile::FASTAEntry > &fasta_db, const String &output_base_name="") const
In-memory search with modification analysis: no file I/O required.
Size peptide_max_size_
Definition ProSEAlgorithm.h:431
Size precursor_max_charge_
Definition ProSEAlgorithm.h:406
Size precursor_min_charge_
Definition ProSEAlgorithm.h:405
Size report_top_hits_
Definition ProSEAlgorithm.h:437
Size modifications_max_variable_mods_per_peptide_
Definition ProSEAlgorithm.h:418
StringList modifications_fixed_
Definition ProSEAlgorithm.h:414
CalibrationResult_ runCalibrationPass_(PeakMap &spectra, FragmentIndex &fragment_index, const std::vector< FASTAFile::FASTAEntry > &db) const
Run a fast calibration pass on a subset of spectra to estimate mass accuracy.
SearchResult searchWithModificationAnalysis(const String &in_spectra, const String &in_db, const String &output_base_name="") const
Search with comprehensive results including modification analysis tables.
String enzyme_
Definition ProSEAlgorithm.h:420
std::vector< ProteinIdentification > protein_ids
Definition ProSEAlgorithm.h:71
MultiFileSearchResult searchWithModificationAnalysis(const std::vector< String > &in_spectra_files, const String &in_db, const std::vector< String > &output_base_names={}, const String &aggregate_base_name="") const
Multi-file search with modification analysis (FASTA file path).
Size peptide_min_size_
Definition ProSEAlgorithm.h:430
String fragment_mass_tolerance_unit_
Definition ProSEAlgorithm.h:412
SearchResult aggregate
Definition ProSEAlgorithm.h:104
IntList precursor_isotopes_
Definition ProSEAlgorithm.h:408
bool decoys_
Definition ProSEAlgorithm.h:422
StringList annotate_psm_
Definition ProSEAlgorithm.h:428
OpenSearchModificationAnalysis::OpenSearchAnalysisResult modification_analysis
Definition ProSEAlgorithm.h:73
std::vector< FASTAFile::FASTAEntry > db
Definition ProSEAlgorithm.h:117
ExitCodes search(PeakMap &spectra, SearchContext &ctx, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
In-memory search using a pre-built SearchContext.
bool isOpenSearchMode_() const
Helper function to determine if open search should be used based on tolerance.
Definition ProSEAlgorithm.h:519
String decoy_prefix_
Definition ProSEAlgorithm.h:423
PeptideIdentificationList peptide_ids
Definition ProSEAlgorithm.h:72
void updateMembers_() override
This method is used to update extra member variables at the end of the setParameters() method.
ExitCodes search(const String &in_spectra, const String &in_db, std::vector< ProteinIdentification > &prot_ids, PeptideIdentificationList &pep_ids) const
Search spectra in a spectrum file (mzML or Bruker .d) against a protein database using an FI-backed w...
ExitCodes
Exit codes.
Definition ProSEAlgorithm.h:52
StringList modifications_variable_
Definition ProSEAlgorithm.h:416
std::vector< SearchResult > per_file
Definition ProSEAlgorithm.h:103
double fragment_mass_tolerance_
Definition ProSEAlgorithm.h:410
SearchContext prepareContext(const std::vector< FASTAFile::FASTAEntry > &fasta_db) const
Build a SearchContext (decoy-augmented database + FragmentIndex) for reuse.
void logModificationAnalysisSummary_(const SearchResult &result, const String &output_base_name) const
Helper: log the modification analysis summary (shared by in-memory and file-based paths)
String peptide_motif_
Definition ProSEAlgorithm.h:435
void postProcessHits_(const PeakMap &exp, std::vector< std::vector< ProSEAlgorithm::AnnotatedHit_ > > &annotated_hits, std::vector< ProteinIdentification > &protein_ids, PeptideIdentificationList &peptide_ids, Size top_hits, const StringList &modifications_fixed, const StringList &modifications_variable, Int peptide_missed_cleavages, double precursor_mass_tolerance, double fragment_mass_tolerance, const String &precursor_mass_tolerance_unit_ppm, const String &fragment_mass_tolerance_unit_ppm, const Int precursor_min_charge, const Int precursor_max_charge, const String &enzyme, const String &database_name) const
Filter and annotate search results.
Size peptide_missed_cleavages_
Definition ProSEAlgorithm.h:432
CalibrationResult_ last_calibration_result_
Definition ProSEAlgorithm.h:465
double computeModMatchTolerance_() const
Definition ProSEAlgorithm.h:486
MultiFileSearchResult searchWithModificationAnalysis(const std::vector< String > &in_spectra_files, const std::vector< FASTAFile::FASTAEntry > &fasta_db, const std::vector< String > &output_base_names={}, const String &aggregate_base_name="") const
Multi-file search with modification analysis (in-memory FASTA).
Result of a calibration pass.
Definition ProSEAlgorithm.h:451
Multi-file search result bundle.
Definition ProSEAlgorithm.h:102
Prepared per-database state shared across multiple spectrum files.
Definition ProSEAlgorithm.h:116
Comprehensive search result including modification analysis.
Definition ProSEAlgorithm.h:69
Base class for all classes that want to report their progress.
Definition ProgressLogger.h:27
A more convenient string class.
Definition String.h:32
int Int
Signed integer type.
Definition Types.h:72
size_t Size
Size type, e.g. used for a variable that holds the result of size().
Definition Types.h:97
std::vector< Int > IntList
Vector of signed integers.
Definition ListUtils.h:29
std::vector< String > StringList
Vector of String.
Definition ListUtils.h:44
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Slimmer structure as storing all scored candidates in PeptideHit objects takes too much space.
Definition ProSEAlgorithm.h:330
static bool hasBetterScore(const AnnotatedHit_ &a, const AnnotatedHit_ &b)
Definition ProSEAlgorithm.h:347
double score
main score
Definition ProSEAlgorithm.h:337
AASequence sequence
Definition ProSEAlgorithm.h:331