OpenMS
Loading...
Searching...
No Matches
DBSuitability.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Tom Waschischeck $
6// $Authors: Tom Waschischeck $
7// --------------------------------------------------------------------------
8
9#pragma once
10
16
17#include <cfloat>
18#include <vector>
19
20#include <boost/regex.hpp>
21
22namespace OpenMS
23{
24 class ParamXMLFile;
25 class PeptideIdentification;
26 class PeptideHit;
27 class MSExperiment;
28
46 class OPENMS_DLLAPI DBSuitability:
48 {
49 public:
51 struct OPENMS_DLLAPI SuitabilityData
52 {
54 Size num_top_novo = 0;
55
57 Size num_top_db = 0;
58
60 Size num_interest = 0;
61
64 Size num_re_ranked = 0;
65
68 double cut_off = DBL_MAX;
69
79 double suitability = 0;
80
83 double suitability_no_rerank = 0;
84
86 double suitability_corr_no_rerank = 0;
87
88 // resets all members to their defaults
89 void clear();
90
93 void setCorrectionFactor(double factor);
94
95 double getCorrectionFactor() const;
96
97 double getCorrectedNovoHits() const;
98
100
110
111 private:
117 double corr_factor = -1;
118
120 double num_top_novo_corr = 0;
121
127 double suitability_corr = 0;
128 };
129
134
136 ~DBSuitability() override = default;
137
140
204 void compute(PeptideIdentificationList&& pep_ids, const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& original_fasta, const std::vector<FASTAFile::FASTAEntry>& novo_fasta, const ProteinIdentification::SearchParameters& search_params);
205
215 const std::vector<SuitabilityData>& getResults() const;
216
217 private:
219 std::vector<SuitabilityData> results_;
220
222 const boost::regex decoy_pattern_;
223
238 double getDecoyDiff_(const PeptideIdentification& pep_id) const;
239
254 double getDecoyCutOff_(const PeptideIdentificationList& pep_ids, double reranking_cutoff_percentile) const;
255
269 bool isNovoHit_(const PeptideHit& hit) const;
270
279 bool checkScoreBetterThanThreshold_(const PeptideHit& hit, double threshold, bool higher_score_better) const;
280
292
300 void writeIniFile_(const Param& parameters, const String& filename) const;
301
324 PeptideIdentificationList runIdentificationSearch_(const MSExperiment& exp, const std::vector<FASTAFile::FASTAEntry>& fasta_data, const String& adapter_name, Param& parameters) const;
325
336 std::vector<FASTAFile::FASTAEntry> getSubsampledFasta_(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate) const;
337
355
365 void appendDecoys_(std::vector<FASTAFile::FASTAEntry>& fasta) const;
366
374 double extractScore_(const PeptideHit& pep_hit) const;
375
389 double calculateCorrectionFactor_(const SuitabilityData& data, const SuitabilityData& data_sampled, double sampling_rate) const;
390
399 UInt numberOfUniqueProteins_(const PeptideIdentificationList& peps, UInt number_of_hits = 1) const;
400
409 Size getIndexWithMedianNovoHits_(const std::vector<SuitabilityData>& data) const;
410
427 double getScoreMatchingFDR_(const PeptideIdentificationList& pep_ids, double FDR, const String& score_name, bool higher_score_better) const;
428 };
429
430 // friend class to test private member functions
432 {
433 public:
435
437
438 std::vector<FASTAFile::FASTAEntry> getSubsampledFasta(const std::vector<FASTAFile::FASTAEntry>& fasta_data, double subsampling_rate)
439 {
440 return suit_.getSubsampledFasta_(fasta_data, subsampling_rate);
441 }
442
443 void appendDecoys(std::vector<FASTAFile::FASTAEntry>& fasta)
444 {
445 suit_.appendDecoys_(fasta);
446 }
447
448 double calculateCorrectionFactor(const DBSuitability::SuitabilityData& data, const DBSuitability::SuitabilityData& data_sampled, double sampling_rate)
449 {
450 return suit_.calculateCorrectionFactor_(data, data_sampled, sampling_rate);
451 }
452
454 {
455 return suit_.numberOfUniqueProteins_(peps, number_of_hits);
456 }
457
458 Size getIndexWithMedianNovoHits(const std::vector<DBSuitability::SuitabilityData>& data)
459 {
461 }
462
463 double getScoreMatchingFDR(const PeptideIdentificationList& pep_ids, double FDR, String score_name, bool higher_score_better)
464 {
465 return suit_.getScoreMatchingFDR_(pep_ids, FDR, score_name, higher_score_better);
466 }
467
468 /* Not tested:
469 getDecoyDiff_, getDecoyCutOff_, isNovoHit_, checkScoreBetterThanThreshold_
470 Reason: These functions are essential to the normal suitability calculation; if any of them did not work, the test for 'compute' would fail.
471
472 extractSearchAdapterInfoFromMetaValues_, writeIniFile_, extractScore_
473 Reason: These functions are very straightforward.
474
475 runIdentificationSearch_
476 Reason: This function simulates a whole workflow and testing it would be too complicated.
477 */
478
479 private:
481 };
482}
483
Definition DBSuitability.h:432
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate)
Definition DBSuitability.h:438
double calculateCorrectionFactor(const DBSuitability::SuitabilityData &data, const DBSuitability::SuitabilityData &data_sampled, double sampling_rate)
Definition DBSuitability.h:448
DBSuitability suit_
Definition DBSuitability.h:480
UInt numberOfUniqueProteins(const PeptideIdentificationList &peps, UInt number_of_hits=1)
Definition DBSuitability.h:453
Size getIndexWithMedianNovoHits(const std::vector< DBSuitability::SuitabilityData > &data)
Definition DBSuitability.h:458
double getScoreMatchingFDR(const PeptideIdentificationList &pep_ids, double FDR, String score_name, bool higher_score_better)
Definition DBSuitability.h:463
void appendDecoys(std::vector< FASTAFile::FASTAEntry > &fasta)
Definition DBSuitability.h:443
This class holds the functionality of calculating the database suitability.
Definition DBSuitability.h:48
bool checkScoreBetterThanThreshold_(const PeptideHit &hit, double threshold, bool higher_score_better) const
Tests if a PeptideHit has a score better than the given threshold.
Size getIndexWithMedianNovoHits_(const std::vector< SuitabilityData > &data) const
Finds the SuitabilityData object with the median number of de novo hits.
void writeIniFile_(const Param &parameters, const String &filename) const
Writes parameters into a given file.
void compute(PeptideIdentificationList &&pep_ids, const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &original_fasta, const std::vector< FASTAFile::FASTAEntry > &novo_fasta, const ProteinIdentification::SearchParameters &search_params)
Computes suitability of a database used to search an mzML file.
PeptideIdentificationList runIdentificationSearch_(const MSExperiment &exp, const std::vector< FASTAFile::FASTAEntry > &fasta_data, const String &adapter_name, Param &parameters) const
Executes the workflow from search adapter, followed by PeptideIndexer and finishes with FDR.
~DBSuitability() override=default
Destructor.
double calculateCorrectionFactor_(const SuitabilityData &data, const SuitabilityData &data_sampled, double sampling_rate) const
Calculates the correction factor from two suitability calculations.
UInt numberOfUniqueProteins_(const PeptideIdentificationList &peps, UInt number_of_hits=1) const
Determines the number of unique proteins found in the protein accessions of PeptideIdentifications.
const std::vector< SuitabilityData > & getResults() const
Returns results calculated by this metric.
std::vector< FASTAFile::FASTAEntry > getSubsampledFasta_(const std::vector< FASTAFile::FASTAEntry > &fasta_data, double subsampling_rate) const
Creates a subsampled fasta with the given subsampling rate.
double getScoreMatchingFDR_(const PeptideIdentificationList &pep_ids, double FDR, const String &score_name, bool higher_score_better) const
Extracts the worst score that still passes an FDR (q-value) threshold.
std::vector< SuitabilityData > results_
result vector
Definition DBSuitability.h:219
double getDecoyCutOff_(const PeptideIdentificationList &pep_ids, double reranking_cutoff_percentile) const
Calculates an xcorr cut-off based on decoy hits.
void appendDecoys_(std::vector< FASTAFile::FASTAEntry > &fasta) const
Calculates and appends decoys to a given vector of FASTAEntry.
void calculateSuitability_(const PeptideIdentificationList &pep_ids, SuitabilityData &data) const
Calculates all suitability data from a combined deNovo+database search.
bool isNovoHit_(const PeptideHit &hit) const
Tests if a PeptideHit is considered a deNovo hit.
const boost::regex decoy_pattern_
pattern for finding a decoy string
Definition DBSuitability.h:222
double extractScore_(const PeptideHit &pep_hit) const
Returns the cross correlation score normalized by MW (if existing), else if the 'force' flag is set t...
double getDecoyDiff_(const PeptideIdentification &pep_id) const
Calculates the xcorr difference between the top two hits marked as decoy.
std::pair< String, Param > extractSearchAdapterInfoFromMetaValues_(const ProteinIdentification::SearchParameters &meta_values) const
Looks through meta values of SearchParameters to find out which search adapter was used.
A base class for all classes handling default parameters.
Definition DefaultParamHandler.h:66
In-Memory representation of a mass spectrometry run.
Definition MSExperiment.h:49
Management and storage of parameters / INI files.
Definition Param.h:46
Represents a single spectrum match (candidate) for a specific tandem mass spectrum (MS/MS).
Definition PeptideHit.h:52
Container for peptide identifications from multiple spectra.
Definition PeptideIdentificationList.h:66
Represents the set of candidates (SpectrumMatches) identified for a single precursor spectrum.
Definition PeptideIdentification.h:64
A more convenient string class.
Definition String.h:34
unsigned int UInt
Unsigned integer type.
Definition Types.h:64
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition Types.h:97
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
struct to store results
Definition DBSuitability.h:52
SuitabilityData simulateNoReRanking() const
Returns a SuitabilityData object containing the data if re-ranking didn't happen.
Search parameters of the DB search.
Definition ProteinIdentification.h:254