OpenMS
IDMergerAlgorithm.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2023.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Julianus Pfeuffer $
32 // $Authors: Julianus Pfeuffer $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
42 
43 #include <unordered_set>
44 
45 namespace OpenMS
46 {
47 
48  //TODO add params for checking consistency (i.e. how strict to check)
49  //TODO add another subclass that does score-aware merging? (i.e. only keep best per peptide[sequence])
50 
// IDMergerAlgorithm: creates a new protein ID run into which other runs can be
// inserted, building a union of protein hits (the generated summary of this class
// is truncated in the index that follows the namespace).
// Inherits from DefaultParamHandler (default-parameter handling) and
// ProgressLogger (progress reporting).
// NOTE: this is a rendered source listing. The leading numbers are the line
// numbers of the original header; gaps in that numbering mark doc comments and
// declaration heads that were elided. The index after the namespace lists the
// complete signatures.
61  class OPENMS_DLLAPI IDMergerAlgorithm:
62  public DefaultParamHandler,
63  public ProgressLogger
64  {
65  public:
    // Constructor. runIdentifier defaults to "merged".
    // NOTE(review): presumably the "initial identifier" that getNewIdentifier_()
    // extends with a timestamp - confirm in the implementation.
66  explicit IDMergerAlgorithm (const String& runIdentifier = "merged");
67 
    // Two overloads that insert protein runs together with their peptide
    // identifications: the first takes both vectors by rvalue reference, the
    // second by const reference. Their doc comments are elided from this listing.
70  void insertRuns(std::vector<ProteinIdentification>&& prots,
71  std::vector<PeptideIdentification>&& peps);
72  void insertRuns(const std::vector<ProteinIdentification>& prots,
73  const std::vector<PeptideIdentification>& peps);
74 
75  //TODO add methods to just insert prots or just peps. Especially makes sense if you do re-indexing anyway,
76  // then you do not need the proteins. But then we need origin information. Either externally in form of a
77  // String or StringList (like the one from ProteinID.getPrimaryMSRunPath). Or by having the file annotated
78  // at the PeptideID (with getBasename maybe?)
79  // Current solution would be to clear the ProteinIdentification if you do not need the proteins and add all the
80  // necessary information about origin(s) to this ProteinIdentification.
81 
    // Continuation of `void returnResultsAndClear(ProteinIdentification& prots, ...)`
    // (head elided in this listing): returns the merged results through the two
    // output references and resets/clears all internal data.
84  std::vector<PeptideIdentification>& peps);
85 
86  private:
87 
    // The declarations of `String getNewIdentifier_() const` (returns the new
    // identifier: the initial identifier plus a timestamp) and
    // `static void copySearchParams_(const ProteinIdentification& from, ProteinIdentification& to)`
    // (copies over search parameters) are not shown in this listing; the blank
    // numbered lines below are most likely where they stood.
90 
93 
    // Continuation of `bool checkOldRunConsistency_(...) const`, two-argument
    // overload (head elided in this listing).
100  const std::vector<ProteinIdentification>& protRuns,
101  const String& experiment_type) const;
102 
    // Continuation of `bool checkOldRunConsistency_(...) const`, overload that
    // additionally takes a reference run `ref` (head elided in this listing).
110  const std::vector<ProteinIdentification>& protRuns,
111  const ProteinIdentification& ref,
112  const String& experiment_type) const;
113 
    // Continuation of `void insertProteinIDs_(...)` (head elided in this listing);
    // takes the old protein runs by rvalue reference.
117  std::vector<ProteinIdentification>&& old_protRuns
118  );
119 
    // Continuation of `void updateAndMovePepIDs_(...)` (head elided in this
    // listing). Takes the peptide IDs by rvalue reference, a map from run
    // identifier to run index, the origin files per run, and a flag controlling
    // whether the origin is annotated.
124  std::vector<PeptideIdentification>&& pepIDs,
125  const std::map<String, Size>& runID_to_runIdx,
126  const std::vector<StringList>& originFiles,
127  bool annotate_origin
128  );
129 
130 
    // Continuation of `void movePepIDsAndRefProteinsToResultFaster_(...)`
    // (head elided in this listing); takes both vectors by rvalue reference.
132  std::vector<PeptideIdentification>&& pepIDs,
133  std::vector<ProteinIdentification>&& old_protRuns
134  );
135 
    // `ProteinIdentification prot_result_` - the resulting new Protein IDs - is
    // defined at original line 137, which is not shown in this listing.
138 
    // The resulting new Peptide IDs.
140  std::vector<PeptideIdentification> pep_result_;
141 
    // Hash functor for ProteinHit: hashes only the accession string.
142  static size_t accessionHash_(const ProteinHit& p){
143  return std::hash<String>()(p.getAccession());
144  }
    // Equality functor for ProteinHit: two hits compare equal when their
    // accessions are equal; no other field is inspected.
145  static bool accessionEqual_(const ProteinHit& p1, const ProteinHit& p2){
146  return p1.getAccession() == p2.getAccession();
147  }
    // Function-pointer types matching the two helpers above, used as the hasher
    // and key-equality template arguments of the set below.
148  using hash_type = std::size_t (*)(const ProteinHit&);
149  using equal_type = bool (*)(const ProteinHit&, const ProteinHit&);
    // Set of collected protein hits.
    // NOTE(review): presumably constructed with accessionHash_/accessionEqual_,
    // which would make hits sharing an accession count as duplicates - confirm
    // in the constructor (not shown here).
150  std::unordered_set<ProteinHit, hash_type, equal_type> collected_protein_hits_;
151 
    // Defaults to false; its doc comment is elided from this listing.
    // NOTE(review): looks like it records whether any runs were inserted yet -
    // confirm in the implementation.
153  bool filled_ = false;
154 
    // Keeps track of the mzML origins of spectra (file origin -> index).
156  std::map<String, Size> file_origin_to_idx_;
157 
    // `String id_` - the new identifier string - is defined at original line 159,
    // which is not shown in this listing.
160  };
161 } // namespace OpenMS
A base class for all classes handling default parameters.
Definition: DefaultParamHandler.h:92
Creates a new Protein ID run into which other runs can be inserted. Creates union of protein hits but...
Definition: IDMergerAlgorithm.h:64
ProteinIdentification prot_result_
the resulting new Protein IDs
Definition: IDMergerAlgorithm.h:137
void movePepIDsAndRefProteinsToResultFaster_(std::vector< PeptideIdentification > &&pepIDs, std::vector< ProteinIdentification > &&old_protRuns)
bool(*)(const ProteinHit &, const ProteinHit &) equal_type
Definition: IDMergerAlgorithm.h:149
void insertProteinIDs_(std::vector< ProteinIdentification > &&old_protRuns)
void returnResultsAndClear(ProteinIdentification &prots, std::vector< PeptideIdentification > &peps)
Return the merged results and reset/clear all internal data.
bool checkOldRunConsistency_(const std::vector< ProteinIdentification > &protRuns, const ProteinIdentification &ref, const String &experiment_type) const
void insertRuns(const std::vector< ProteinIdentification > &prots, const std::vector< PeptideIdentification > &peps)
std::size_t(*)(const ProteinHit &) hash_type
Definition: IDMergerAlgorithm.h:148
static size_t accessionHash_(const ProteinHit &p)
Definition: IDMergerAlgorithm.h:142
std::vector< PeptideIdentification > pep_result_
the resulting new Peptide IDs
Definition: IDMergerAlgorithm.h:140
void insertRuns(std::vector< ProteinIdentification > &&prots, std::vector< PeptideIdentification > &&peps)
static bool accessionEqual_(const ProteinHit &p1, const ProteinHit &p2)
Definition: IDMergerAlgorithm.h:145
std::map< String, Size > file_origin_to_idx_
to keep track of the mzML origins of spectra
Definition: IDMergerAlgorithm.h:156
static void copySearchParams_(const ProteinIdentification &from, ProteinIdentification &to)
Copies over search parameters.
std::unordered_set< ProteinHit, hash_type, equal_type > collected_protein_hits_
Definition: IDMergerAlgorithm.h:150
String getNewIdentifier_() const
Returns the new identifier. The initial identifier plus a timestamp.
void updateAndMovePepIDs_(std::vector< PeptideIdentification > &&pepIDs, const std::map< String, Size > &runID_to_runIdx, const std::vector< StringList > &originFiles, bool annotate_origin)
String id_
the new identifier string
Definition: IDMergerAlgorithm.h:159
bool checkOldRunConsistency_(const std::vector< ProteinIdentification > &protRuns, const String &experiment_type) const
IDMergerAlgorithm(const String &runIdentifier="merged")
Base class for all classes that want to report their progress.
Definition: ProgressLogger.h:53
Representation of a protein hit.
Definition: ProteinHit.h:60
const String & getAccession() const
returns the accession of the protein
Representation of a protein identification run.
Definition: ProteinIdentification.h:76
A more convenient string class.
Definition: String.h:60
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:48