OpenMS  2.4.0
IDBoostGraph.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2017.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Julianus Pfeuffer $
32 // $Authors: Julianus Pfeuffer $
33 // --------------------------------------------------------------------------
34 
35 #ifndef OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
36 #define OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
37 
38 #define INFERENCE_BENCH
39 
40 #include <OpenMS/ANALYSIS/ID/MessagePasserFactory.h> //included in BPI
42 #include <OpenMS/CONCEPT/Types.h>
47 
48 #include <vector>
49 #include <unordered_map>
50 
51 #include <boost/function.hpp>
52 #include <boost/graph/adjacency_list.hpp>
53 #include <boost/graph/depth_first_search.hpp>
54 #include <boost/graph/filtered_graph.hpp>
55 #include <boost/graph/properties.hpp>
56 #include <boost/variant.hpp>
57 #include <boost/variant/detail/hash_variant.hpp>
58 #include <boost/variant/static_visitor.hpp>
59 
60 namespace OpenMS
61 {
62 
75  //TODO Add OPENMS_DLLAPI everywhere
76  class OPENMS_DLLAPI IDBoostGraph
77  {
78 
79  public:
80 
81  #pragma clang diagnostic push
82  #pragma clang diagnostic ignored "-Wextra-semi"
83 
84  BOOST_STRONG_TYPEDEF(boost::blank, PeptideCluster)
85 
86  BOOST_STRONG_TYPEDEF(double, ProteinGroup)
87 
88  BOOST_STRONG_TYPEDEF(String, Peptide)
89 
90  BOOST_STRONG_TYPEDEF(Size, RunIndex)
91 
92  BOOST_STRONG_TYPEDEF(int, Charge)
93 
94  #pragma clang diagnostic pop
95 
96  //typedefs
97  //TODO rename ProteinGroup type since it collides with the actual OpenMS ProteinGroup
98  typedef boost::variant<ProteinHit*, ProteinGroup, PeptideCluster, Peptide, RunIndex, Charge, PeptideHit*> IDPointer;
99  typedef boost::variant<const ProteinHit*, const ProteinGroup*, const PeptideCluster*, const Peptide, const RunIndex, const Charge, const PeptideHit*> IDPointerConst;
100  //TODO check the impact of different data structures to store nodes/edges
101  // Directed graphs would make the internal computations much easier (less in/out edge checking) but boost
102  // does not allow computation of "non-strongly" connected components for directed graphs, which is what we would
103  // need. We can think about after/while copying to CCs, to insert it into a directed graph!
104  typedef boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer> Graph;
105  //typedef boost::subgraph<boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer>> SubGraph;
106  typedef std::vector<Graph> Graphs;
107  //typedef std::vector<SubGraph> SubGraphs;
108  typedef boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer> GraphConst;
109  typedef boost::graph_traits<Graph>::vertex_descriptor vertex_t;
110  typedef boost::graph_traits<Graph>::edge_descriptor edge_t;
111  typedef boost::filtered_graph<Graph, boost::function<bool(edge_t)>, boost::function<bool(vertex_t)> > FilteredGraph;
112  typedef std::set<IDBoostGraph::vertex_t> ProteinNodeSet;
113  typedef std::set<IDBoostGraph::vertex_t> PeptideNodeSet;
114 
115 
118  public boost::default_dfs_visitor
119  {
120  public:
122  : gs(vgs), curr_v(0), next_v(0), m()
123  {}
124 
125  template < typename Vertex, typename Graph >
126  void start_vertex(Vertex u, const Graph & tg)
127  {
128  gs.emplace_back();
129  next_v = boost::add_vertex(tg[u], gs.back());
130  m[u] = next_v;
131  }
132 
133  template < typename Vertex, typename Graph >
134  void discover_vertex(Vertex /*u*/, const Graph & /*tg*/)
135  {
136  curr_v = next_v;
137  }
138 
139  template < typename Edge, typename Graph >
140  void examine_edge(Edge e, const Graph & tg)
141  {
142  if (m.find(e.m_target) == m.end())
143  {
144  next_v = boost::add_vertex(tg[e.m_target], gs.back());
145  m[e.m_target] = next_v;
146  }
147  else
148  {
149  next_v = m[e.m_target];
150  }
151 
152  boost::add_edge(m[e.m_source], next_v, gs.back());
153  }
154 
156  vertex_t curr_v, next_v;
158  std::map<vertex_t, vertex_t> m;
159  };
160 
161  //TODO group visitors by templates
164  public boost::static_visitor<OpenMS::String>
165  {
166  public:
167 
169  {
170  return pep->getSequence().toString() + "_" + pep->getCharge();
171  }
172 
174  {
175  return prot->getAccession();
176  }
177 
178  OpenMS::String operator()(const ProteinGroup& /*protgrp*/) const
179  {
180  return String("PG");
181  }
182 
183  OpenMS::String operator()(const PeptideCluster& /*pc*/) const
184  {
185  return String("PepClust");
186  }
187 
188  OpenMS::String operator()(const Peptide& peptide) const
189  {
190  return peptide;
191  }
192 
193  OpenMS::String operator()(const RunIndex& ri) const
194  {
195  return String("rep" + String(ri));
196  }
197 
198  OpenMS::String operator()(const Charge& chg) const
199  {
200  return String("chg" + String(chg));
201  }
202 
203  };
204 
207  template<class CharT>
209  public boost::static_visitor<>
210  {
211  public:
212 
213  explicit PrintAddressVisitor(std::basic_ostream<CharT> stream):
214  stream_(stream)
215  {}
216 
217  void operator()(PeptideHit* pep) const
218  {
219  stream_ << pep->getSequence().toUnmodifiedString() << ": " << pep << std::endl;
220  }
221 
222  void operator()(ProteinHit* prot) const
223  {
224  stream_ << prot->getAccession() << ": " << prot << std::endl;
225  }
226 
227  void operator()(const ProteinGroup& /*protgrp*/) const
228  {
229  stream_ << "PG" << std::endl;
230  }
231 
232  void operator()(const PeptideCluster& /*pc*/) const
233  {
234  stream_ << "PepClust" << std::endl;
235  }
236 
237  void operator()(const Peptide& peptide) const
238  {
239  stream_ << peptide << std::endl;
240  }
241 
242  void operator()(const RunIndex& ri) const
243  {
244  stream_ << "rep" << ri << std::endl;
245  }
246 
247  void operator()(const Charge& chg) const
248  {
249  stream_ << "chg" << chg << std::endl;
250  }
251 
252  std::basic_ostream<CharT> stream_;
253  };
254 
258  public boost::static_visitor<>
259  {
260  public:
261 
262  void operator()(PeptideHit* pep, double posterior) const
263  {
264  pep->setScore(posterior);
265  }
266 
267  void operator()(ProteinHit* prot, double posterior) const
268  {
269  #ifdef INFERENCE_DEBUG
270  std::cout << "set score " << posterior << " for " << prot->getAccession() << std::endl;
271  #endif
272  prot->setScore(posterior);
273  }
274 
275  void operator()(ProteinGroup& pg, double posterior) const
276  {
277  pg = posterior;
278  }
279 
280  // Everything else, do nothing for now
281  template <class T>
282  void operator()(T& /*any node type*/, double /*posterior*/) const
283  {
284  // do nothing
285  }
286 
287  };
288 
290  IDBoostGraph(ProteinIdentification &proteins, std::vector<PeptideIdentification>& idedSpectra);
291  IDBoostGraph(
292  ProteinIdentification &proteins,
293  std::vector<PeptideIdentification>& idedSpectra,
294  const ExperimentalDesign& ed);
295 
297  void applyFunctorOnCCs(std::function<void(Graph&)> functor);
298  void applyFunctorOnCCsST(std::function<void(Graph&)> functor);
299 
302  void clusterIndistProteinsAndPeptides();
304  void clusterIndistProteinsAndPeptidesAndExtendGraph();
305 
309  void annotateIndistProteins(bool addSingletons = true) const;
310 
312  void computeConnectedComponents();
313 
319  void buildGraph(Size use_top_psms);
320  void buildGraphWithRunInfo(Size use_top_psms, bool readstore_run_info = true);
321 
322  Size getNrConnectedComponents();
323 
324  //TODO docu
325  //void buildExtendedGraph(bool use_all_psms, std::pair<int,int> chargeRange, unsigned int nrReplicates);
326 
327  private:
328 
329  struct SequenceToReplicateChargeVariantHierarchy;
330 
332  Graph g;
333 
336 
337  #ifdef INFERENCE_BENCH
338  std::vector<std::pair<Size,double>> sizes_and_times_{1};
340  #endif
341 
343  //TODO for consensusXML this probably needs to become a vector.
346  std::vector<PeptideIdentification>& idedSpectra_;
348  //TODO think about using a pointer here instead of copying. But it's usually not too big
350 
353  //TODO think about preallocating it, but the number of peptide hits is not easily computed
354  //since they are inside the pepIDs
355  //TODO would multiple sets be better?
356  std::unordered_map<vertex_t, Size> pepHitVtx_to_run_;
357 
360 
363  vertex_t addVertexWithLookup_(IDPointer& ptr, std::unordered_map<IDPointer, vertex_t, boost::hash<IDPointer>>& vertex_map);
364  //vertex_t addVertexWithLookup_(IDPointerConst& ptr, std::unordered_map<IDPointerConst, vertex_t, boost::hash<IDPointerConst>>& vertex_map);
365 
366 
368  void annotateIndistProteins_(const Graph& fg, bool addSingletons) const;
369 
370  void printFilteredGraph(std::ostream& out, const FilteredGraph& fg) const;
371  void printGraph(std::ostream& out, const Graph& fg) const;
372 
373  };
374 
375 } //namespace OpenMS
376 
377 #endif // OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
Representation of a protein identification run.
Definition: ProteinIdentification.h:68
void operator()(const Peptide &peptide) const
Definition: IDBoostGraph.h:237
const String & getAccession() const
returns the accession of the protein
void operator()(PeptideHit *pep) const
Definition: IDBoostGraph.h:217
A more convenient string class.
Definition: String.h:58
void operator()(T &, double) const
Definition: IDBoostGraph.h:282
OpenMS::String operator()(const ProteinGroup &) const
Definition: IDBoostGraph.h:178
std::set< IDBoostGraph::vertex_t > PeptideNodeSet
Definition: IDBoostGraph.h:113
boost::variant< ProteinHit *, ProteinGroup, PeptideCluster, Peptide, RunIndex, Charge, PeptideHit * > IDPointer
Definition: IDBoostGraph.h:98
void operator()(ProteinHit *prot, double posterior) const
Definition: IDBoostGraph.h:267
Creates and maintains a boost graph based on the OpenMS ID datastructures.
Definition: IDBoostGraph.h:76
OpenMS::String operator()(const RunIndex &ri) const
Definition: IDBoostGraph.h:193
void discover_vertex(Vertex, const Graph &)
Definition: IDBoostGraph.h:134
void setScore(const double score)
sets the score of the protein hit
Definition: IDBoostGraph.h:257
void examine_edge(Edge e, const Graph &tg)
Definition: IDBoostGraph.h:140
const AASequence & getSequence() const
returns the peptide sequence without trailing or following spaces
void operator()(const RunIndex &ri) const
Definition: IDBoostGraph.h:242
OpenMS::String operator()(const ProteinHit *prot) const
Definition: IDBoostGraph.h:173
boost::variant< const ProteinHit *, const ProteinGroup *, const PeptideCluster *, const Peptide, const RunIndex, const Charge, const PeptideHit * > IDPointerConst
Definition: IDBoostGraph.h:99
vertex_t next_v
Definition: IDBoostGraph.h:156
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
ProteinIdentification & proteins_
underlying protein identification object
Definition: IDBoostGraph.h:344
Graph g
the initial boost Graph
Definition: IDBoostGraph.h:329
LabelVisitor lv_
a visitor that creates labels based on the node type (e.g. for printing)
Definition: IDBoostGraph.h:359
Representation of the Experimental Design in OpenMS. Instances can be loaded via the ExperimentalDesi...
Definition: ExperimentalDesign.h:85
void start_vertex(Vertex u, const Graph &tg)
Definition: IDBoostGraph.h:126
std::unordered_map< vertex_t, Size > pepHitVtx_to_run_
Definition: IDBoostGraph.h:356
String toString() const
returns the peptide as string with modifications embedded in brackets
void operator()(const PeptideCluster &) const
Definition: IDBoostGraph.h:232
OpenMS::String operator()(const Peptide &peptide) const
Definition: IDBoostGraph.h:188
void operator()(PeptideHit *pep, double posterior) const
Definition: IDBoostGraph.h:262
boost::adjacency_list< boost::setS, boost::vecS, boost::undirectedS, IDPointer > GraphConst
Definition: IDBoostGraph.h:108
Representation of a peptide hit.
Definition: PeptideHit.h:54
PrintAddressVisitor(std::basic_ostream< CharT > stream)
Definition: IDBoostGraph.h:213
boost::adjacency_list< boost::setS, boost::vecS, boost::undirectedS, IDPointer > Graph
Definition: IDBoostGraph.h:104
std::vector< PeptideIdentification > & idedSpectra_
underlying peptide identifications
Definition: IDBoostGraph.h:346
boost::graph_traits< Graph >::edge_descriptor edge_t
Definition: IDBoostGraph.h:110
A boost dfs visitor that copies connected components into a vector of graphs.
Definition: IDBoostGraph.h:117
std::set< IDBoostGraph::vertex_t > ProteinNodeSet
Definition: IDBoostGraph.h:112
const ExperimentalDesign exp_design_
underlying experimental design, if not given it will be default constructed
Definition: IDBoostGraph.h:349
Representation of a protein hit.
Definition: ProteinHit.h:57
void operator()(ProteinHit *prot) const
Definition: IDBoostGraph.h:222
void operator()(ProteinGroup &pg, double posterior) const
Definition: IDBoostGraph.h:275
OpenMS::String operator()(const Charge &chg) const
Definition: IDBoostGraph.h:198
Definition: IDBoostGraph.h:208
void operator()(const ProteinGroup &) const
Definition: IDBoostGraph.h:227
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
OpenMS::String operator()(const PeptideHit *pep) const
Definition: IDBoostGraph.h:168
Visits nodes in the boost graph (ptrs to an ID Object) and depending on their type creates a label...
Definition: IDBoostGraph.h:163
dfs_ccsplit_visitor(Graphs &vgs)
Definition: IDBoostGraph.h:121
std::basic_ostream< CharT > stream_
Definition: IDBoostGraph.h:252
String toUnmodifiedString() const
returns the peptide as string without any modifications
void operator()(const Charge &chg) const
Definition: IDBoostGraph.h:247
OpenMS::String operator()(const PeptideCluster &) const
Definition: IDBoostGraph.h:183
Graphs ccs_
the Graph split into connected components
Definition: IDBoostGraph.h:335
boost::filtered_graph< Graph, boost::function< bool(edge_t)>, boost::function< bool(vertex_t)> > FilteredGraph
Definition: IDBoostGraph.h:111
std::map< vertex_t, vertex_t > m
A mapping from old node id to new node id to not duplicate existing ones in the new graph...
Definition: IDBoostGraph.h:158
Graphs & gs
Definition: IDBoostGraph.h:155
void setScore(double score)
sets the PSM score
boost::graph_traits< Graph >::vertex_descriptor vertex_t
Definition: IDBoostGraph.h:109
Int getCharge() const
returns the charge of the peptide
std::vector< Graph > Graphs
Definition: IDBoostGraph.h:106