OpenMS  2.4.0
IDBoostGraph.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2017.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Julianus Pfeuffer $
32 // $Authors: Julianus Pfeuffer $
33 // --------------------------------------------------------------------------
34 
35 #ifndef OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
36 #define OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
37 
38 // define to get timings for connected components
39 //#define INFERENCE_BENCH
40 
41 #include <OpenMS/ANALYSIS/ID/MessagePasserFactory.h> //included in BPI
43 #include <OpenMS/CONCEPT/Types.h>
48 
49 #include <vector>
50 #include <unordered_map>
51 
52 #include <boost/function.hpp>
53 #include <boost/graph/adjacency_list.hpp>
54 #include <boost/graph/depth_first_search.hpp>
55 #include <boost/graph/filtered_graph.hpp>
56 #include <boost/graph/properties.hpp>
57 #include <boost/variant.hpp>
58 #include <boost/variant/detail/hash_variant.hpp>
59 #include <boost/variant/static_visitor.hpp>
60 
61 namespace OpenMS
62 {
63 
76  //TODO Add OPENMS_DLLAPI everywhere
77  class OPENMS_DLLAPI IDBoostGraph
78  {
79 
80  public:
81 
82  #pragma clang diagnostic push
83  #pragma clang diagnostic ignored "-Wextra-semi"
84 
85  BOOST_STRONG_TYPEDEF(boost::blank, PeptideCluster)
86 
87  BOOST_STRONG_TYPEDEF(double, ProteinGroup)
88 
89  BOOST_STRONG_TYPEDEF(String, Peptide)
90 
91  BOOST_STRONG_TYPEDEF(Size, RunIndex)
92 
93  BOOST_STRONG_TYPEDEF(int, Charge)
94 
95  #pragma clang diagnostic pop
96 
97  //typedefs
98  //TODO rename ProteinGroup type since it collides with the actual OpenMS ProteinGroup
99  typedef boost::variant<ProteinHit*, ProteinGroup, PeptideCluster, Peptide, RunIndex, Charge, PeptideHit*> IDPointer;
100  typedef boost::variant<const ProteinHit*, const ProteinGroup*, const PeptideCluster*, const Peptide, const RunIndex, const Charge, const PeptideHit*> IDPointerConst;
101  //TODO check the impact of different data structures to store nodes/edges
102  // Directed graphs would make the internal computations much easier (less in/out edge checking) but boost
103  // does not allow computation of "non-strongly" connected components for directed graphs, which is what we would
104  // need. We can think about after/while copying to CCs, to insert it into a directed graph!
105  typedef boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer> Graph;
106  //typedef boost::subgraph<boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer>> SubGraph;
107  typedef std::vector<Graph> Graphs;
108  //typedef std::vector<SubGraph> SubGraphs;
109  typedef boost::adjacency_list <boost::setS, boost::vecS, boost::undirectedS, IDPointer> GraphConst;
110  typedef boost::graph_traits<Graph>::vertex_descriptor vertex_t;
111  typedef boost::graph_traits<Graph>::edge_descriptor edge_t;
112  typedef boost::filtered_graph<Graph, boost::function<bool(edge_t)>, boost::function<bool(vertex_t)> > FilteredGraph;
113  typedef std::set<IDBoostGraph::vertex_t> ProteinNodeSet;
114  typedef std::set<IDBoostGraph::vertex_t> PeptideNodeSet;
115 
116 
119  public boost::default_dfs_visitor
120  {
121  public:
123  : gs(vgs), curr_v(0), next_v(0), m()
124  {}
125 
126  template < typename Vertex, typename Graph >
127  void start_vertex(Vertex u, const Graph & tg)
128  {
129  gs.emplace_back();
130  next_v = boost::add_vertex(tg[u], gs.back());
131  m[u] = next_v;
132  }
133 
134  template < typename Vertex, typename Graph >
135  void discover_vertex(Vertex /*u*/, const Graph & /*tg*/)
136  {
137  curr_v = next_v;
138  }
139 
140  template < typename Edge, typename Graph >
141  void examine_edge(Edge e, const Graph & tg)
142  {
143  if (m.find(e.m_target) == m.end())
144  {
145  next_v = boost::add_vertex(tg[e.m_target], gs.back());
146  m[e.m_target] = next_v;
147  }
148  else
149  {
150  next_v = m[e.m_target];
151  }
152 
153  boost::add_edge(m[e.m_source], next_v, gs.back());
154  }
155 
157  vertex_t curr_v, next_v;
159  std::map<vertex_t, vertex_t> m;
160  };
161 
162  //TODO group visitors by templates
165  public boost::static_visitor<OpenMS::String>
166  {
167  public:
168 
170  {
171  return pep->getSequence().toString() + "_" + pep->getCharge();
172  }
173 
175  {
176  return prot->getAccession();
177  }
178 
179  OpenMS::String operator()(const ProteinGroup& /*protgrp*/) const
180  {
181  return String("PG");
182  }
183 
184  OpenMS::String operator()(const PeptideCluster& /*pc*/) const
185  {
186  return String("PepClust");
187  }
188 
189  OpenMS::String operator()(const Peptide& peptide) const
190  {
191  return peptide;
192  }
193 
194  OpenMS::String operator()(const RunIndex& ri) const
195  {
196  return String("rep" + String(ri));
197  }
198 
199  OpenMS::String operator()(const Charge& chg) const
200  {
201  return String("chg" + String(chg));
202  }
203 
204  };
205 
208  template<class CharT>
210  public boost::static_visitor<>
211  {
212  public:
213 
214  explicit PrintAddressVisitor(std::basic_ostream<CharT> stream):
215  stream_(stream)
216  {}
217 
218  void operator()(PeptideHit* pep) const
219  {
220  stream_ << pep->getSequence().toUnmodifiedString() << ": " << pep << std::endl;
221  }
222 
223  void operator()(ProteinHit* prot) const
224  {
225  stream_ << prot->getAccession() << ": " << prot << std::endl;
226  }
227 
228  void operator()(const ProteinGroup& /*protgrp*/) const
229  {
230  stream_ << "PG" << std::endl;
231  }
232 
233  void operator()(const PeptideCluster& /*pc*/) const
234  {
235  stream_ << "PepClust" << std::endl;
236  }
237 
238  void operator()(const Peptide& peptide) const
239  {
240  stream_ << peptide << std::endl;
241  }
242 
243  void operator()(const RunIndex& ri) const
244  {
245  stream_ << "rep" << ri << std::endl;
246  }
247 
248  void operator()(const Charge& chg) const
249  {
250  stream_ << "chg" << chg << std::endl;
251  }
252 
253  std::basic_ostream<CharT> stream_;
254  };
255 
259  public boost::static_visitor<>
260  {
261  public:
262 
263  void operator()(PeptideHit* pep, double posterior) const
264  {
265  pep->setScore(posterior);
266  }
267 
268  void operator()(ProteinHit* prot, double posterior) const
269  {
270  #ifdef INFERENCE_DEBUG
271  std::cout << "set score " << posterior << " for " << prot->getAccession() << std::endl;
272  #endif
273  prot->setScore(posterior);
274  }
275 
276  void operator()(ProteinGroup& pg, double posterior) const
277  {
278  pg = posterior;
279  }
280 
281  // Everything else, do nothing for now
282  template <class T>
283  void operator()(T& /*any node type*/, double /*posterior*/) const
284  {
285  // do nothing
286  }
287 
288  };
289 
291  IDBoostGraph(ProteinIdentification &proteins, std::vector<PeptideIdentification>& idedSpectra);
292  IDBoostGraph(
293  ProteinIdentification &proteins,
294  std::vector<PeptideIdentification>& idedSpectra,
295  const ExperimentalDesign& ed);
296 
298  void applyFunctorOnCCs(std::function<unsigned long(Graph&)> functor);
299  void applyFunctorOnCCsST(std::function<void(Graph&)> functor);
300 
303  void clusterIndistProteinsAndPeptides();
305  void clusterIndistProteinsAndPeptidesAndExtendGraph();
306 
310  void annotateIndistProteins(bool addSingletons = true) const;
311 
313  void computeConnectedComponents();
314 
320  void buildGraph(Size use_top_psms);
321  void buildGraphWithRunInfo(Size use_top_psms, bool readstore_run_info = true);
322 
323  Size getNrConnectedComponents();
324 
325  //TODO docu
326  //void buildExtendedGraph(bool use_all_psms, std::pair<int,int> chargeRange, unsigned int nrReplicates);
327 
328  static void printFilteredGraph(std::ostream& out, const FilteredGraph& fg);
329  static void printGraph(std::ostream& out, const Graph& fg);
330 
331  private:
332 
333  struct SequenceToReplicateChargeVariantHierarchy;
334 
336  Graph g;
337 
340 
341  #ifdef INFERENCE_BENCH
342  std::vector<std::tuple<vertex_t, vertex_t, unsigned long, double>> sizes_and_times_{1};
344  #endif
345 
347  //TODO for consensusXML this probably needs to become a vector.
350  std::vector<PeptideIdentification>& idedSpectra_;
352  //TODO think about using a pointer here instead of copying. But it's usually not too big
354 
357  //TODO think about preallocating it, but the number of peptide hits is not easily computed
358  //since they are inside the pepIDs
359  //TODO would multiple sets be better?
360  std::unordered_map<vertex_t, Size> pepHitVtx_to_run_;
361 
363  //LabelVisitor lv_; //currently created locally
364 
367  vertex_t addVertexWithLookup_(IDPointer& ptr, std::unordered_map<IDPointer, vertex_t, boost::hash<IDPointer>>& vertex_map);
368  //vertex_t addVertexWithLookup_(IDPointerConst& ptr, std::unordered_map<IDPointerConst, vertex_t, boost::hash<IDPointerConst>>& vertex_map);
369 
370 
372  void annotateIndistProteins_(const Graph& fg, bool addSingletons) const;
373 
374  };
375 
376 } //namespace OpenMS
377 
378 #endif // OPENMS_ANALYSIS_ID_IDBOOSTGRAPH_H
const ExperimentalDesign exp_design_
underlying experimental design, if not given it will be default constructed
Definition: IDBoostGraph.h:353
boost::variant< const ProteinHit *, const ProteinGroup *, const PeptideCluster *, const Peptide, const RunIndex, const Charge, const PeptideHit * > IDPointerConst
Definition: IDBoostGraph.h:100
Int getCharge() const
returns the charge of the peptide
void operator()(PeptideHit *pep) const
Definition: IDBoostGraph.h:218
void operator()(ProteinHit *prot, double posterior) const
Definition: IDBoostGraph.h:268
OpenMS::String operator()(const RunIndex &ri) const
Definition: IDBoostGraph.h:194
boost::variant< ProteinHit *, ProteinGroup, PeptideCluster, Peptide, RunIndex, Charge, PeptideHit * > IDPointer
Definition: IDBoostGraph.h:99
std::set< IDBoostGraph::vertex_t > ProteinNodeSet
Definition: IDBoostGraph.h:113
Representation of a peptide hit.
Definition: PeptideHit.h:54
void operator()(T &, double) const
Definition: IDBoostGraph.h:283
A boost dfs visitor that copies connected components into a vector of graphs.
Definition: IDBoostGraph.h:118
std::set< IDBoostGraph::vertex_t > PeptideNodeSet
Definition: IDBoostGraph.h:114
void operator()(const RunIndex &ri) const
Definition: IDBoostGraph.h:243
A more convenient string class.
Definition: String.h:58
std::vector< Graph > Graphs
Definition: IDBoostGraph.h:107
Representation of the Experimental Design in OpenMS. Instances can be loaded via the ExperimentalDesi...
Definition: ExperimentalDesign.h:85
void examine_edge(Edge e, const Graph &tg)
Definition: IDBoostGraph.h:141
std::vector< PeptideIdentification > & idedSpectra_
underlying peptide identifications
Definition: IDBoostGraph.h:350
const String & getAccession() const
returns the accession of the protein
void operator()(ProteinGroup &pg, double posterior) const
Definition: IDBoostGraph.h:276
Representation of a protein hit.
Definition: ProteinHit.h:57
Graph g
the initial boost Graph
Definition: IDBoostGraph.h:333
OpenMS::String operator()(const PeptideHit *pep) const
Definition: IDBoostGraph.h:169
OpenMS::String operator()(const ProteinGroup &) const
Definition: IDBoostGraph.h:179
Representation of a protein identification run.
Definition: ProteinIdentification.h:68
void operator()(const ProteinGroup &) const
Definition: IDBoostGraph.h:228
boost::graph_traits< Graph >::vertex_descriptor vertex_t
Definition: IDBoostGraph.h:110
void operator()(const PeptideCluster &) const
Definition: IDBoostGraph.h:233
boost::filtered_graph< Graph, boost::function< bool(edge_t)>, boost::function< bool(vertex_t)> > FilteredGraph
Definition: IDBoostGraph.h:112
Definition: IDBoostGraph.h:209
boost::adjacency_list< boost::setS, boost::vecS, boost::undirectedS, IDPointer > Graph
Definition: IDBoostGraph.h:105
Creates and maintains a boost graph based on the OpenMS ID datastructures.
Definition: IDBoostGraph.h:77
boost::adjacency_list< boost::setS, boost::vecS, boost::undirectedS, IDPointer > GraphConst
Definition: IDBoostGraph.h:109
void discover_vertex(Vertex, const Graph &)
Definition: IDBoostGraph.h:135
String toUnmodifiedString() const
returns the peptide as string without any modifications
vertex_t next_v
Definition: IDBoostGraph.h:157
void operator()(PeptideHit *pep, double posterior) const
Definition: IDBoostGraph.h:263
Visits nodes in the boost graph (ptrs to an ID Object) and depending on their type creates a label...
Definition: IDBoostGraph.h:164
Graphs ccs_
the Graph split into connected components
Definition: IDBoostGraph.h:339
void operator()(const Peptide &peptide) const
Definition: IDBoostGraph.h:238
void operator()(ProteinHit *prot) const
Definition: IDBoostGraph.h:223
std::map< vertex_t, vertex_t > m
A mapping from old node id to new node id to not duplicate existing ones in the new graph...
Definition: IDBoostGraph.h:159
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
std::unordered_map< vertex_t, Size > pepHitVtx_to_run_
Definition: IDBoostGraph.h:360
OpenMS::String operator()(const ProteinHit *prot) const
Definition: IDBoostGraph.h:174
OpenMS::String operator()(const Charge &chg) const
Definition: IDBoostGraph.h:199
Definition: IDBoostGraph.h:258
OpenMS::String operator()(const PeptideCluster &) const
Definition: IDBoostGraph.h:184
void start_vertex(Vertex u, const Graph &tg)
Definition: IDBoostGraph.h:127
PrintAddressVisitor(std::basic_ostream< CharT > stream)
Definition: IDBoostGraph.h:214
void setScore(const double score)
sets the score of the protein hit
void setScore(double score)
sets the PSM score
Graphs & gs
Definition: IDBoostGraph.h:156
dfs_ccsplit_visitor(Graphs &vgs)
Definition: IDBoostGraph.h:122
std::basic_ostream< CharT > stream_
Definition: IDBoostGraph.h:253
OpenMS::String operator()(const Peptide &peptide) const
Definition: IDBoostGraph.h:189
String toString() const
returns the peptide as string with modifications embedded in brackets
void operator()(const Charge &chg) const
Definition: IDBoostGraph.h:248
ProteinIdentification & proteins_
underlying protein identification object
Definition: IDBoostGraph.h:348
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
const AASequence & getSequence() const
returns the peptide sequence without trailing or following spaces
boost::graph_traits< Graph >::edge_descriptor edge_t
Definition: IDBoostGraph.h:111