OpenMS  2.4.0
IDFilter.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2018.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Mathias Walzer $
32 // $Authors: Nico Pfeifer, Mathias Walzer, Hendrik Weisser $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
37 #include <OpenMS/config.h>
46 
47 #include <algorithm>
48 #include <climits>
49 #include <vector>
50 #include <set>
51 #include <map>
52 #include <unordered_set>
53 
54 namespace OpenMS
55 {
76  class OPENMS_DLLAPI IDFilter
77  {
78 public:
79 
81  IDFilter();
82 
84  virtual ~IDFilter();
85 
86 
92 
/// Predicate: is the score of this hit at least as good as the given value?
///
/// @tparam HitType Any type that provides a `getScore()` accessor.
template <class HitType>
struct HasGoodScore
{
  typedef HitType argument_type; // for use as a predicate

  /// Threshold the hit's score is compared against (the comparison is inclusive).
  double score;
  /// Score orientation: if true, scores >= threshold pass; otherwise scores <= threshold pass.
  bool higher_score_better;

  /// @param score_ Threshold score.
  /// @param higher_score_better_ Whether larger scores count as better.
  HasGoodScore(double score_, bool higher_score_better_) :
    score(score_),
    higher_score_better(higher_score_better_)
  {}

  /// @return true if the score of @p hit reaches the threshold in the configured orientation.
  bool operator()(const HitType& hit) const
  {
    if (higher_score_better)
    {
      return hit.getScore() >= score;
    }
    return hit.getScore() <= score;
  }
};
117 
123  template <class HitType>
124  struct HasMaxRank
125  {
126  typedef HitType argument_type; // for use as a predicate
127 
129 
130  HasMaxRank(Size rank_):
131  rank(rank_)
132  {
133  if (rank_ == 0)
134  {
135  throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "The cut-off value for rank filtering must not be zero!");
136  }
137  }
138 
139  bool operator()(const HitType& hit) const
140  {
141  Size hit_rank = hit.getRank();
142  if (hit_rank == 0)
143  {
144  throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No rank assigned to peptide or protein hit");
145  }
146  return hit_rank <= rank;
147  }
148  };
149 
155  template <class HitType>
157  {
158  typedef HitType argument_type; // for use as a predicate
159 
162 
163  HasMetaValue(const String& key_, const DataValue& value_):
164  key(key_),
165  value(value_)
166  {}
167 
168  bool operator()(const HitType& hit) const
169  {
170  DataValue found = hit.getMetaValue(key);
171  if (found.isEmpty()) return false; // meta value "key" not set
172  if (value.isEmpty()) return true; // "key" is set, value doesn't matter
173  return found == value;
174  }
175  };
176 
178  template <class HitType>
180  {
181  typedef HitType argument_type; // for use as a predicate
182 
184  double value;
185 
186  HasMaxMetaValue(const String& key_, const double& value_):
187  key(key_),
188  value(value_)
189  {}
190 
191  bool operator()(const HitType& hit) const
192  {
193  DataValue found = hit.getMetaValue(key);
194  if (found.isEmpty()) return false; // meta value "key" not set
195  return double(found) <= value;
196  }
197  };
198 
200  template <class HitType>
202  {
203  typedef HitType argument_type; // for use as a predicate
204 
205  struct HasMetaValue<HitType> target_decoy, is_decoy;
206 
208  target_decoy("target_decoy", "decoy"), is_decoy("isDecoy", "true")
209  {}
210 
211  bool operator()(const HitType& hit) const
212  {
213  // @TODO: this could be done slightly more efficiently by returning
214  // false if the "target_decoy" meta value is "target" or "target+decoy",
215  // without checking for an "isDecoy" meta value in that case
216  return target_decoy(hit) || is_decoy(hit);
217  }
218  };
219 
225  template <class HitType>
227  {
228  typedef HitType argument_type; // for use as a predicate
229 
230  const std::unordered_set<String>& accessions;
231 
232  HasMatchingAccessionUnordered(const std::unordered_set<String>& accessions_):
233  accessions(accessions_)
234  {}
235 
236  bool operator()(const PeptideHit& hit) const
237  {
238  std::set<String> present_accessions = hit.extractProteinAccessionsSet();
239  for (std::set<String>::iterator it = present_accessions.begin();
240  it != present_accessions.end(); ++it)
241  {
242  if (accessions.count(*it) > 0) return true;
243  }
244  return false;
245  }
246 
247  bool operator()(const ProteinHit& hit) const
248  {
249  return (accessions.count(hit.getAccession()) > 0);
250  }
251 
252  bool operator()(const PeptideEvidence& evidence) const
253  {
254  return (accessions.count(evidence.getProteinAccession()) > 0);
255  }
256  };
257 
263  template <class HitType>
265  {
266  typedef HitType argument_type; // for use as a predicate
267 
268  const std::set<String>& accessions;
269 
270  HasMatchingAccession(const std::set<String>& accessions_):
271  accessions(accessions_)
272  {}
273 
274  bool operator()(const PeptideHit& hit) const
275  {
276  std::set<String> present_accessions = hit.extractProteinAccessionsSet();
277  for (std::set<String>::iterator it = present_accessions.begin();
278  it != present_accessions.end(); ++it)
279  {
280  if (accessions.count(*it) > 0) return true;
281  }
282  return false;
283  }
284 
285  bool operator()(const ProteinHit& hit) const
286  {
287  return (accessions.count(hit.getAccession()) > 0);
288  }
289 
290  bool operator()(const PeptideEvidence& evidence) const
291  {
292  return (accessions.count(evidence.getProteinAccession()) > 0);
293  }
294  };
295 
301  template <class HitType, class Entry>
303  {
304  typedef HitType argument_type; // for use as a predicate
305  typedef std::map<String, Entry*> ItemMap;//Store pointers to avoid copying data
307 
308  GetMatchingItems(std::vector<Entry>& records)
309  {
310  for(typename std::vector<Entry>::iterator rec_it = records.begin();
311  rec_it != records.end(); ++rec_it)
312  {
313  items[getKey(*rec_it)] = &(*rec_it);
314  }
315  }
316 
318 
319  const String& getKey(const FASTAFile::FASTAEntry& entry) const
320  {
321  return entry.identifier;
322  }
323 
324  bool exists(const HitType& hit) const
325  {
326  return items.count(getHitKey(hit)) > 0;
327  }
328 
329  const String& getHitKey(const PeptideEvidence& p) const
330  {
331  return p.getProteinAccession();
332  }
333 
334  const Entry& getValue(const PeptideEvidence& evidence) const
335  {
336  if(!exists(evidence)){
337  throw Exception::InvalidParameter(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Accession: '"+ getHitKey(evidence) + "'. peptide evidence accession not in data");
338  }
339  return *(items.find(getHitKey(evidence))->second);
340  }
341 
342  };
343 
344 
346 
347 
353 
356  struct HasMinPeptideLength;
357 
359  struct HasMinCharge;
360 
362  struct HasLowMZError;
363 
369  struct HasMatchingModification;
370 
376  struct HasMatchingSequence;
377 
379  struct HasNoEvidence;
380 
381 
388  {
389  private:
393 
394  public:
397  digestion_(digestion), min_cleavages_(min), max_cleavages_(max)
398  {}
399 
400  static inline Int disabledValue(){ return -1; }
401 
405  {
406  return digestion_.filterByMissedCleavages(
408  [&](const Int missed_cleavages)
409  {
410 
411  bool max_filter = max_cleavages_ != disabledValue() ?
412  missed_cleavages > max_cleavages_ : false;
413  bool min_filter = min_cleavages_ != disabledValue() ?
414  missed_cleavages < min_cleavages_ : false;
415  return max_filter || min_filter;
416  });
417  }
418 
419  void filterPeptideSequences(std::vector<PeptideHit>& hits)
420  {
421  hits.erase(std::remove_if(hits.begin(), hits.end(), (*this)), hits.end());
422  }
423 
424  };
425 
426 
433  {
435 
436  // Build an accession index to avoid the linear search cost
441 
442  DigestionFilter(std::vector<FASTAFile::FASTAEntry>& entries,
443  ProteaseDigestion& digestion,
444  bool ignore_missed_cleavages,
445  bool methionine_cleavage) :
446  accession_resolver_(entries),
447  digestion_(digestion),
448  ignore_missed_cleavages_(ignore_missed_cleavages),
449  methionine_cleavage_(methionine_cleavage)
450  {}
451 
452  bool operator()(const PeptideEvidence& evidence) const
453  {
454  if(!evidence.hasValidLimits())
455  {
456  LOG_WARN << "Invalid limits! Peptide '" << evidence.getProteinAccession() << "' not filtered" << std::endl;
457  return true;
458  }
459 
460  if (accession_resolver_.exists(evidence))
461  {
462  return digestion_.isValidProduct(
463  AASequence::fromString(accession_resolver_.getValue(evidence).sequence),
464  evidence.getStart(), evidence.getEnd() - evidence.getStart(), ignore_missed_cleavages_, methionine_cleavage_);
465  }
466  else
467  {
468  if (evidence.getProteinAccession().empty())
469  {
470  LOG_WARN << "Peptide accession not available! Skipping Evidence." << std::endl;
471  }
472  else
473  {
474  LOG_WARN << "Peptide accession '" << evidence.getProteinAccession()
475  << "' not found in fasta file!" << std::endl;
476  }
477  return true;
478  }
479  }
480 
481  void filterPeptideEvidences(std::vector<PeptideIdentification>& peptides)
482  {
483  IDFilter::FilterPeptideEvidences<IDFilter::DigestionFilter>(*this,peptides);
484  }
485 
486  };
487 
488 
490 
491 
494 
/// Predicate: does this identification contain no hits at all?
///
/// @tparam IdentificationType Any type whose `getHits()` returns a container with `empty()`.
template <class IdentificationType>
struct HasNoHits
{
  typedef IdentificationType argument_type; // for use as a predicate

  /// @return true if the hit container of @p id is empty.
  bool operator()(const IdentificationType& id) const
  {
    const bool no_hits = id.getHits().empty();
    return no_hits;
  }
};
506 
508 
509 
512 
514  struct HasRTInRange;
515 
517  struct HasMZInRange;
518 
520 
521 
527 
/// Remove items that satisfy a condition from a container (e.g. vector).
///
/// @param items Container to modify in place.
/// @param pred Unary predicate; items for which it returns true are erased.
template <class Container, class Predicate>
static void removeMatchingItems(Container& items, const Predicate& pred)
{
  // move the survivors to the front, then cut off the leftover tail
  typename Container::iterator tail_begin =
    std::remove_if(items.begin(), items.end(), pred);
  items.erase(tail_begin, items.end());
}
536 
/// Keep only the items of a container (e.g. vector) that satisfy a condition.
///
/// The predicate is negated with a lambda rather than `std::not1`, which is
/// deprecated in C++17 and removed in C++20. As a side benefit the predicate
/// no longer needs an `argument_type` typedef, so plain lambdas work too.
///
/// @param items Container to modify in place.
/// @param pred Unary predicate; items for which it returns false are erased.
template <class Container, class Predicate>
static void keepMatchingItems(Container& items, const Predicate& pred)
{
  items.erase(std::remove_if(items.begin(), items.end(),
                             [&pred](const typename Container::value_type& item)
                             {
                               return !pred(item);
                             }),
              items.end());
}
544 
546 
547 
550 
552  template <class IdentificationType>
553  static Size countHits(const std::vector<IdentificationType>& ids)
554  {
555  Size counter = 0;
556  for (typename std::vector<IdentificationType>::const_iterator id_it =
557  ids.begin(); id_it != ids.end(); ++id_it)
558  {
559  counter += id_it->getHits().size();
560  }
561  return counter;
562  }
563 
576  template <class IdentificationType>
577  static bool getBestHit(
578  const std::vector<IdentificationType>& identifications,
579  bool assume_sorted, typename IdentificationType::HitType& best_hit)
580  {
581  if (identifications.empty()) return false;
582 
583  typename std::vector<IdentificationType>::const_iterator best_id_it =
584  identifications.end();
585  typename std::vector<typename IdentificationType::HitType>::const_iterator
586  best_hit_it;
587 
588  for (typename std::vector<IdentificationType>::const_iterator id_it =
589  identifications.begin(); id_it != identifications.end(); ++id_it)
590  {
591  if (id_it->getHits().empty()) continue;
592 
593  if (best_id_it == identifications.end()) // no previous "best" hit
594  {
595  best_id_it = id_it;
596  best_hit_it = id_it->getHits().begin();
597  }
598  else if (best_id_it->getScoreType() != id_it->getScoreType())
599  {
600  throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Can't compare scores of different types", best_id_it->getScoreType() + "/" + id_it->getScoreType());
601  }
602 
603  bool higher_better = best_id_it->isHigherScoreBetter();
604  for (typename std::vector<typename IdentificationType::HitType>::
605  const_iterator hit_it = id_it->getHits().begin(); hit_it !=
606  id_it->getHits().end(); ++hit_it)
607  {
608  if ((higher_better && (hit_it->getScore() >
609  best_hit_it->getScore())) ||
610  (!higher_better && (hit_it->getScore() <
611  best_hit_it->getScore())))
612  {
613  best_hit_it = hit_it;
614  }
615  if (assume_sorted) break; // only consider the first hit
616  }
617  }
618 
619  if (best_id_it == identifications.end())
620  {
621  return false; // no hits in any IDs
622  }
623 
624  best_hit = *best_hit_it;
625  return true;
626  }
627 
635  static void extractPeptideSequences(
636  const std::vector<PeptideIdentification>& peptides,
637  std::set<String>& sequences, bool ignore_mods = false);
638 
645  template<class EvidenceFilter>
647  EvidenceFilter& filter,
648  std::vector<PeptideIdentification>& peptides)
649  {
650  for(std::vector<PeptideIdentification>::iterator pep_it = peptides.begin();
651  pep_it != peptides.end(); ++pep_it)
652  {
653  for(std::vector<PeptideHit>::iterator hit_it = pep_it->getHits().begin();
654  hit_it != pep_it->getHits().end(); ++hit_it )
655  {
656  std::vector<PeptideEvidence> evidences;
657  remove_copy_if(hit_it->getPeptideEvidences().begin(),
658  hit_it->getPeptideEvidences().end(),
659  back_inserter(evidences),
660  std::not1(filter));
661  hit_it->setPeptideEvidences(evidences);
662  }
663  }
664  }
665 
666 
668 
669 
672 
/// Updates the hit ranks on all peptide or protein IDs.
///
/// @param ids Identifications; `assignRanks()` is called once on each element.
template <class IdentificationType>
static void updateHitRanks(std::vector<IdentificationType>& ids)
{
  for (IdentificationType& id : ids)
  {
    id.assignRanks();
  }
}
683 
685  static void removeUnreferencedProteins(
686  std::vector<ProteinIdentification>& proteins,
687  const std::vector<PeptideIdentification>& peptides);
688 
696  static void updateProteinReferences(
697  std::vector<PeptideIdentification>& peptides,
698  const std::vector<ProteinIdentification>& proteins,
699  bool remove_peptides_without_reference = false);
700 
709  static bool updateProteinGroups(
710  std::vector<ProteinIdentification::ProteinGroup>& groups,
711  const std::vector<ProteinHit>& hits);
712 
714 
715 
718 
720  template <class IdentificationType>
721  static void removeEmptyIdentifications(std::vector<IdentificationType>& ids)
722  {
723  struct HasNoHits<IdentificationType> empty_filter;
724  removeMatchingItems(ids, empty_filter);
725  }
726 
732  template <class IdentificationType>
733  static void filterHitsByScore(std::vector<IdentificationType>& ids,
734  double threshold_score)
735  {
736  for (typename std::vector<IdentificationType>::iterator id_it =
737  ids.begin(); id_it != ids.end(); ++id_it)
738  {
739  struct HasGoodScore<typename IdentificationType::HitType> score_filter(
740  threshold_score, id_it->isHigherScoreBetter());
741  keepMatchingItems(id_it->getHits(), score_filter);
742  }
743  }
744 
750  template <class IdentificationType>
751  static void filterHitsByScore(IdentificationType& id,
752  double threshold_score)
753  {
754  struct HasGoodScore<typename IdentificationType::HitType> score_filter(
755  threshold_score, id->isHigherScoreBetter());
756  keepMatchingItems(id->getHits(), score_filter);
757  }
758 
764  template <class IdentificationType>
765  static void filterHitsBySignificance(std::vector<IdentificationType>& ids,
766  double threshold_fraction = 1.0)
767  {
768  for (typename std::vector<IdentificationType>::iterator id_it =
769  ids.begin(); id_it != ids.end(); ++id_it)
770  {
771  double threshold_score = (threshold_fraction *
772  id_it->getSignificanceThreshold());
773  struct HasGoodScore<typename IdentificationType::HitType> score_filter(
774  threshold_score, id_it->isHigherScoreBetter());
775  keepMatchingItems(id_it->getHits(), score_filter);
776  }
777  }
778 
784  template <class IdentificationType>
785  static void keepNBestHits(std::vector<IdentificationType>& ids, Size n)
786  {
787  for (typename std::vector<IdentificationType>::iterator id_it =
788  ids.begin(); id_it != ids.end(); ++id_it)
789  {
790  id_it->sort();
791  if (n < id_it->getHits().size()) id_it->getHits().resize(n);
792  }
793  }
794 
809  template <class IdentificationType>
810  static void filterHitsByRank(std::vector<IdentificationType>& ids,
811  Size min_rank, Size max_rank)
812  {
813  updateHitRanks(ids);
814  if (min_rank > 1)
815  {
816  struct HasMaxRank<typename IdentificationType::HitType>
817  rank_filter(min_rank - 1);
818  for (typename std::vector<IdentificationType>::iterator id_it =
819  ids.begin(); id_it != ids.end(); ++id_it)
820  {
821  removeMatchingItems(id_it->getHits(), rank_filter);
822  }
823  }
824  if (max_rank >= min_rank)
825  {
826  struct HasMaxRank<typename IdentificationType::HitType>
827  rank_filter(max_rank);
828  for (typename std::vector<IdentificationType>::iterator id_it =
829  ids.begin(); id_it != ids.end(); ++id_it)
830  {
831  keepMatchingItems(id_it->getHits(), rank_filter);
832  }
833  }
834  }
835 
843  template <class IdentificationType>
844  static void removeDecoyHits(std::vector<IdentificationType>& ids)
845  {
846  struct HasDecoyAnnotation<typename IdentificationType::HitType>
847  decoy_filter;
848  for (typename std::vector<IdentificationType>::iterator id_it =
849  ids.begin(); id_it != ids.end(); ++id_it)
850  {
851  removeMatchingItems(id_it->getHits(), decoy_filter);
852  }
853  }
854 
862  template <class IdentificationType>
863  static void removeHitsMatchingProteins(std::vector<IdentificationType>& ids,
864  const std::set<String> accessions)
865  {
866  struct HasMatchingAccession<typename IdentificationType::HitType>
867  acc_filter(accessions);
868  for (typename std::vector<IdentificationType>::iterator id_it =
869  ids.begin(); id_it != ids.end(); ++id_it)
870  {
871  removeMatchingItems(id_it->getHits(), acc_filter);
872  }
873  }
874 
882  template <class IdentificationType>
883  static void keepHitsMatchingProteins(std::vector<IdentificationType>& ids,
884  const std::set<String> accessions)
885  {
886  struct HasMatchingAccession<typename IdentificationType::HitType>
887  acc_filter(accessions);
888  for (typename std::vector<IdentificationType>::iterator id_it =
889  ids.begin(); id_it != ids.end(); ++id_it)
890  {
891  keepMatchingItems(id_it->getHits(), acc_filter);
892  }
893  }
894 
895 
896 
898 
899 
902 
909  static void keepBestPeptideHits(
910  std::vector<PeptideIdentification>& peptides, bool strict = false);
911 
920  static void filterPeptidesByLength(
921  std::vector<PeptideIdentification>& peptides, Size min_length,
922  Size max_length = UINT_MAX);
923 
932  static void filterPeptidesByCharge(
933  std::vector<PeptideIdentification>& peptides, Int min_charge,
934  Int max_charge);
935 
937  static void filterPeptidesByRT(std::vector<PeptideIdentification>& peptides,
938  double min_rt, double max_rt);
939 
941  static void filterPeptidesByMZ(std::vector<PeptideIdentification>& peptides,
942  double min_mz, double max_mz);
943 
955  static void filterPeptidesByMZError(
956  std::vector<PeptideIdentification>& peptides, double mass_error,
957  bool unit_ppm);
958 
959 
966  template <class Filter>
967  static void filterPeptideEvidences(
968  Filter& filter,
969  std::vector<PeptideIdentification>& peptides);
970 
982  static void filterPeptidesByRTPredictPValue(
983  std::vector<PeptideIdentification>& peptides,
984  const String& metavalue_key, double threshold = 0.05);
985 
987  static void removePeptidesWithMatchingModifications(
988  std::vector<PeptideIdentification>& peptides,
989  const std::set<String>& modifications);
990 
992  static void keepPeptidesWithMatchingModifications(
993  std::vector<PeptideIdentification>& peptides,
994  const std::set<String>& modifications);
995 
1003  static void removePeptidesWithMatchingSequences(
1004  std::vector<PeptideIdentification>& peptides,
1005  const std::vector<PeptideIdentification>& bad_peptides,
1006  bool ignore_mods = false);
1007 
1015  static void keepPeptidesWithMatchingSequences(
1016  std::vector<PeptideIdentification>& peptides,
1017  const std::vector<PeptideIdentification>& good_peptides,
1018  bool ignore_mods = false);
1019 
1021  static void keepUniquePeptidesPerProtein(std::vector<PeptideIdentification>&
1022  peptides);
1023 
1029  static void removeDuplicatePeptideHits(std::vector<PeptideIdentification>&
1030  peptides, bool seq_only = false);
1031 
1033 
1034 
1037 
1039  static void filterHitsByScore(PeakMap& experiment,
1040  double peptide_threshold_score,
1041  double protein_threshold_score)
1042  {
1043  // filter protein hits:
1044  filterHitsByScore(experiment.getProteinIdentifications(),
1045  protein_threshold_score);
1046  // don't remove empty protein IDs - they contain search meta data and may
1047  // be referenced by peptide IDs (via run ID)
1048 
1049  // filter peptide hits:
1050  for (PeakMap::Iterator exp_it = experiment.begin();
1051  exp_it != experiment.end(); ++exp_it)
1052  {
1053  filterHitsByScore(exp_it->getPeptideIdentifications(),
1054  peptide_threshold_score);
1055  removeEmptyIdentifications(exp_it->getPeptideIdentifications());
1056  updateProteinReferences(exp_it->getPeptideIdentifications(),
1057  experiment.getProteinIdentifications());
1058  }
1059  // @TODO: remove proteins that aren't referenced by peptides any more?
1060  }
1061 
1063  static void filterHitsBySignificance(PeakMap& experiment,
1064  double peptide_threshold_fraction,
1065  double protein_threshold_fraction)
1066  {
1067  // filter protein hits:
1068  filterHitsBySignificance(experiment.getProteinIdentifications(),
1069  protein_threshold_fraction);
1070  // don't remove empty protein IDs - they contain search meta data and may
1071  // be referenced by peptide IDs (via run ID)
1072 
1073  // filter peptide hits:
1074  for (PeakMap::Iterator exp_it = experiment.begin();
1075  exp_it != experiment.end(); ++exp_it)
1076  {
1077  filterHitsBySignificance(exp_it->getPeptideIdentifications(),
1078  peptide_threshold_fraction);
1079  removeEmptyIdentifications(exp_it->getPeptideIdentifications());
1080  updateProteinReferences(exp_it->getPeptideIdentifications(),
1081  experiment.getProteinIdentifications());
1082  }
1083  // @TODO: remove proteins that aren't referenced by peptides any more?
1084  }
1085 
1087  static void keepNBestHits(PeakMap& experiment, Size n)
1088  {
1089  // don't filter the protein hits by "N best" here - filter the peptides
1090  // and update the protein hits!
1091  std::vector<PeptideIdentification> all_peptides; // IDs from all spectra
1092 
1093  // filter peptide hits:
1094  for (PeakMap::Iterator exp_it = experiment.begin();
1095  exp_it != experiment.end(); ++exp_it)
1096  {
1097  std::vector<PeptideIdentification>& peptides =
1098  exp_it->getPeptideIdentifications();
1099  keepNBestHits(peptides, n);
1100  removeEmptyIdentifications(peptides);
1101  updateProteinReferences(peptides,
1102  experiment.getProteinIdentifications());
1103  all_peptides.insert(all_peptides.end(), peptides.begin(),
1104  peptides.end());
1105  }
1106  // update protein hits:
1107  removeUnreferencedProteins(experiment.getProteinIdentifications(),
1108  all_peptides);
1109  }
1110 
1112  static void filterEmptyPeptideIDs(std::vector<PeptideIdentification>& pep_ids)
1113  {
1114  pep_ids.erase(std::remove_if(pep_ids.begin(), pep_ids.end(),
1115  [](PeptideIdentification& p){return p.getHits().empty();}),pep_ids.end());
1116 
1117  }
1118 
1120  static void filterBestPerPeptide(std::vector<PeptideIdentification>& pep_ids, bool ignore_mods, bool ignore_charges, Size nr_best_spectrum)
1121  {
1122 
1123  annotateBestPerPeptide(pep_ids, ignore_mods, ignore_charges, nr_best_spectrum);
1124 
1125  for (auto &pep : pep_ids)
1126  {
1127  auto& hits = pep.getHits();
1128  hits.erase(std::remove_if(hits.begin(), hits.end(),
1129  [](const PeptideHit& p){return !p.metaValueExists("bestForItsPep") || !p.getMetaValue("bestForItsPep").toBool();}), hits.end());
1130  }
1131 
1132  }
1133 
1136  static void annotateBestPerPeptide(std::vector<PeptideIdentification>& pep_ids, bool ignore_mods, bool ignore_charges, Size nr_best_spectrum)
1137  {
1138  std::unordered_map<std::string, std::map<Int, PeptideHit*>> best_pep;
1139  for (auto &pep : pep_ids)
1140  {
1141  //skip if no hits (which almost could be considered and error or warning.
1142  if (pep.getHits().empty())
1143  continue;
1144 
1145  bool higher_score_better = pep.isHigherScoreBetter();
1146  //make sure that first = best hit
1147  pep.sort();
1148 
1149  auto pepIt = pep.getHits().begin();
1150  auto pepItEnd = nr_best_spectrum == 0 || pep.getHits().size() <= nr_best_spectrum ? pep.getHits().end() : pep.getHits().begin() + nr_best_spectrum;
1151  for (; pepIt != pepItEnd; ++pepIt)
1152  {
1153  PeptideHit &hit = *pepIt;
1154 
1155  String lookup_seq;
1156  if (ignore_mods)
1157  {
1158  lookup_seq = hit.getSequence().toUnmodifiedString();
1159  }
1160  else
1161  {
1162  lookup_seq = hit.getSequence().toString();
1163  }
1164 
1165  int lookup_charge = 0;
1166  if (!ignore_charges)
1167  {
1168  lookup_charge = hit.getCharge();
1169  }
1170 
1171  auto it_inserted = best_pep.emplace(std::move(lookup_seq), std::map<Int, PeptideHit*>());
1172  auto it_inserted_chg = it_inserted.first->second.emplace(lookup_charge, &hit);
1173  PeptideHit* &p = it_inserted_chg.first->second; //either the old one if already present, or this
1174  if (!it_inserted_chg.second) //was already present -> possibly update
1175  {
1176  if (
1177  (higher_score_better && (hit.getScore() > p->getScore())) ||
1178  (!higher_score_better && (hit.getScore() < p->getScore()))
1179  )
1180  {
1181  p->setMetaValue("bestForItsPep", "false");
1182  hit.setMetaValue("bestForItsPep", "true");
1183  p = &hit;
1184  }
1185  else //note that this was def. not the best
1186  {
1187  // TODO if it is only about filtering, we can omit writing this metavalue (absence = false)
1188  hit.setMetaValue("bestForItsPep", "false");
1189  }
1190  }
1191  else //first for that sequence (and optionally charge)
1192  {
1193  hit.setMetaValue("bestForItsPep", "true");
1194  }
1195  }
1196  }
1197  }
1198 
1201  PeakMap& experiment,
1202  const std::vector<FASTAFile::FASTAEntry>& proteins)
1203  {
1204  std::set<String> accessions;
1205  for (std::vector<FASTAFile::FASTAEntry>::const_iterator it =
1206  proteins.begin(); it != proteins.end(); ++it)
1207  {
1208  accessions.insert(it->identifier);
1209  }
1210 
1211  // filter protein hits:
1212  keepHitsMatchingProteins(experiment.getProteinIdentifications(),
1213  accessions);
1214  updateHitRanks(experiment.getProteinIdentifications());
1215 
1216  // filter peptide hits:
1217  for (PeakMap::Iterator exp_it = experiment.begin();
1218  exp_it != experiment.end(); ++exp_it)
1219  {
1220  if (exp_it->getMSLevel() == 2)
1221  {
1222  keepHitsMatchingProteins(exp_it->getPeptideIdentifications(),
1223  accessions);
1224  removeEmptyIdentifications(exp_it->getPeptideIdentifications());
1225  updateHitRanks(exp_it->getPeptideIdentifications());
1226  }
1227  }
1228  }
1229 
1231 
1232 
1233  };
1234 
1235 } // namespace OpenMS
1236 
static bool getBestHit(const std::vector< IdentificationType > &identifications, bool assume_sorted, typename IdentificationType::HitType &best_hit)
Finds the best-scoring hit in a vector of peptide or protein identifications.
Definition: IDFilter.h:577
const std::set< String > & accessions
Definition: IDFilter.h:268
HitType argument_type
Definition: IDFilter.h:228
HitType argument_type
Definition: IDFilter.h:203
static Int disabledValue()
Definition: IDFilter.h:400
static void removeDecoyHits(std::vector< IdentificationType > &ids)
Removes hits annotated as decoys from peptide or protein identifications.
Definition: IDFilter.h:844
static void filterBestPerPeptide(std::vector< PeptideIdentification > &pep_ids, bool ignore_mods, bool ignore_charges, Size nr_best_spectrum)
Filters PeptideHits from PeptideIdentification by keeping only the best peptide hits for every peptide...
Definition: IDFilter.h:1120
Int getStart() const
get the position in the protein (starting at 0 for the N-terminus). If not available UNKNOWN_POSITION...
bool operator()(const HitType &hit) const
Definition: IDFilter.h:168
Int getCharge() const
returns the charge of the peptide
bool operator()(PeptideHit &p)
Definition: IDFilter.h:404
double getScore() const
returns the PSM score
Int getEnd() const
get the position of the last AA of the peptide in protein coordinates (starting at 0 for the N-termin...
#define LOG_WARN
Macro to use if a warning (a piece of information which should be read by the user) should be logged...
Definition: LogStream.h:452
String identifier
Definition: FASTAFile.h:78
void filterPeptideEvidences(std::vector< PeptideIdentification > &peptides)
Definition: IDFilter.h:481
HasMaxRank(Size rank_)
Definition: IDFilter.h:130
bool methionine_cleavage_
Definition: IDFilter.h:440
HitType argument_type
Definition: IDFilter.h:158
static void removeEmptyIdentifications(std::vector< IdentificationType > &ids)
Removes peptide or protein identifications that have no hits in them.
Definition: IDFilter.h:721
static void removeHitsMatchingProteins(std::vector< IdentificationType > &ids, const std::set< String > accessions)
Filters peptide or protein identifications according to the given proteins (negative).
Definition: IDFilter.h:863
bool isEmpty() const
Test if the value is empty.
Definition: DataValue.h:372
PeptideHit argument_type
Definition: IDFilter.h:395
DataValue value
Definition: IDFilter.h:161
static void filterEmptyPeptideIDs(std::vector< PeptideIdentification > &pep_ids)
Filters PeptideIdentifications with no hits.
Definition: IDFilter.h:1112
Int max_cleavages_
Definition: IDFilter.h:392
bool operator()(const ProteinHit &hit) const
Definition: IDFilter.h:285
const String & getKey(const FASTAFile::FASTAEntry &entry) const
Definition: IDFilter.h:319
static AASequence fromString(const String &s, bool permissive=true)
create AASequence object by parsing an OpenMS string
const std::vector< ProteinIdentification > & getProteinIdentifications() const
returns a const reference to the protein ProteinIdentification vector
Is the score of this hit at least as good as the given value?
Definition: IDFilter.h:96
Representation of a peptide hit.
Definition: PeptideHit.h:54
bool operator()(const PeptideEvidence &evidence) const
Definition: IDFilter.h:452
HitType argument_type
Definition: IDFilter.h:304
bool operator()(const HitType &hit) const
Definition: IDFilter.h:191
EnzymaticDigestion & digestion_
Definition: IDFilter.h:390
String key
Definition: IDFilter.h:160
A more convenient string class.
Definition: String.h:58
bool operator()(const IdentificationType &id) const
Definition: IDFilter.h:501
ItemMap items
Definition: IDFilter.h:306
Given a list of protein accessions, do any occur in the annotation(s) of this hit?
Definition: IDFilter.h:226
Exception indicating that an invalid parameter was handed over to an algorithm.
Definition: Exception.h:347
bool operator()(const PeptideHit &hit) const
Definition: IDFilter.h:236
bool operator()(const HitType &hit) const
Definition: IDFilter.h:108
PeptideEvidence argument_type
Definition: IDFilter.h:434
const String & getAccession() const
returns the accession of the protein
String key
Definition: IDFilter.h:183
static void filterHitsByRank(std::vector< IdentificationType > &ids, Size min_rank, Size max_rank)
Filters peptide or protein identifications according to the ranking of the hits.
Definition: IDFilter.h:810
static void filterHitsByScore(PeakMap &experiment, double peptide_threshold_score, double protein_threshold_score)
Filters an MS/MS experiment according to score thresholds.
Definition: IDFilter.h:1039
double value
Definition: IDFilter.h:184
static void updateHitRanks(std::vector< IdentificationType > &ids)
Updates the hit ranks on all peptide or protein IDs.
Definition: IDFilter.h:675
Class for the enzymatic digestion of sequences.
Definition: EnzymaticDigestion.h:62
Representation of a protein hit.
Definition: ProteinHit.h:57
static void filterHitsByScore(std::vector< IdentificationType > &ids, double threshold_score)
Filters peptide or protein identifications according to the score of the hits.
Definition: IDFilter.h:733
static void removeMatchingItems(Container &items, const Predicate &pred)
Remove items that satisfy a condition from a container (e.g. vector)
Definition: IDFilter.h:531
bool hasValidLimits() const
start and end numbers in evidence represent actual numeric indices
bool exists(const HitType &hit) const
Definition: IDFilter.h:324
bool operator()(const HitType &hit) const
Definition: IDFilter.h:139
Represents the peptide hits for a spectrum.
Definition: PeptideIdentification.h:62
bool filterByMissedCleavages(const String &sequence, std::function< bool(const Int)> filter) const
Filter based on the number of missed cleavages.
PeptideDigestionFilter(EnzymaticDigestion &digestion, Int min, Int max)
Definition: IDFilter.h:396
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
Not all required information provided.
Definition: Exception.h:195
static void FilterPeptideEvidences(EvidenceFilter &filter, std::vector< PeptideIdentification > &peptides)
remove peptide evidences based on a filter
Definition: IDFilter.h:646
Is a meta value with given key and value set on this hit?
Definition: IDFilter.h:156
static void filterHitsBySignificance(PeakMap &experiment, double peptide_threshold_fraction, double protein_threshold_fraction)
Filters an MS/MS experiment according to fractions of the significance thresholds.
Definition: IDFilter.h:1063
Class to hold strings, numeric values, lists of strings and lists of numeric values.
Definition: DataValue.h:56
const String & getHitKey(const PeptideEvidence &p) const
Definition: IDFilter.h:329
Collection of functions for filtering peptide and protein identifications.
Definition: IDFilter.h:76
bool isValidProduct(const String &protein, int pep_pos, int pep_length, bool ignore_missed_cleavages=true, bool allow_nterm_protein_cleavage=false, bool allow_random_asp_pro_cleavage=false) const
Variant of EnzymaticDigestion::isValidProduct() with support for N-terminal protein (methionine) cleavage and random D|P (Asp-Pro) cleavage.
DigestionFilter(std::vector< FASTAFile::FASTAEntry > &entries, ProteaseDigestion &digestion, bool ignore_missed_cleavages, bool methionine_cleavage)
Definition: IDFilter.h:442
void setMetaValue(const String &name, const DataValue &value)
Sets the DataValue corresponding to a name.
bool ignore_missed_cleavages_
Definition: IDFilter.h:439
A method or algorithm argument contains illegal values.
Definition: Exception.h:648
static void annotateBestPerPeptide(std::vector< PeptideIdentification > &pep_ids, bool ignore_mods, bool ignore_charges, Size nr_best_spectrum)
Definition: IDFilter.h:1136
Builds a map index of data that have a String index to find matches and return the objects...
Definition: IDFilter.h:302
bool operator()(const PeptideEvidence &evidence) const
Definition: IDFilter.h:252
Is this a decoy hit?
Definition: IDFilter.h:201
ProteaseDigestion & digestion_
Definition: IDFilter.h:438
double score
Definition: IDFilter.h:100
Class for the enzymatic digestion of proteins.
Definition: ProteaseDigestion.h:60
HitType argument_type
Definition: IDFilter.h:181
String toUnmodifiedString() const
returns the peptide as string without any modifications
static void keepMatchingItems(Container &items, const Predicate &pred)
Keep items that satisfy a condition in a container (e.g. vector), removing all others.
Definition: IDFilter.h:539
Is the rank of this hit below or at the given cut-off?
Definition: IDFilter.h:124
static void keepHitsMatchingProteins(std::vector< IdentificationType > &ids, const std::set< String > accessions)
Filters peptide or protein identifications according to the given proteins (positive).
Definition: IDFilter.h:883
HasMetaValue(const String &key_, const DataValue &value_)
Definition: IDFilter.h:163
HasDecoyAnnotation()
Definition: IDFilter.h:207
const std::unordered_set< String > & accessions
Definition: IDFilter.h:230
Iterator end()
Definition: MSExperiment.h:167
Is the peptide evidence a digestion product of some protein?
Definition: IDFilter.h:432
Given a list of protein accessions, do any occur in the annotation(s) of this hit?
Definition: IDFilter.h:264
HasMatchingAccession(const std::set< String > &accessions_)
Definition: IDFilter.h:270
Int min_cleavages_
Definition: IDFilter.h:391
HasMaxMetaValue(const String &key_, const double &value_)
Definition: IDFilter.h:186
int Int
Signed integer type.
Definition: Types.h:102
Does a meta value of this hit have at most the given value?
Definition: IDFilter.h:179
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
GetMatchingItems()
Definition: IDFilter.h:317
static void keepHitsMatchingProteins(PeakMap &experiment, const std::vector< FASTAFile::FASTAEntry > &proteins)
Filters an MS/MS experiment according to the given proteins.
Definition: IDFilter.h:1200
HasMatchingAccessionUnordered(const std::unordered_set< String > &accessions_)
Definition: IDFilter.h:232
HitType argument_type
Definition: IDFilter.h:126
Size rank
Definition: IDFilter.h:128
Representation of a peptide evidence.
Definition: PeptideEvidence.h:50
static void keepNBestHits(PeakMap &experiment, Size n)
Filters an MS/MS experiment by keeping the N best peptide hits for every spectrum.
Definition: IDFilter.h:1087
bool operator()(const ProteinHit &hit) const
Definition: IDFilter.h:247
Filter Peptide Hit by its digestion product.
Definition: IDFilter.h:387
In-Memory representation of a mass spectrometry experiment.
Definition: MSExperiment.h:77
GetMatchingItems(std::vector< Entry > &records)
Definition: IDFilter.h:308
const String & getProteinAccession() const
Gets the protein accession the peptide matches to. If not available, the empty string is returned.
bool higher_score_better
Definition: IDFilter.h:101
std::map< String, Entry * > ItemMap
Definition: IDFilter.h:305
HitType argument_type
Definition: IDFilter.h:98
std::vector< SpectrumType >::iterator Iterator
Mutable iterator.
Definition: MSExperiment.h:111
static Size countHits(const std::vector< IdentificationType > &ids)
Returns the total number of peptide/protein hits in a vector of peptide/protein identifications.
Definition: IDFilter.h:553
HitType argument_type
Definition: IDFilter.h:266
bool operator()(const HitType &hit) const
Definition: IDFilter.h:211
GetMatchingItems< PeptideEvidence, FASTAFile::FASTAEntry > accession_resolver_
Definition: IDFilter.h:437
String toString() const
returns the peptide as string with modifications embedded in brackets
Invalid value exception.
Definition: Exception.h:335
Is the list of hits of this peptide/protein ID empty?
Definition: IDFilter.h:497
size_t Size
Size type, e.g. used for variables that hold the result of size().
Definition: Types.h:127
Iterator begin()
Definition: MSExperiment.h:157
void filterPeptideSequences(std::vector< PeptideHit > &hits)
Definition: IDFilter.h:419
const AASequence & getSequence() const
returns the peptide sequence without leading or trailing spaces
std::set< String > extractProteinAccessionsSet() const
extracts the set of non-empty protein accessions from peptide evidences
const Entry & getValue(const PeptideEvidence &evidence) const
Definition: IDFilter.h:334
IdentificationType argument_type
Definition: IDFilter.h:499
bool operator()(const PeptideEvidence &evidence) const
Definition: IDFilter.h:290
HasGoodScore(double score_, bool higher_score_better_)
Definition: IDFilter.h:103
bool operator()(const PeptideHit &hit) const
Definition: IDFilter.h:274