OpenMS  2.7.0
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2021.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Timo Sachsenberg $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer, Chris Bielow $
33 // --------------------------------------------------------------------------
34 #pragma once
35 
36 #include <vector>
38 #include <OpenMS/CONCEPT/Types.h>
39 
40 // array_wrapper needs to be included before it is used
41 // only in boost1.64+. See issue #2790
42 #if OPENMS_BOOST_VERSION_MINOR >= 64
43 #include <boost/serialization/array_wrapper.hpp>
44 #endif
45 #include <boost/accumulators/accumulators.hpp>
46 #include <boost/accumulators/statistics/covariance.hpp>
47 #include <boost/accumulators/statistics/mean.hpp>
48 #include <boost/accumulators/statistics/stats.hpp>
49 #include <boost/accumulators/statistics/variance.hpp>
50 #include <boost/accumulators/statistics/variates/covariate.hpp>
51 #include <boost/function/function_base.hpp>
52 #include <boost/lambda/casts.hpp>
53 #include <boost/lambda/lambda.hpp>
54 
55 #include <iterator>
56 #include <algorithm>
57 
58 namespace OpenMS
59 {
60 
61  namespace Math
62  {
63 
71  template <typename IteratorType>
72  static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
73  {
74  if (begin == end)
75  {
76  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
77  }
78  }
79 
87  template <typename IteratorType>
88  static void checkIteratorsEqual(IteratorType begin, IteratorType end)
89  {
90  if (begin != end)
91  {
92  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
93  }
94  }
95 
103  template <typename IteratorType1, typename IteratorType2>
105  IteratorType1 begin_b, IteratorType1 end_b,
106  IteratorType2 begin_a, IteratorType2 end_a)
107  {
108  if (begin_b != end_b && begin_a == end_a)
109  {
110  throw Exception::InvalidRange(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
111  }
112  }
113 
119  template <typename IteratorType>
120  static double sum(IteratorType begin, IteratorType end)
121  {
122  return std::accumulate(begin, end, 0.0);
123  }
124 
132  template <typename IteratorType>
133  static double mean(IteratorType begin, IteratorType end)
134  {
135  checkIteratorsNotNULL(begin, end);
136  return sum(begin, end) / std::distance(begin, end);
137  }
138 
150  template <typename IteratorType>
151  static double median(IteratorType begin, IteratorType end,
152  bool sorted = false)
153  {
154  checkIteratorsNotNULL(begin, end);
155  if (!sorted)
156  {
157  std::sort(begin, end);
158  }
159 
160  Size size = std::distance(begin, end);
161  if (size % 2 == 0) // even size => average two middle values
162  {
163  IteratorType it1 = begin;
164  std::advance(it1, size / 2 - 1);
165  IteratorType it2 = it1;
166  std::advance(it2, 1);
167  return (*it1 + *it2) / 2.0;
168  }
169  else
170  {
171  IteratorType it = begin;
172  std::advance(it, (size - 1) / 2);
173  return *it;
174  }
175  }
176 
177 
197  template <typename IteratorType>
198  double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
199  {
200  std::vector<double> diffs;
201  diffs.reserve(std::distance(begin, end));
202  for (IteratorType it = begin; it != end; ++it)
203  {
204  diffs.push_back(fabs(*it - median_of_numbers));
205  }
206  return median(diffs.begin(), diffs.end(), false);
207  }
208 
227  template <typename IteratorType>
228  double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
229  {
230  double mean {0};
231  for (IteratorType it = begin; it != end; ++it)
232  {
233  mean += fabs(*it - mean_of_numbers);
234  }
235  return mean / std::distance(begin, end);
236  }
237 
251  template <typename IteratorType>
252  static double quantile1st(IteratorType begin, IteratorType end,
253  bool sorted = false)
254  {
255  checkIteratorsNotNULL(begin, end);
256 
257  if (!sorted)
258  {
259  std::sort(begin, end);
260  }
261 
262  Size size = std::distance(begin, end);
263  if (size % 2 == 0)
264  {
265  return median(begin, begin + (size/2)-1, true); //-1 to exclude median values
266  }
267  return median(begin, begin + (size/2), true);
268  }
269 
283  template <typename IteratorType>
284  static double quantile3rd(
285  IteratorType begin, IteratorType end, bool sorted = false)
286  {
287  checkIteratorsNotNULL(begin, end);
288  if (!sorted)
289  {
290  std::sort(begin, end);
291  }
292 
293  Size size = std::distance(begin, end);
294  return median(begin + (size/2)+1, end, true); //+1 to exclude median values
295  }
296 
306  template <typename IteratorType>
307  static double variance(IteratorType begin, IteratorType end,
308  double mean = std::numeric_limits<double>::max())
309  {
310  checkIteratorsNotNULL(begin, end);
311  double sum = 0.0;
312  if (mean == std::numeric_limits<double>::max())
313  {
314  mean = Math::mean(begin, end);
315  }
316  for (IteratorType iter=begin; iter!=end; ++iter)
317  {
318  double diff = *iter - mean;
319  sum += diff * diff;
320  }
321  return sum / (std::distance(begin, end)-1);
322  }
323 
333  template <typename IteratorType>
334  static double sd(IteratorType begin, IteratorType end,
335  double mean = std::numeric_limits<double>::max())
336  {
337  checkIteratorsNotNULL(begin, end);
338  return std::sqrt( variance(begin, end, mean) );
339  }
340 
348  template <typename IteratorType>
349  static double absdev(IteratorType begin, IteratorType end,
350  double mean = std::numeric_limits<double>::max())
351  {
352  checkIteratorsNotNULL(begin, end);
353  double sum = 0.0;
354  if (mean == std::numeric_limits<double>::max())
355  {
356  mean = Math::mean(begin, end);
357  }
358  for (IteratorType iter=begin; iter!=end; ++iter)
359  {
360  sum += *iter - mean;
361  }
362  return sum / std::distance(begin, end);
363  }
364 
374  template <typename IteratorType1, typename IteratorType2>
375  static double covariance(IteratorType1 begin_a, IteratorType1 end_a,
376  IteratorType2 begin_b, IteratorType2 end_b)
377  {
378  //no data or different lengths
379  checkIteratorsNotNULL(begin_a, end_a);
380 
381  double sum = 0.0;
382  double mean_a = Math::mean(begin_a, end_a);
383  double mean_b = Math::mean(begin_b, end_b);
384  IteratorType1 iter_a = begin_a;
385  IteratorType2 iter_b = begin_b;
386  for (; iter_a != end_a; ++iter_a, ++iter_b)
387  {
388  /* assure both ranges have the same number of elements */
389  checkIteratorsAreValid(begin_b, end_b, begin_a, end_a);
390  sum += (*iter_a - mean_a) * (*iter_b - mean_b);
391  }
392  /* assure both ranges have the same number of elements */
393  checkIteratorsEqual(iter_b, end_b);
394  Size n = std::distance(begin_a, end_a);
395  return sum / (n-1);
396  }
397 
407  template <typename IteratorType1, typename IteratorType2>
408  static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a,
409  IteratorType2 begin_b, IteratorType2 end_b)
410  {
411  //no data or different lengths
412  checkIteratorsNotNULL(begin_a, end_a);
413 
414  SignedSize dist = std::distance(begin_a, end_a);
415  double error = 0;
416  IteratorType1 iter_a = begin_a;
417  IteratorType2 iter_b = begin_b;
418  for (; iter_a != end_a; ++iter_a, ++iter_b)
419  {
420  /* assure both ranges have the same number of elements */
421  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
422 
423  double tmp(*iter_a - *iter_b);
424  error += tmp * tmp;
425  }
426  /* assure both ranges have the same number of elements */
427  checkIteratorsEqual(iter_b, end_b);
428 
429  return error / dist;
430  }
431 
441  template <typename IteratorType1, typename IteratorType2>
442  static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a,
443  IteratorType2 begin_b, IteratorType2 end_b)
444  {
445  //no data or different lengths
446  checkIteratorsNotNULL(begin_a, end_a);
447 
448  SignedSize dist = std::distance(begin_a, end_a);
449  SignedSize correct = dist;
450  IteratorType1 iter_a = begin_a;
451  IteratorType2 iter_b = begin_b;
452  for (; iter_a != end_a; ++iter_a, ++iter_b)
453  {
454  /* assure both ranges have the same number of elements */
455  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
456  if ((*iter_a < 0 && *iter_b >= 0) || (*iter_a >= 0 && *iter_b < 0))
457  {
458  --correct;
459  }
460 
461  }
462  /* assure both ranges have the same number of elements */
463  checkIteratorsEqual(iter_b, end_b);
464 
465  return double(correct) / dist;
466  }
467 
480  template <typename IteratorType1, typename IteratorType2>
482  IteratorType1 begin_a, IteratorType1 end_a,
483  IteratorType2 begin_b, IteratorType2 end_b)
484  {
485  //no data or different lengths
486  checkIteratorsNotNULL(begin_a, end_b);
487 
488  double tp = 0;
489  double fp = 0;
490  double tn = 0;
491  double fn = 0;
492  IteratorType1 iter_a = begin_a;
493  IteratorType2 iter_b = begin_b;
494  for (; iter_a != end_a; ++iter_a, ++iter_b)
495  {
496  /* assure both ranges have the same number of elements */
497  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
498 
499  if (*iter_a < 0 && *iter_b >= 0)
500  {
501  ++fn;
502  }
503  else if (*iter_a < 0 && *iter_b < 0)
504  {
505  ++tn;
506  }
507  else if (*iter_a >= 0 && *iter_b >= 0)
508  {
509  ++tp;
510  }
511  else if (*iter_a >= 0 && *iter_b < 0)
512  {
513  ++fp;
514  }
515  }
516  /* assure both ranges have the same number of elements */
517  checkIteratorsEqual(iter_b, end_b);
518 
519  return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
520  }
521 
533  template <typename IteratorType1, typename IteratorType2>
535  IteratorType1 begin_a, IteratorType1 end_a,
536  IteratorType2 begin_b, IteratorType2 end_b)
537  {
538  //no data or different lengths
539  checkIteratorsNotNULL(begin_a, end_a);
540 
541  //calculate average
542  SignedSize dist = std::distance(begin_a, end_a);
543  double avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
544  double avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
545 
546  double numerator = 0;
547  double denominator_a = 0;
548  double denominator_b = 0;
549  IteratorType1 iter_a = begin_a;
550  IteratorType2 iter_b = begin_b;
551  for (; iter_a != end_a; ++iter_a, ++iter_b)
552  {
553  /* assure both ranges have the same number of elements */
554  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
555  double temp_a = *iter_a - avg_a;
556  double temp_b = *iter_b - avg_b;
557  numerator += (temp_a * temp_b);
558  denominator_a += (temp_a * temp_a);
559  denominator_b += (temp_b * temp_b);
560  }
561  /* assure both ranges have the same number of elements */
562  checkIteratorsEqual(iter_b, end_b);
563  return numerator / sqrt(denominator_a * denominator_b);
564  }
565 
567  template <typename Value>
568  static void computeRank(std::vector<Value> & w)
569  {
570  Size i = 0; // main index
571  Size z = 0; // "secondary" index
572  Value rank = 0;
573  Size n = (w.size() - 1);
574  //store original indices for later
575  std::vector<std::pair<Size, Value> > w_idx;
576  for (Size j = 0; j < w.size(); ++j)
577  {
578  w_idx.push_back(std::make_pair(j, w[j]));
579  }
580  //sort
581  std::sort(w_idx.begin(), w_idx.end(),
582  boost::lambda::ret<bool>((&boost::lambda::_1->*& std::pair<Size, Value>::second) <
583  (&boost::lambda::_2->*& std::pair<Size, Value>::second)));
584  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
585  while (i < n)
586  {
587  // test for equality with tolerance:
588  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
589  {
590  w_idx[i].second = Value(i + 1);
591  ++i;
592  }
593  else // tie, replace by mean rank
594  {
595  // count number of ties
596  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
597  {
598  }
599  // compute mean rank of tie
600  rank = 0.5 * (i + z + 1);
601  // replace intensities by rank
602  for (Size v = i; v <= z - 1; ++v)
603  {
604  w_idx[v].second = rank;
605  }
606  i = z;
607  }
608  }
609  if (i == n)
610  w_idx[n].second = Value(n + 1);
611  //restore original order and replace elements of w with their ranks
612  for (Size j = 0; j < w.size(); ++j)
613  {
614  w[w_idx[j].first] = w_idx[j].second;
615  }
616  }
617 
629  template <typename IteratorType1, typename IteratorType2>
631  IteratorType1 begin_a, IteratorType1 end_a,
632  IteratorType2 begin_b, IteratorType2 end_b)
633  {
634  //no data or different lengths
635  checkIteratorsNotNULL(begin_a, end_a);
636 
637  // store and sort intensities of model and data
638  SignedSize dist = std::distance(begin_a, end_a);
639  std::vector<double> ranks_data;
640  ranks_data.reserve(dist);
641  std::vector<double> ranks_model;
642  ranks_model.reserve(dist);
643  IteratorType1 iter_a = begin_a;
644  IteratorType2 iter_b = begin_b;
645  for (; iter_a != end_a; ++iter_a, ++iter_b)
646  {
647  /* assure both ranges have the same number of elements */
648  checkIteratorsAreValid(iter_b, end_b, iter_a, end_a);
649 
650  ranks_model.push_back(*iter_a);
651  ranks_data.push_back(*iter_b);
652  }
653  /* assure both ranges have the same number of elements */
654  checkIteratorsEqual(iter_b, end_b);
655 
656  // replace entries by their ranks
657  computeRank(ranks_data);
658  computeRank(ranks_model);
659 
660  double mu = double(ranks_data.size() + 1) / 2.; // mean of ranks
661  // Was the following, but I think the above is more correct ... (Clemens)
662  // double mu = (ranks_data.size() + 1) / 2;
663 
664  double sum_model_data = 0;
665  double sqsum_data = 0;
666  double sqsum_model = 0;
667 
668  for (Int i = 0; i < dist; ++i)
669  {
670  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
671  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
672  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
673  }
674 
675  // check for division by zero
676  if (!sqsum_data || !sqsum_model)
677  {
678  return 0;
679  }
680 
681  return sum_model_data / (sqrt(sqsum_data) * sqrt(sqsum_model));
682  }
683 
685  template<typename T>
687  {
689  :mean(0), variance(0), min(0), lowerq(0), median(0), upperq(0), max(0)
690  {
691  }
692 
693  // Ctor with data
695  {
696  count = data.size();
697  // Sanity check: avoid core dump if no data points present.
698  if (data.empty())
699  {
700  mean = variance = min = lowerq = median = upperq = max = 0.0;
701  }
702  else
703  {
704  sort(data.begin(), data.end());
705  mean = Math::mean(data.begin(), data.end());
706  variance = Math::variance(data.begin(), data.end(), mean);
707  min = data.front();
708  lowerq = Math::quantile1st(data.begin(), data.end(), true);
709  median = Math::median(data.begin(), data.end(), true);
710  upperq = Math::quantile3rd(data.begin(), data.end(), true);
711  max = data.back();
712  }
713  }
714 
716  typename T::value_type min, max;
717  size_t count;
718  };
719 
720  } // namespace Math
721 } // namespace OpenMS
722 
Invalid range exception.
Definition: Exception.h:279
int Int
Signed integer type.
Definition: Types.h:102
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:127
static double classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:442
static double median(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:151
static double mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:133
static double covariance(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the covariance of two ranges of values.
Definition: StatisticFunctions.h:375
static double quantile3rd(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the third quantile of a range of values.
Definition: StatisticFunctions.h:284
static void checkIteratorsNotNULL(IteratorType begin, IteratorType end)
Helper function checking if two iterators are not equal.
Definition: StatisticFunctions.h:72
static double matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:481
double MeanAbsoluteDeviation(IteratorType begin, IteratorType end, double mean_of_numbers)
mean absolute deviation (MeanAbsoluteDeviation)
Definition: StatisticFunctions.h:228
static double sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:120
static double absdev(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the absolute deviation of a range of values.
Definition: StatisticFunctions.h:349
static double pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:534
static double sd(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the standard deviation of a range of values.
Definition: StatisticFunctions.h:334
double MAD(IteratorType begin, IteratorType end, double median_of_numbers)
median absolute deviation (MAD)
Definition: StatisticFunctions.h:198
static double rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b,...
Definition: StatisticFunctions.h:630
static void checkIteratorsAreValid(IteratorType1 begin_b, IteratorType1 end_b, IteratorType2 begin_a, IteratorType2 end_a)
Helper function checking if an iterator and a co-iterator both have a next element.
Definition: StatisticFunctions.h:104
static double quantile1st(IteratorType begin, IteratorType end, bool sorted=false)
Calculates the first quantile of a range of values.
Definition: StatisticFunctions.h:252
static void checkIteratorsEqual(IteratorType begin, IteratorType end)
Helper function checking if two iterators are equal.
Definition: StatisticFunctions.h:88
static double variance(IteratorType begin, IteratorType end, double mean=std::numeric_limits< double >::max())
Calculates the variance of a range of values.
Definition: StatisticFunctions.h:307
static double meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:408
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:568
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:47
Helper class to gather (and dump) some statistics from a e.g. vector<double>.
Definition: StatisticFunctions.h:687
double lowerq
Definition: StatisticFunctions.h:715
double variance
Definition: StatisticFunctions.h:715
SummaryStatistics()
Definition: StatisticFunctions.h:688
T::value_type max
Definition: StatisticFunctions.h:716
SummaryStatistics(T &data)
Definition: StatisticFunctions.h:694
double median
Definition: StatisticFunctions.h:715
size_t count
Definition: StatisticFunctions.h:717
double mean
Definition: StatisticFunctions.h:715
double upperq
Definition: StatisticFunctions.h:715
T::value_type min
Definition: StatisticFunctions.h:716