OpenMS
SignalToNoiseEstimatorMedian.h
Go to the documentation of this file.
1 // Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin
2 // SPDX-License-Identifier: BSD-3-Clause
3 //
4 // --------------------------------------------------------------------------
5 // $Maintainer: Chris Bielow $
6 // $Authors: $
7 // --------------------------------------------------------------------------
8 //
9 
10 #pragma once
11 
12 
17 #include <vector>
18 #include <algorithm> //for std::max_element
19 
20 namespace OpenMS
21 {
54  template <typename Container = MSSpectrum>
56  public SignalToNoiseEstimator<Container>
57  {
58 
59 public:
60 
63 
67 
70 
72 
75  {
76  //set the name for DefaultParamHandler error messages
77  this->setName("SignalToNoiseEstimatorMedian");
78 
79  defaults_.setValue("max_intensity", -1, "maximal intensity considered for histogram construction. By default, it will be calculated automatically (see auto_mode)." \
80  " Only provide this parameter if you know what you are doing (and change 'auto_mode' to '-1')!" \
81  " All intensities EQUAL/ABOVE 'max_intensity' will be added to the LAST histogram bin." \
82  " If you choose 'max_intensity' too small, the noise estimate might be too small as well. " \
83  " If chosen too big, the bins become quite large (which you could counter by increasing 'bin_count', which increases runtime)." \
84  " In general, the Median-S/N estimator is more robust to a manual max_intensity than the MeanIterative-S/N.", {"advanced"});
85  defaults_.setMinInt("max_intensity", -1);
86 
87  defaults_.setValue("auto_max_stdev_factor", 3.0, "parameter for 'max_intensity' estimation (if 'auto_mode' == 0): mean + 'auto_max_stdev_factor' * stdev", {"advanced"});
88  defaults_.setMinFloat("auto_max_stdev_factor", 0.0);
89  defaults_.setMaxFloat("auto_max_stdev_factor", 999.0);
90 
91  defaults_.setValue("auto_max_percentile", 95, "parameter for 'max_intensity' estimation (if 'auto_mode' == 1): auto_max_percentile th percentile", {"advanced"});
92  defaults_.setMinInt("auto_max_percentile", 0);
93  defaults_.setMaxInt("auto_max_percentile", 100);
94 
95  defaults_.setValue("auto_mode", 0, "method to use to determine maximal intensity: -1 --> use 'max_intensity'; 0 --> 'auto_max_stdev_factor' method (default); 1 --> 'auto_max_percentile' method", {"advanced"});
96  defaults_.setMinInt("auto_mode", -1);
97  defaults_.setMaxInt("auto_mode", 1);
98 
99  defaults_.setValue("win_len", 200.0, "window length in Thomson");
100  defaults_.setMinFloat("win_len", 1.0);
101 
102  defaults_.setValue("bin_count", 30, "number of bins for intensity values");
103  defaults_.setMinInt("bin_count", 3);
104 
105  defaults_.setValue("min_required_elements", 10, "minimum number of elements required in a window (otherwise it is considered sparse)");
106  defaults_.setMinInt("min_required_elements", 1);
107 
108  defaults_.setValue("noise_for_empty_window", std::pow(10.0, 20), "noise value used for sparse windows", {"advanced"});
109 
110  defaults_.setValue("write_log_messages", "true", "Write out log messages in case of sparse windows or median in rightmost histogram bin");
111  defaults_.setValidStrings("write_log_messages", {"true","false"});
112 
114  }
115 
118  SignalToNoiseEstimator<Container>(source)
119  {
120  updateMembers_();
121  }
122 
128  {
129  if (&source == this) return *this;
130 
132  updateMembers_();
133  return *this;
134  }
135 
137 
140  {}
141 
143  double getSparseWindowPercent() const
144  {
145  return sparse_window_percent_;
146  }
147 
150  {
151  return histogram_oob_percent_;
152  }
153 
154 protected:
155 
156 
162  void computeSTN_(const Container& c) override
163  {
164  //first element in the scan
165  PeakIterator scan_first_ = c.begin();
166  //last element in the scan
167  PeakIterator scan_last_ = c.end();
168 
169  // reset counter for sparse windows
171  // reset counter for histogram overflow
173 
174  // reset the results
175  stn_estimates_.clear();
176  stn_estimates_.resize(c.size());
177 
178  // maximal range of histogram needs to be calculated first
179  if (auto_mode_ == AUTOMAXBYSTDEV)
180  {
181  // use MEAN+auto_max_intensity_*STDEV as threshold
182  GaussianEstimate gauss_global = SignalToNoiseEstimator<Container>::estimate_(scan_first_, scan_last_);
183  max_intensity_ = gauss_global.mean + std::sqrt(gauss_global.variance) * auto_max_stdev_Factor_;
184  }
185  else if (auto_mode_ == AUTOMAXBYPERCENT)
186  {
187  // get value at "auto_max_percentile_"th percentile
188  // we use a histogram approach here as well.
189  if ((auto_max_percentile_ < 0) || (auto_max_percentile_ > 100))
190  {
192  throw Exception::InvalidValue(__FILE__,
193  __LINE__,
194  OPENMS_PRETTY_FUNCTION,
195  "auto_mode is on AUTOMAXBYPERCENT! auto_max_percentile is not in [0,100]. Use setAutoMaxPercentile(<value>) to change it!",
196  s);
197  }
198 
199  std::vector<int> histogram_auto(100, 0);
200 
201  // find maximum of current scan
202  auto maxIt = std::max_element(c.begin(), c.end() ,[](const PeakType& a, const PeakType& b){ return a.getIntensity() > b.getIntensity();});
203  typename PeakType::IntensityType maxInt = maxIt->getIntensity();
204 
205  double bin_size = maxInt / 100;
206 
207  // fill histogram
208  for(const auto& peak : c)
209  {
210  ++histogram_auto[(int) ((peak.getIntensity() - 1) / bin_size)];
211  }
212 
213  // add up element counts in histogram until ?th percentile is reached
214  int elements_below_percentile = (int) (auto_max_percentile_ * c.size() / 100);
215  int elements_seen = 0;
216  int i = -1;
217  PeakIterator run = scan_first_;
218 
219  while (run != scan_last_ && elements_seen < elements_below_percentile)
220  {
221  ++i;
222  elements_seen += histogram_auto[i];
223  ++run;
224  }
225 
226  max_intensity_ = (((double)i) + 0.5) * bin_size;
227  }
228  else //if (auto_mode_ == MANUAL)
229  {
230  if (max_intensity_ <= 0)
231  {
233  throw Exception::InvalidValue(__FILE__,
234  __LINE__,
235  OPENMS_PRETTY_FUNCTION,
236  "auto_mode is on MANUAL! max_intensity is <=0. Needs to be positive! Use setMaxIntensity(<value>) or enable auto_mode!",
237  s);
238  }
239  }
240 
241  if (max_intensity_ < 0)
242  {
243  std::cerr << "TODO SignalToNoiseEstimatorMedian: the max_intensity_ value should be positive! " << max_intensity_ << std::endl;
244  return;
245  }
246 
247  PeakIterator window_pos_center = scan_first_;
248  PeakIterator window_pos_borderleft = scan_first_;
249  PeakIterator window_pos_borderright = scan_first_;
250 
251  double window_half_size = win_len_ / 2;
252  double bin_size = std::max(1.0, max_intensity_ / bin_count_); // at least size of 1 for intensity bins
253  int bin_count_minus_1 = bin_count_ - 1;
254 
255  std::vector<int> histogram(bin_count_, 0);
256  std::vector<double> bin_value(bin_count_, 0);
257  // calculate average intensity that is represented by a bin
258  for (int bin = 0; bin < bin_count_; bin++)
259  {
260  histogram[bin] = 0;
261  bin_value[bin] = (bin + 0.5) * bin_size;
262  }
263  // bin in which a datapoint would fall
264  int to_bin = 0;
265 
266  // index of bin where the median is located
267  int median_bin = 0;
268  // additive number of elements from left to x in histogram
269  int element_inc_count = 0;
270 
271  // tracks elements in current window, which may vary because of unevenly spaced data
272  int elements_in_window = 0;
273  // number of windows
274  int window_count = 0;
275 
276  // number of elements where we find the median
277  int element_in_window_half = 0;
278 
279  double noise; // noise value of a datapoint
280 
282  SignalToNoiseEstimator<Container>::startProgress(0, c.size(), "noise estimation of data");
283 
284  // MAIN LOOP
285  while (window_pos_center != scan_last_)
286  {
287 
288  // erase all elements from histogram that will leave the window on the LEFT side
289  while ((*window_pos_borderleft).getMZ() < (*window_pos_center).getMZ() - window_half_size)
290  {
291  to_bin = std::max(std::min<int>((int)((*window_pos_borderleft).getIntensity() / bin_size), bin_count_minus_1), 0);
292  --histogram[to_bin];
293  --elements_in_window;
294  ++window_pos_borderleft;
295  }
296 
297  // add all elements to histogram that will enter the window on the RIGHT side
298  while ((window_pos_borderright != scan_last_)
299  && ((*window_pos_borderright).getMZ() <= (*window_pos_center).getMZ() + window_half_size))
300  {
301  //std::cerr << (*window_pos_borderright).getIntensity() << " " << bin_size << " " << bin_count_minus_1 << std::endl;
302  to_bin = std::max(std::min<int>((int)((*window_pos_borderright).getIntensity() / bin_size), bin_count_minus_1), 0);
303  ++histogram[to_bin];
304  ++elements_in_window;
305  ++window_pos_borderright;
306  }
307 
308  if (elements_in_window < min_required_elements_)
309  {
310  noise = noise_for_empty_window_;
312  }
313  else
314  {
315  // find bin i where ceil[elements_in_window/2] <= sum_c(0..i){ histogram[c] }
316  median_bin = -1;
317  element_inc_count = 0;
318  element_in_window_half = (elements_in_window + 1) / 2;
319  while (median_bin < bin_count_minus_1 && element_inc_count < element_in_window_half)
320  {
321  ++median_bin;
322  element_inc_count += histogram[median_bin];
323  }
324 
325  // increase the error count
326  if (median_bin == bin_count_minus_1) {++histogram_oob_percent_; }
327 
328  // just avoid division by 0
329  noise = std::max(1.0, bin_value[median_bin]);
330  }
331 
332  // store result
333  stn_estimates_[window_count] = (*window_pos_center).getIntensity() / noise;
334 
335 
336  // advance the window center by one datapoint
337  ++window_pos_center;
338  ++window_count;
339  // update progress
341 
342  } // end while
343 
345 
346  sparse_window_percent_ = sparse_window_percent_ * 100 / window_count;
347  histogram_oob_percent_ = histogram_oob_percent_ * 100 / window_count;
348 
349  // warn if percentage of sparse windows is above 20%
351  {
352  OPENMS_LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
354  << "% of all windows were sparse. You should consider increasing 'win_len' or decreasing 'min_required_elements'"
355  << std::endl;
356  }
357 
358  // warn if percentage of possibly wrong median estimates is above 1%
360  {
361  OPENMS_LOG_WARN << "WARNING in SignalToNoiseEstimatorMedian: "
363  << "% of all Signal-to-Noise estimates are too high, because the median was found in the rightmost histogram-bin. "
364  << "You should consider increasing 'max_intensity' (and maybe 'bin_count' with it, to keep bin width reasonable)"
365  << std::endl;
366  }
367 
368  } // end of shiftWindow_
369 
371  void updateMembers_() override
372  {
373  max_intensity_ = (double)param_.getValue("max_intensity");
374  auto_max_stdev_Factor_ = (double)param_.getValue("auto_max_stdev_factor");
375  auto_max_percentile_ = param_.getValue("auto_max_percentile");
376  auto_mode_ = param_.getValue("auto_mode");
377  win_len_ = (double)param_.getValue("win_len");
378  bin_count_ = param_.getValue("bin_count");
379  min_required_elements_ = param_.getValue("min_required_elements");
380  noise_for_empty_window_ = (double)param_.getValue("noise_for_empty_window");
381  write_log_messages_ = (bool)param_.getValue("write_log_messages").toBool();
382  stn_estimates_.clear();
383  }
384 
394  double win_len_;
402 
403  // whether to write out log messages in the case of failure
405 
406  // counter for sparse windows
408  // counter for histogram overflow
410 
411 
412  };
413 
414 } // namespace OpenMS
415 
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:444
void defaultsToParam_()
Updates the parameters after the defaults have been set in the constructor.
Param param_
Container for current parameters.
Definition: DefaultParamHandler.h:139
Param defaults_
Container for default parameters. This member should be filled in the constructor of derived classes!
Definition: DefaultParamHandler.h:146
void setName(const String &name)
Mutable access to the name.
Invalid value exception.
Definition: Exception.h:305
bool toBool() const
Conversion to bool.
void setValidStrings(const std::string &key, const std::vector< std::string > &strings)
Sets the valid strings for the parameter key.
void setMaxFloat(const std::string &key, double max)
Sets the maximum value for the floating point or floating point list parameter key.
void setMaxInt(const std::string &key, int max)
Sets the maximum value for the integer or integer list parameter key.
const ParamValue & getValue(const std::string &key) const
Returns a value of a parameter.
void setMinInt(const std::string &key, int min)
Sets the minimum value for the integer or integer list parameter key.
void setValue(const std::string &key, const ParamValue &value, const std::string &description="", const std::vector< std::string > &tags=std::vector< std::string >())
Sets a value.
void setMinFloat(const std::string &key, double min)
Sets the minimum value for the floating point or floating point list parameter key.
float IntensityType
Intensity type.
Definition: Peak2D.h:36
void setProgress(SignedSize value) const
Sets the current progress.
void startProgress(SignedSize begin, SignedSize end, const String &label) const
Initializes the progress display.
void endProgress(UInt64 bytes_processed=0) const
Estimates the signal/noise (S/N) ratio of each data point in a scan by using the median (histogram ba...
Definition: SignalToNoiseEstimatorMedian.h:57
SignalToNoiseEstimator< Container >::PeakIterator PeakIterator
Definition: SignalToNoiseEstimatorMedian.h:68
SignalToNoiseEstimatorMedian & operator=(const SignalToNoiseEstimatorMedian &source)
Definition: SignalToNoiseEstimatorMedian.h:127
double win_len_
range of data points which belong to a window in Thomson
Definition: SignalToNoiseEstimatorMedian.h:394
double noise_for_empty_window_
Definition: SignalToNoiseEstimatorMedian.h:401
~SignalToNoiseEstimatorMedian() override
Destructor.
Definition: SignalToNoiseEstimatorMedian.h:139
SignalToNoiseEstimatorMedian()
default constructor
Definition: SignalToNoiseEstimatorMedian.h:74
double max_intensity_
maximal intensity considered during binning (values above get discarded)
Definition: SignalToNoiseEstimatorMedian.h:386
double auto_max_percentile_
parameter for initial automatic estimation of "max_intensity_" percentile or a stdev
Definition: SignalToNoiseEstimatorMedian.h:390
double histogram_oob_percent_
Definition: SignalToNoiseEstimatorMedian.h:409
void computeSTN_(const Container &c) override
Definition: SignalToNoiseEstimatorMedian.h:162
bool write_log_messages_
Definition: SignalToNoiseEstimatorMedian.h:404
void updateMembers_() override
overridden function from DefaultParamHandler to keep members up to date, when a parameter is changed
Definition: SignalToNoiseEstimatorMedian.h:371
int min_required_elements_
minimal number of elements a window needs to cover to be used
Definition: SignalToNoiseEstimatorMedian.h:398
SignalToNoiseEstimatorMedian(const SignalToNoiseEstimatorMedian &source)
Copy Constructor.
Definition: SignalToNoiseEstimatorMedian.h:117
double getSparseWindowPercent() const
Returns how many percent of the windows were sparse.
Definition: SignalToNoiseEstimatorMedian.h:143
double sparse_window_percent_
Definition: SignalToNoiseEstimatorMedian.h:407
double getHistogramRightmostPercent() const
Returns the percentage where the median was found in the rightmost bin.
Definition: SignalToNoiseEstimatorMedian.h:149
SignalToNoiseEstimator< Container >::PeakType PeakType
Definition: SignalToNoiseEstimatorMedian.h:69
SignalToNoiseEstimator< Container >::GaussianEstimate GaussianEstimate
Definition: SignalToNoiseEstimatorMedian.h:71
int auto_mode_
determines which method shall be used for estimating "max_intensity_". valid are MANUAL=-1,...
Definition: SignalToNoiseEstimatorMedian.h:392
IntensityThresholdCalculation
method to use for estimating the maximal intensity that is used for histogram calculation
Definition: SignalToNoiseEstimatorMedian.h:62
@ MANUAL
Definition: SignalToNoiseEstimatorMedian.h:62
@ AUTOMAXBYSTDEV
Definition: SignalToNoiseEstimatorMedian.h:62
@ AUTOMAXBYPERCENT
Definition: SignalToNoiseEstimatorMedian.h:62
int bin_count_
number of bins in the histogram
Definition: SignalToNoiseEstimatorMedian.h:396
double auto_max_stdev_Factor_
parameter for initial automatic estimation of "max_intensity_": a stdev multiplier
Definition: SignalToNoiseEstimatorMedian.h:388
This class represents the abstract base class of a signal to noise estimator.
Definition: SignalToNoiseEstimator.h:33
double variance
variance of estimated Gaussian
Definition: SignalToNoiseEstimator.h:108
SignalToNoiseEstimator & operator=(const SignalToNoiseEstimator &source)
Assignment operator.
Definition: SignalToNoiseEstimator.h:60
PeakIterator::value_type PeakType
Definition: SignalToNoiseEstimator.h:40
GaussianEstimate estimate_(const PeakIterator &scan_first_, const PeakIterator &scan_last_) const
calculate mean & stdev of intensities of a spectrum
Definition: SignalToNoiseEstimator.h:113
double mean
mean of estimated Gaussian
Definition: SignalToNoiseEstimator.h:107
std::vector< double > stn_estimates_
stores the noise estimate for each peak
Definition: SignalToNoiseEstimator.h:146
Container::const_iterator PeakIterator
Definition: SignalToNoiseEstimator.h:39
protected struct to store parameters my, sigma for a Gaussian distribution
Definition: SignalToNoiseEstimator.h:106
A more convenient string class.
Definition: String.h:34
const double c
Definition: Constants.h:188
Main OpenMS namespace.
Definition: openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19