OpenMS
2.7.0
|
A variant of QT clustering for the detection of feature groups. More...
#include <OpenMS/ANALYSIS/MAPMATCHING/QTClusterFinder.h>
Public Types | |
typedef OpenMSBoost::unordered_map< std::pair< OpenMS::GridFeature *, OpenMS::GridFeature * >, double > | PairDistances |
Distances between pairs of grid features. More... | |
typedef OpenMSBoost::unordered_map< const OpenMS::GridFeature *, std::unordered_set< Size > > | ElementMapping |
Map to store which grid features are next to which clusters (saves the clusters ids) More... | |
typedef boost::heap::fibonacci_heap< QTCluster > | Heap |
Heap to efficiently find the best clusters. More... | |
typedef HashGrid< OpenMS::GridFeature * > | Grid |
Public Types inherited from ProgressLogger | |
enum | LogType { CMD , GUI , NONE } |
Possible log types. More... | |
Public Member Functions | |
QTClusterFinder () | |
Constructor. More... | |
~QTClusterFinder () override | |
Destructor. More... | |
void | run (const std::vector< ConsensusMap > &input_maps, ConsensusMap &result_map) override |
Runs the algorithm on consensus maps. More... | |
void | run (const std::vector< FeatureMap > &input_maps, ConsensusMap &result_map) |
Runs the algorithm on feature maps. More... | |
Public Member Functions inherited from BaseGroupFinder | |
BaseGroupFinder () | |
Default constructor. More... | |
~BaseGroupFinder () override | |
Destructor. More... | |
Public Member Functions inherited from DefaultParamHandler | |
DefaultParamHandler (const String &name) | |
Constructor with name that is displayed in error messages. More... | |
DefaultParamHandler (const DefaultParamHandler &rhs) | |
Copy constructor. More... | |
virtual | ~DefaultParamHandler () |
Destructor. More... | |
virtual DefaultParamHandler & | operator= (const DefaultParamHandler &rhs) |
Assignment operator. More... | |
virtual bool | operator== (const DefaultParamHandler &rhs) const |
Equality operator. More... | |
void | setParameters (const Param ¶m) |
Sets the parameters. More... | |
const Param & | getParameters () const |
Non-mutable access to the parameters. More... | |
const Param & | getDefaults () const |
Non-mutable access to the default parameters. More... | |
const String & | getName () const |
Non-mutable access to the name. More... | |
void | setName (const String &name) |
Mutable access to the name. More... | |
const std::vector< String > & | getSubsections () const |
Non-mutable access to the registered subsections. More... | |
Public Member Functions inherited from ProgressLogger | |
ProgressLogger () | |
Constructor. More... | |
virtual | ~ProgressLogger () |
Destructor. More... | |
ProgressLogger (const ProgressLogger &other) | |
Copy constructor. More... | |
ProgressLogger & | operator= (const ProgressLogger &other) |
Assignment Operator. More... | |
void | setLogType (LogType type) const |
Sets the progress log that should be used. The default type is NONE! More... | |
LogType | getLogType () const |
Returns the type of progress log being used. More... | |
void | startProgress (SignedSize begin, SignedSize end, const String &label) const |
Initializes the progress display. More... | |
void | setProgress (SignedSize value) const |
Sets the current progress. More... | |
void | endProgress () const |
Ends the progress display. More... | |
void | nextProgress () const |
increment progress by 1 (according to range begin-end) More... | |
Static Public Member Functions | |
static const String | getProductName () |
Returns the name of the product. More... | |
static BaseGroupFinder * | create () |
Returns an instance of this class. More... | |
Static Public Member Functions inherited from BaseGroupFinder | |
static void | registerChildren () |
Register all derived classes here. More... | |
Static Public Member Functions inherited from DefaultParamHandler | |
static void | writeParametersToMetaValues (const Param &write_this, MetaInfoInterface &write_here, const String &prefix="") |
Writes all parameters to meta values. More... | |
Protected Types | |
enum | { RT = Peak2D::RT , MZ = Peak2D::MZ } |
Private Member Functions | |
double | getDistance_ (const OpenMS::GridFeature *left, const OpenMS::GridFeature *right) |
Calculates the distance between two grid features. More... | |
void | setParameters_ (double max_intensity, double max_mz) |
Sets algorithm parameters. More... | |
bool | makeConsensusFeature_ (Heap &cluster_heads, ConsensusFeature &feature, ElementMapping &element_mapping, const Grid &grid, const std::vector< Heap::handle_type > &handles) |
Extract the best cluster from cluster_heads and turn it into a consensus feature. More... | |
void | computeClustering_ (const Grid &grid, Heap &cluster_heads, std::vector< QTCluster::BulkData > &cluster_data, std::vector< Heap::handle_type > &handles, ElementMapping &element_mapping) |
Computes an initial QT clustering of the points in the hash grid. More... | |
void | removeFromElementMapping_ (const QTCluster &cluster, ElementMapping &element_mapping) |
Removes id of current top cluster in the heap from element mapping. More... | |
void | createConsensusFeature_ (ConsensusFeature &feature, const double quality, const QTCluster::Elements &elements) |
creates a consensus feature from the given elements More... | |
void | updateClustering_ (ElementMapping &element_mapping, const Grid &grid, const QTCluster::Elements &elements, Heap &cluster_heads, const std::vector< Heap::handle_type > &handles, Size best_id) |
update the clustering: More... | |
template<typename MapType > | |
void | run_ (const std::vector< MapType > &input_maps, ConsensusMap &result_map) |
Runs the algorithm on feature maps or consensus maps. More... | |
template<typename MapType > | |
void | run_internal_ (const std::vector< MapType > &input_maps, ConsensusMap &result_map, bool do_progress) |
Runs the algorithm on feature maps or consensus maps (internal) More... | |
void | addClusterElements_ (const Grid &grid, QTCluster &cluster) |
Adds elements to the cluster based on the elements hashed in the grid. More... | |
bool | distIsOutlier_ (double dist, double rt) |
Looks up the matching bin for rt in bin_tolerances_ and checks if dist is in the allowed range. More... | |
Private Attributes | |
Size | num_maps_ |
Number of input maps. More... | |
bool | use_IDs_ |
Consider peptide identifications for grouping? More... | |
Size | min_nr_diffs_per_bin_ |
Min. nr. of differences from matched IDs requested to calculate a linking tolerance per RT bin. More... | |
double | min_score_ |
Min. score for an ID to be considered for tolerance estimation. More... | |
double | noID_penalty_ |
double | max_diff_rt_ |
Maximum RT difference. More... | |
double | max_diff_mz_ |
Maximum m/z difference. More... | |
int | nr_partitions_ |
Maximum m/z difference. More... | |
FeatureDistance | feature_distance_ |
Feature distance functor. More... | |
std::unordered_set< const OpenMS::GridFeature * > | already_used_ |
Set of features already used. More... | |
std::map< double, double > | bin_tolerances_ |
Additional Inherited Members | |
Protected Member Functions inherited from BaseGroupFinder | |
void | checkIds_ (const std::vector< ConsensusMap > &maps) const |
Checks if all file descriptions have disjoint map identifiers. More... | |
Protected Member Functions inherited from DefaultParamHandler | |
virtual void | updateMembers_ () |
This method is used to update extra member variables at the end of the setParameters() method. More... | |
void | defaultsToParam_ () |
Updates the parameters after the defaults have been set in the constructor. More... | |
Static Protected Member Functions inherited from ProgressLogger | |
static String | logTypeToFactoryName_ (LogType type) |
Return the name of the factory product used for this log type. More... | |
Protected Attributes inherited from DefaultParamHandler | |
Param | param_ |
Container for current parameters. More... | |
Param | defaults_ |
Container for default parameters. This member should be filled in the constructor of derived classes! More... | |
std::vector< String > | subsections_ |
Container for registered subsections. This member should be filled in the constructor of derived classes! More... | |
String | error_name_ |
Name that is displayed in error messages during the parameter checking. More... | |
bool | check_defaults_ |
If this member is set to false no checking if parameters in done;. More... | |
bool | warn_empty_defaults_ |
If this member is set to false no warning is emitted when defaults are empty;. More... | |
Protected Attributes inherited from ProgressLogger | |
LogType | type_ |
time_t | last_invoke_ |
ProgressLoggerImpl * | current_logger_ |
Static Protected Attributes inherited from ProgressLogger | |
static int | recursion_depth_ |
A variant of QT clustering for the detection of feature groups.
The algorithm accumulates all features from all input maps, then applies a variant of QT clustering to find groups of corresponding features. In more detail, every feature from every input map is considered as a potential cluster center. For every center, its nearest neighbors from the other input maps are detected and added to the potential cluster. Iteratively, the cluster with the highest quality is extracted and the clustering is updated.
Properties affecting the grouping
To be included in a particular cluster, a feature has to fulfill the following conditions:
distance_RT:max_difference
and distance_MZ:max_difference
),ignore_charge
is set),use_identifications
is set and both the feature and the cluster center are annotated with peptide identifications, the identifications have to match.Every cluster contains at most one feature from each input map - namely the feature closest to the cluster center that meets the criteria and does not belong to a better cluster.
The notion of "closeness" for features is defined by the distance function implemented in FeatureDistance, the parameters of which can be set by the user.
The quality of a cluster is computed from the number of elements in it and their distances to the cluster center. For more details see QTCluster.
Optimization
This algorithm includes a number of optimizations to reduce run-time:
Name | Type | Default | Restrictions | Description |
---|---|---|---|---|
use_identifications | string | false | true, false | Never link features that are annotated with different peptides (only the best hit per peptide identification is taken into account). |
nr_partitions | int | 100 | min: 1 | How many partitions in m/z space should be used for the algorithm (more partitions means faster runtime and more memory efficient execution). |
min_nr_diffs_per_bin | int | 50 | min: 5 | If IDs are used: How many differences from matching IDs should be used to calculate a linking tolerance for unIDed features in an RT region. RT regions will be extended until that number is reached. |
min_IDscore_forTolCalc | float | 1.0 | If IDs are used: What is the minimum score of an ID to assume a reliable match for tolerance calculation. Check your current score type! | |
noID_penalty | float | 0.0 | min: 0.0 max: 1.0 | If IDs are used: For the normalized distances, how high should the penalty for missing IDs be? 0 = no bias, 1 = IDs inside the max tolerances always preferred (even if much further away). |
ignore_charge | string | false | true, false | false [default]: pairing requires equal charge state (or at least one unknown charge '0'); true: Pairing irrespective of charge state |
ignore_adduct | string | true | true, false | true [default]: pairing requires equal adducts (or at least one without adduct annotation); true: Pairing irrespective of adducts |
distance_RT:max_difference | float | 100.0 | min: 0.0 | Never pair features with a larger RT distance (in seconds). |
distance_RT:exponent | float | 1.0 | min: 0.0 | Normalized RT differences ([0-1], relative to 'max_difference') are raised to this power (using 1 or 2 will be fast, everything else is REALLY slow) |
distance_RT:weight | float | 1.0 | min: 0.0 | Final RT distances are weighted by this factor |
distance_MZ:max_difference | float | 0.3 | min: 0.0 | Never pair features with larger m/z distance (unit defined by 'unit') |
distance_MZ:unit | string | Da | Da, ppm | Unit of the 'max_difference' parameter |
distance_MZ:exponent | float | 2.0 | min: 0.0 | Normalized ([0-1], relative to 'max_difference') m/z differences are raised to this power (using 1 or 2 will be fast, everything else is REALLY slow) |
distance_MZ:weight | float | 1.0 | min: 0.0 | Final m/z distances are weighted by this factor |
distance_intensity:exponent | float | 1.0 | min: 0.0 | Differences in relative intensity ([0-1]) are raised to this power (using 1 or 2 will be fast, everything else is REALLY slow) |
distance_intensity:weight | float | 0.0 | min: 0.0 | Final intensity distances are weighted by this factor |
distance_intensity:log_transform | string | disabled | enabled, disabled | Log-transform intensities? If disabled, d = |int_f2 - int_f1| / int_max. If enabled, d = |log(int_f2 + 1) - log(int_f1 + 1)| / log(int_max + 1)) |
typedef OpenMSBoost::unordered_map< const OpenMS::GridFeature*, std::unordered_set<Size> > ElementMapping |
Map to store which grid features are next to which clusters (saves the clusters ids)
typedef HashGrid<OpenMS::GridFeature*> Grid |
typedef OpenMSBoost::unordered_map< std::pair<OpenMS::GridFeature*, OpenMS::GridFeature*>, double> PairDistances |
Distances between pairs of grid features.
QTClusterFinder | ( | ) |
Constructor.
|
override |
Destructor.
Adds elements to the cluster based on the elements hashed in the grid.
grid | the grid is used to find neighboring features the cluster |
cluster | cluster to which the new elements are added |
|
private |
Computes an initial QT clustering of the points in the hash grid.
grid | the grid is used to find new features for clusters that have to be updated |
cluster_heads | the heap where the QTClusters are inserted |
cluster_data | vector where the BulkData objects of the clusters are stored |
handles | vector where handles of the inserted clusters are stored |
element_mapping | the element mapping where all the clusters get registered for their features |
|
inlinestatic |
Returns an instance of this class.
|
private |
creates a consensus feature from the given elements
[out] | feature | The resulting consensus feature which is constructed here |
quality | the quality of the new consensus feature | |
elements | the original features that from the new consensus feature |
|
private |
Looks up the matching bin for rt
in bin_tolerances_ and checks if dist
is in the allowed range.
|
private |
Calculates the distance between two grid features.
|
inlinestatic |
Returns the name of the product.
|
private |
Extract the best cluster from cluster_heads and turn it into a consensus feature.
cluster_heads | the heap where the clusters are stored, must not be empty | |
[out] | feature | The resulting consensus feature which is constructed here |
element_mapping | the element mapping is used to update clusters when features are removed | |
grid | the grid is used to find new features for clusters that have to be updated | |
handles | used to access clusters if we know their id from the element mapping |
|
private |
Removes id of current top cluster in the heap from element mapping.
cluster | the current top cluster in the heap which id is removed |
element_mapping | the element mapping from which the ids are removed |
|
overridevirtual |
Runs the algorithm on consensus maps.
Exception::IllegalArgument | is thrown if the input data is not valid. |
Implements BaseGroupFinder.
void run | ( | const std::vector< FeatureMap > & | input_maps, |
ConsensusMap & | result_map | ||
) |
Runs the algorithm on feature maps.
Exception::IllegalArgument | is thrown if the input data is not valid. |
|
private |
Runs the algorithm on feature maps or consensus maps.
|
private |
Runs the algorithm on feature maps or consensus maps (internal)
|
private |
Sets algorithm parameters.
|
private |
update the clustering:
1. remove current best cluster from the heap 2. update all clusters accordingly by removing neighbors used by the current best 3. invalidate clusters whose center has been used by the current best
element_mapping | the element mapping is used to update clusters and updated itself |
grid | the grid is used to find new features for clusters that have to be updated |
cluster_heads | the heap is updated for changing clusters and popped in the end |
elements | the features that now have to be removed from other clusters than the current best |
handles | used to access clusters if we know their id from the element mapping |
best_id | id of the current best cluster, will be removed from the element mapping |
|
private |
Set of features already used.
|
private |
Map of median RTs to allowed linking tolerances (on the same RT scale) for unIDed features. This should be interpreted as bins from the current median RT to the next.
|
private |
Feature distance functor.
|
private |
Maximum m/z difference.
|
private |
Maximum RT difference.
|
private |
Min. nr. of differences from matched IDs requested to calculate a linking tolerance per RT bin.
|
private |
Min. score for an ID to be considered for tolerance estimation.
|
private |
Distance penalty for unidentified features when finding best neighbor per map and for cluster quality calculation and therefore the order in which they are popped from the heap. Since distances are normalized, a penalty of 1.0 will always prefer IDed features.
|
private |
Maximum m/z difference.
|
private |
Number of input maps.
|
private |
Consider peptide identifications for grouping?