BALL  1.4.79
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
binaryFingerprintMethods.h
Go to the documentation of this file.
1 // -*- Mode: C++; tab-width: 2; -*-
2 // vi: set ts=2:
3 //
4 
5 #ifndef BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
6 #define BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
7 
8 #ifndef BALL_COMMON_H
9 # include <BALL/common.h>
10 #endif
11 
12 #ifndef BALL_DATATYPE_OPTIONS_H
13 # include <BALL/DATATYPE/options.h>
14 #endif
15 
16 
17 #include <boost/graph/adjacency_list.hpp>
18 #include <boost/graph/graph_traits.hpp>
19 #include <boost/graph/incremental_components.hpp>
20 #include <boost/thread/mutex.hpp>
21 #include <boost/thread/thread.hpp>
22 #include <boost/unordered_set.hpp>
23 
24 
25 #include <map>
26 #include <set>
27 #include <vector>
28 
29 
30 namespace BALL
31 {
67  {
68  struct InvertedIndex;
69 
74 
75  typedef std::vector<InvertedIndex*> InvertedIndices;
76  typedef std::vector<std::vector<unsigned short> > FingerprintFeatures;
77 
78  typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS> SimilarityGraph;
79  typedef boost::graph_traits<SimilarityGraph>::vertex_descriptor Vertex;
80  typedef boost::graph_traits<SimilarityGraph>::vertices_size_type VertexIndex;
81  typedef boost::disjoint_sets<Vertex*, VertexIndex*, boost::find_with_full_path_compression> DisjointSet;
82 
84 
85  public:
90 
95  {
99  static const String BLOCKSIZE;
100 
104  static const String SIM_CUTOFF;
105 
109  static const String PRECISION;
110 
114  static const String STORE_NN;
115 
120  static const String MAX_CLUSTERS;
121 
125  static const String N_THREADS;
126 
130  static const String VERBOSITY;
131  };
132 
137  {
138  static const unsigned short BLOCKSIZE;
139  static const float SIM_CUTOFF;
140  static const float PRECISION;
141  static const unsigned int MAX_CLUSTERS;
142  static const bool STORE_NN;
143  static const unsigned int N_THREADS;
144  static const int VERBOSITY;
145  };
146 
147 
152 
157 
158 
163  BinaryFingerprintMethods(const Options& options);
164 
165 
171  BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features);
172 
173 
180  BinaryFingerprintMethods(const Options& options, const FingerprintFeatures& lib_features, const FingerprintFeatures& query_features);
181 
182 
187 
188 
192  virtual ~BinaryFingerprintMethods();
193 
195 
200 
204  BinaryFingerprintMethods& operator = (const BinaryFingerprintMethods& bfm);
205 
207 
208 
218  static bool parseBinaryFingerprint(const String& fprint, std::vector<unsigned short>& features, unsigned int fp_type, const char* delim=",");
219 
220 
225 
231  bool setLibraryFeatures(const FingerprintFeatures& lib_features);
232 
233 
239  bool setQueryFeatures(const FingerprintFeatures& query_features);
240 
241 
246  unsigned int getTargetLibrarySize() const;
247 
248 
253  unsigned int getQueryLibrarySize() const;
254 
255 
260  const Options& getOptions() const;
261 
262 
267  void setVerbosityLevel(const int verbosity);
268 
270 
271 
276 
282  bool cutoffSearch(const float sim_cutoff, const String& outfile_name);
283 
284 
295  bool connectedComponents(const std::vector<unsigned int>& selection,
296  std::vector<std::vector<unsigned int> >& ccs,
297  std::vector<std::vector<std::pair<unsigned int, float> > >& nn_data,
298  const float cutoff,
299  const bool store_nns = false);
300 
301 
309  bool averageLinkageClustering(const std::vector<unsigned int>& selection,
310  std::vector<std::pair<unsigned int, float> >& nn_data,
311  std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
312 
313 
321  bool calculateSelectionMedoid(const std::vector<unsigned int>& selection, unsigned int& medoid_index, std::vector<float>& avg_sims);
322 
324 
325  private:
329  struct ThreadData
330  {
331  LongSize first;
332  LongSize last;
333 
334  float nn_sim;
335  unsigned int blocksize;
336  unsigned int dataset_size;
337  unsigned int active_iids_size;
338 
339  unsigned short* cc_row;
340  unsigned short** cc_matrix;
341 
342  float* float_array;
343  double** dprec_matrix;
344  unsigned int* uint_array;
345 
346  String outfile_name;
347  };
348 
349 
355  struct FeatureList
356  {
357  // FeatureID which corresponds in principle to the index of a bit in a 2D fingerprint.
358  unsigned short feature_id;
359 
360  // Array which stores the sorted list of all molecules which all share the associated feature_id.
361  // Molecules are only represented by their positions in the Block they belong to.
362  unsigned short* block_positions;
363  };
364 
365 
369  struct InvertedIndex
370  {
371  // Number of molecules in the block.
372  unsigned int n_molecules;
373 
374  // Array which stores the number of features of every molecule in the InvertedIndex
375  unsigned short* n_features;
376 
377  // Array which stores the ID of the parent cluster to which the molecule at the InvertedIndex position belongs to.
378  unsigned int* parent_clusters;
379 
380  // Array of FeatureLists implemented as a skip list.
381  FeatureList* feature_skip_list;
382  };
383 
384 
388  struct Cluster
389  {
390  // Molecule ID for leaf nodes. Otherwise internal Cluster ID.
391  unsigned int c_id;
392 
393  // Cluster index [0, n_molecules-1] is used to map from 2D matrix to 1D array.
394  unsigned int c_index;
395 
396  // Number of leaf nodes inside in the corresponding cluster.
397  unsigned int c_size;
398 
399  // Left child. NULL if cluster is a leaf.
400  Cluster* l_child;
401 
402  // Right child. NULL if cluster is a leaf.
403  Cluster* r_child;
404 
405  // Pointer to the clusters current nearest neigbour.
406  Cluster* nn;
407 
408  // Flag for various purposes.
409  bool is_rnn;
410 
411  // Sum of all intra-cluster pairwise similarities within this cluster. Used finally for KGS penalty calculation.
412  double sim_sum;
413 
414  // Pointer to predecessor cluster in the Nearest Neighbour Chain. NULL if not member of the Nearest Neighbour Chain.
415  Cluster* predecessor;
416 
417  // Similarity to predecessor in Nearest Neighbour Chain. 0.0 if not member of Nearest Neighbour Chain.
418  float predecessor_sim;
419 
420  // Sum of all inter-cluster pairwise similarities of clusters nn_chain[i] and nn_chain[i-1]. 0.0 if not member of the NNChain.
421  double predecessor_sim_sum;
422 
423  // Inverted indices of all leaf molecules within this cluster.
424  boost::unordered_set<InvertedIndex*> c_members;
425 
426  // Vector of pointers to FingerprintFeatures in lib_features_ of all leaf molecules within this cluster.
427  std::vector<const std::vector<unsigned short>* > leaf_members;
428  };
429 
430 
431  enum clustering_methods_
432  {
433  STORED_DATA_PARALLEL,
434  STORED_MATRIX
435  };
436 
437 
441  DisjointSet* ds;
442 
443 
447  std::vector<Vertex> parent;
448 
449 
453  std::vector<VertexIndex> rank;
454 
455 
459  Options options_;
460 
461 
465  unsigned int n_threads_;
466 
467 
472  const FingerprintFeatures* lib_features_;
473 
474 
478  InvertedIndices lib_iindices_;
479 
480 
485  const FingerprintFeatures* query_features_;
486 
487 
491  InvertedIndices query_iindices_;
492 
493 
497  boost::thread* threads_;
498 
499 
503  boost::mutex out_mutex_;
504 
505 
509  ThreadData* thread_data_;
510 
511 
515  unsigned short blocksize_;
516 
517 
521  LongSize cc_matrix_size_;
522 
523 
527  float cutoff_;
528 
529 
533  float precision_;
534 
535 
539  bool store_nns_;
540 
541 
547  LongSize n_comparisons_;
548  LongSize n_comparisons_backup_;
549 
550 
554  int verbosity_;
555 
556 
560  std::vector<Cluster*> leaf_clusters_;
561 
562 
566  std::map<float, std::vector<Cluster*> > internal_clusters_;
567 
568 
572  std::vector<Cluster*> vec_actives_;
573 
577  std::vector<Cluster*> vec_inactives_;
578 
579 
583  float* sim_matrix_;
584 
588  double* dprec_sim_matrix_;
589 
590 
594  std::vector<std::pair<std::vector<InvertedIndex*>, unsigned int> > active_iids_;
595 
596 
601  unsigned int active_iids_size_;
602 
603 
607  Cluster* nn_chain_tip_;
608 
609 
613  unsigned int nn_chain_size_;
614 
615 
619  std::vector<Cluster*>::iterator current_nn_;
620 
621 
625  float current_nn_sim_;
626 
627 
631  unsigned short clustering_method_;
632 
633 
637  unsigned int n_clusters_;
638 
639 
644  unsigned int max_clusters_;
645 
646 
651  void setup(const Options& options);
652 
653 
658  void clear();
659 
660 
665  void assign(const BinaryFingerprintMethods& bfm);
666 
667 
673  bool checkInputData(const std::vector<unsigned int>& selection) const;
674 
675 
682  void createThreadData(const unsigned int blocksize, const unsigned int dataset_size, const unsigned int active_iids_size);
683 
684 
688  void destroyThreadData();
689 
690 
696  InvertedIndex* createInvertedIndex(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& members);
697 
698 
703  void destroyInvertedIndex(InvertedIndex* ii);
704 
705 
710  void destroyInvertedIndices(InvertedIndices& ii_destroy);
711 
712 
718  void createInvertedIndices(const std::vector<std::pair<const std::vector<unsigned short>*, unsigned int> >& molecules, InvertedIndices& ii_target);
719 
720 
725  void setBlockSize(const unsigned short blocksize);
726 
727 
734  void arrayToUpperTriangluarMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
735 
736 
743  void arrayToStrictUpperTriangularMatrix(LongSize& row, LongSize& col, const LongSize array_index) const;
744 
745 
752  LongSize strictUpperTriangularMatrixToArray(const LongSize row, const LongSize col) const;
753 
754 
760  bool getNextComparisonIndex(LongSize& index);
761 
762 
771  bool checkSimilaritySwitch(const float a_sim, const float b_sim, const unsigned int a_id, const unsigned int b_id) const;
772 
773 
780  void calculateCommonCounts_1_1(const FeatureList* ii1, const FeatureList* ii2, unsigned short& cc_count);
781 
782 
789  void calculateCommonCounts_1_N(const FeatureList* ii1, const FeatureList* ii2, unsigned short* cc_row);
790 
791 
798  void calculateCommonCounts_M_N(const InvertedIndex* ii_1, const InvertedIndex* ii_2, unsigned short** cc_matrix);
799 
800 
808  void cutoffSearchSimilarities(const unsigned int query_index, const unsigned int lib_index, unsigned short** cc_matrix, File& outfile);
809 
810 
815  void cutoffSearchThread(const unsigned int thread_id);
816 
817 
824  typedef void (BinaryFingerprintMethods::*PairwiseSimilaritiesBase)(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
825  PairwiseSimilaritiesBase pairwiseSimilaritiesBase;
826 
827 
834  void pairwiseSimilaritiesNearestNeighbours(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
835 
836 
843  void pairwiseSimilaritiesStoredMatrix(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
844 
845 
853  void pairwiseSimilaritiesConnectedComponents(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
854 
855 
862  void pairwiseSimilaritiesMedoids(const unsigned int ii1_index, const unsigned int ii2_index, ThreadData* t_data);
863 
864 
871  bool pairwiseSimilarities(const std::vector<unsigned int>& selection, std::vector<std::pair<unsigned int, float> >& nn_data);
872 
873 
878  void pairwiseSimilaritiesThread(const unsigned int thread_id);
879 
880 
888  void calculateParallelSimilaritiesActives(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
889 
890 
895  void calculateParallelSimilaritiesActivesThread(const unsigned int thread_id);
896 
897 
905  void calculateParallelSimilarities(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double** sim_matrix);
906 
907 
912  void calculateParallelSimilaritiesThread(const unsigned int thread_id);
913 
914 
921  void similarityUpdateAverageLinkage(const Cluster* merged_cluster, const Cluster* current);
922 
928  void similarityUpdateAverageLinkageThread(const unsigned int thread_id, const Cluster* merged_cluster);
929 
930 
938  void clusterSimilaritySum_1_N(const InvertedIndex* ii1, const InvertedIndex* ii2, const unsigned short* cc_row, double& sim_sum);
939 
940 
948  void clusterSimilaritySum_M_N(const InvertedIndex* ii1, const InvertedIndex* ii2, unsigned short** cc_matrix, double& sim_sum);
949 
950 
955  void similarityMatrixFromClustersThread(const unsigned int thread_id);
956 
957 
963  void averageLinkageParallel(Cluster*& root);
964 
965 
971  void NNChainCore(Cluster*& root);
972 
973 
979  void clusterSelectionKGS(std::map<unsigned int, std::vector<unsigned int> >& cluster_selection);
980 
981 
987  void enumerateClusterMembers(Cluster* cl, unsigned int cluster_id);
988 
989 
993  void nextNearestNeighbour();
994 
995 
999  void moveNearestNeighbour();
1000 
1001 
1009  Cluster* mergeClusters(Cluster* c1, Cluster* c2, double sim_sum);
1010 
1011 
1016  Cluster* createCluster();
1017 
1021  void switchStorageMethod();
1022 
1023 
1027  void finalizeClustering();
1028  };
1029 } // namespace BALL
1030 
1031 #endif // BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
BALL_EXPORT MoleculeList molecules(const AtomContainer &fragment, bool selected_only=false)
BALL_ULONG64_TYPE LongSize
#define BALL_EXPORT
Definition: COMMON/global.h:50
This class provides efficient similarity calculation functionality for 2D binary fingerprints.