5 #ifndef BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
6 #define BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
12 #ifndef BALL_DATATYPE_OPTIONS_H
17 #include <boost/graph/adjacency_list.hpp>
18 #include <boost/graph/graph_traits.hpp>
19 #include <boost/graph/incremental_components.hpp>
20 #include <boost/thread/mutex.hpp>
21 #include <boost/thread/thread.hpp>
22 #include <boost/unordered_set.hpp>
/// Collection of raw pointers to InvertedIndex objects (InvertedIndex is declared outside this excerpt).
75 typedef std::vector<InvertedIndex*> InvertedIndices;
/// One vector of unsigned short feature values per fingerprint.
76 typedef std::vector<std::vector<unsigned short> > FingerprintFeatures;
/// Undirected Boost graph whose vertex and out-edge containers are both vecS.
78 typedef boost::adjacency_list<boost::vecS, boost::vecS, boost::undirectedS> SimilarityGraph;
/// Vertex handle type of SimilarityGraph.
79 typedef boost::graph_traits<SimilarityGraph>::vertex_descriptor Vertex;
/// Vertex-count type of SimilarityGraph.
80 typedef boost::graph_traits<SimilarityGraph>::vertices_size_type VertexIndex;
/// Boost disjoint-sets structure over raw arrays, using full path compression on find.
/// NOTE(review): Boost documents the template parameter order as (rank map, parent map); here
/// Vertex* comes first and VertexIndex* second. Presumably harmless if both resolve to the
/// same integer type for a vecS graph - confirm.
81 typedef boost::disjoint_sets<Vertex*, VertexIndex*, boost::find_with_full_path_compression> DisjointSet;
/**
 * Static helper that takes a fingerprint string and a caller-supplied feature vector.
 * NOTE(review): only the declaration is visible here; presumably splits `fprint` at `delim`
 * and writes the resulting feature ids into `features` - confirm against the implementation.
 * @param fprint   fingerprint text (read-only)
 * @param features non-const reference, presumably filled by the call
 * @param fp_type  fingerprint format selector; the accepted values are not visible here
 * @param delim    delimiter string, defaults to ","
 * @return bool; presumably true on success - confirm
 */
218 static bool parseBinaryFingerprint(
const String& fprint, std::vector<unsigned short>& features,
unsigned int fp_type,
const char* delim=
",");
/**
 * Registers the library (target) fingerprint features.
 * NOTE(review): the class keeps a `const FingerprintFeatures* lib_features_` member, so this
 * presumably stores a pointer to `lib_features` rather than copying it; if so the caller must
 * keep the argument alive - confirm.
 * @param lib_features library fingerprints, passed by const reference
 * @return bool; success/failure semantics are not visible here
 */
231 bool setLibraryFeatures(
const FingerprintFeatures& lib_features);
/**
 * Registers the query fingerprint features.
 * NOTE(review): the class keeps a `const FingerprintFeatures* query_features_` member, so this
 * presumably stores a pointer to `query_features` rather than copying it; if so the caller
 * must keep the argument alive - confirm.
 * @param query_features query fingerprints, passed by const reference
 * @return bool; success/failure semantics are not visible here
 */
239 bool setQueryFeatures(
const FingerprintFeatures& query_features);
/**
 * Const accessor returning an unsigned count.
 * NOTE(review): presumably the number of fingerprints in the library set - confirm.
 */
246 unsigned int getTargetLibrarySize()
const;
/**
 * Const accessor returning an unsigned count.
 * NOTE(review): presumably the number of fingerprints in the query set - confirm.
 */
253 unsigned int getQueryLibrarySize()
const;
/**
 * Const accessor returning a read-only reference to an Options object.
 * NOTE(review): the referenced object is presumably owned by this instance, so the reference
 * would be valid only while the instance lives - confirm.
 */
260 const Options& getOptions()
const;
/**
 * Sets the verbosity level.
 * @param verbosity new level; the meaning of individual values is not visible here
 */
267 void setVerbosityLevel(
const int verbosity);
/**
 * Similarity search with a cutoff, writing to a named output file.
 * NOTE(review): presumably compares the query set against the library set and writes pairs
 * whose similarity reaches `sim_cutoff` to `outfile_name` - confirm; the output format is not
 * visible here.
 * @param sim_cutoff   similarity threshold
 * @param outfile_name name of the output file
 * @return bool; success/failure semantics are not visible here
 */
282 bool cutoffSearch(
const float sim_cutoff,
const String& outfile_name);
/**
 * Connected-components computation over a selection of fingerprints.
 * NOTE(review): which set `selection` indexes and the similarity threshold used are not
 * visible here - confirm.
 * @param selection indices of the entries to process (read-only)
 * @param ccs       non-const reference; presumably receives one index vector per component
 * @param nn_data   non-const reference of (index, float) pair lists; presumably nearest-neighbour
 *                  data filled only when `store_nns` is true - confirm
 * @param store_nns defaults to false
 * @return bool; success/failure semantics are not visible here
 */
295 bool connectedComponents(
const std::vector<unsigned int>& selection,
296 std::vector<std::vector<unsigned int> >& ccs,
297 std::vector<std::vector<std::pair<unsigned int, float> > >& nn_data,
299 const bool store_nns =
false);
/**
 * Average-linkage clustering over a selection of fingerprints.
 * @param selection         indices of the entries to cluster (read-only)
 * @param nn_data           non-const reference of (index, float) pairs; presumably
 *                          per-entry nearest-neighbour data - confirm
 * @param cluster_selection non-const map from unsigned key to index vector; presumably
 *                          cluster id -> member indices - confirm
 * @return bool; success/failure semantics are not visible here
 */
309 bool averageLinkageClustering(
const std::vector<unsigned int>& selection,
310 std::vector<std::pair<unsigned int, float> >& nn_data,
311 std::map<
unsigned int, std::vector<unsigned int> >& cluster_selection);
/**
 * Medoid calculation for a selection of fingerprints.
 * @param selection    indices of the entries to consider (read-only)
 * @param medoid_index non-const reference; presumably receives the index of the medoid
 * @param avg_sims     non-const reference; presumably receives one average similarity per
 *                     selected entry - confirm
 * @return bool; success/failure semantics are not visible here
 */
321 bool calculateSelectionMedoid(
const std::vector<unsigned int>& selection,
unsigned int& medoid_index, std::vector<float>& avg_sims);
// NOTE(review): the enclosing struct header is not visible in this listing. The first three
// names match the parameters of createThreadData(blocksize, dataset_size, active_iids_size),
// so these are presumably the fields of ThreadData - confirm.
335 unsigned int blocksize;
336 unsigned int dataset_size;
337 unsigned int active_iids_size;
// Raw buffers; same names and types as the `cc_row` / `cc_matrix` parameters of the
// calculateCommonCounts_* methods. Ownership and dimensions are not visible here.
339 unsigned short* cc_row;
340 unsigned short** cc_matrix;
// Pointer-to-pointer double buffer and unsigned int buffer; purpose not visible here.
343 double** dprec_matrix;
344 unsigned int* uint_array;
// NOTE(review): the enclosing struct header is not visible in this listing; presumably these
// are the fields of FeatureList (a type used elsewhere in this header) - confirm.
358 unsigned short feature_id;
// Raw pointer to unsigned short values; length and ownership are not visible here.
362 unsigned short* block_positions;
// NOTE(review): the enclosing struct header is not visible in this listing; presumably these
// are the fields of InvertedIndex (a type used elsewhere in this header) - confirm.
372 unsigned int n_molecules;
// Raw arrays; lengths and ownership are not visible here (presumably sized by n_molecules).
375 unsigned short* n_features;
378 unsigned int* parent_clusters;
// Pointer to FeatureList data; whether it is a single object or an array is not visible here.
381 FeatureList* feature_skip_list;
// NOTE(review): the enclosing struct header is not visible in this listing; the self-typed
// `Cluster* predecessor` suggests these are the fields of Cluster - confirm.
394 unsigned int c_index;
// Link to another Cluster plus a float and a double value associated with that link.
415 Cluster* predecessor;
418 float predecessor_sim;
421 double predecessor_sim_sum;
// Hash set of InvertedIndex pointers belonging to this entry (non-owning or owning: not visible).
424 boost::unordered_set<InvertedIndex*> c_members;
// Pointers to const feature vectors; the pointees are not modifiable through this member.
427 std::vector<const std::vector<unsigned short>* > leaf_members;
431 enum clustering_methods_
433 STORED_DATA_PARALLEL,
// Vertex-typed and VertexIndex-typed vectors.
// NOTE(review): presumably the backing storage handed to the DisjointSet typedef
// (parent and rank arrays) - confirm; the enclosing scope is not visible in this listing.
447 std::vector<Vertex> parent;
453 std::vector<VertexIndex> rank;
/// Thread count (presumably the number of worker threads to launch - confirm).
465 unsigned int n_threads_;
/// Pointer to const library fingerprints; the pointee is not modifiable through this member.
472 const FingerprintFeatures* lib_features_;
/// InvertedIndex pointers associated with the library set.
478 InvertedIndices lib_iindices_;
/// Pointer to const query fingerprints; the pointee is not modifiable through this member.
485 const FingerprintFeatures* query_features_;
/// InvertedIndex pointers associated with the query set.
491 InvertedIndices query_iindices_;
/// Raw pointer to boost::thread (presumably an array of n_threads_ entries - confirm).
497 boost::thread* threads_;
/// Mutex; by its name presumably serialises output from worker threads - confirm.
503 boost::mutex out_mutex_;
/// Raw pointer to ThreadData (presumably one entry per thread - confirm).
509 ThreadData* thread_data_;
/// Block size, same type as the parameter of setBlockSize().
515 unsigned short blocksize_;
// Clustering state. NOTE(review): ownership of the Cluster pointers held in these
// containers is not visible in this listing.
/// Cluster pointers for leaf clusters.
560 std::vector<Cluster*> leaf_clusters_;
/// Cluster pointers for internal clusters, grouped under a float key (ordered by std::map).
566 std::map<float, std::vector<Cluster*> > internal_clusters_;
/// Cluster pointers currently considered active.
572 std::vector<Cluster*> vec_actives_;
/// Cluster pointers currently considered inactive.
577 std::vector<Cluster*> vec_inactives_;
/// Raw double buffer; presumably a flattened similarity matrix - layout not visible here.
588 double* dprec_sim_matrix_;
/// Pairs of (InvertedIndex pointer list, unsigned int); meaning of the integer not visible here.
594 std::vector<std::pair<std::vector<InvertedIndex*>,
unsigned int> > active_iids_;
/// Unsigned count; presumably tracks the size of active_iids_ - confirm.
601 unsigned int active_iids_size_;
/// Cluster pointer and counter; by their names presumably the tip and length of a
/// nearest-neighbour chain - confirm.
607 Cluster* nn_chain_tip_;
613 unsigned int nn_chain_size_;
/// Iterator into a std::vector<Cluster*> and an associated float; which vector the
/// iterator refers to is not visible here.
619 std::vector<Cluster*>::iterator current_nn_;
625 float current_nn_sim_;
/// Method selector stored as unsigned short; presumably a clustering_methods_ value - confirm.
631 unsigned short clustering_method_;
/// Cluster counters; presumably current and maximum number of clusters - confirm.
637 unsigned int n_clusters_;
644 unsigned int max_clusters_;
/**
 * Configures this object from an Options instance.
 * NOTE(review): which option keys are read is not visible in this listing.
 * @param options read-only options
 */
651 void setup(
const Options& options);
/**
 * Takes another BinaryFingerprintMethods by const reference.
 * NOTE(review): presumably copies its state into this object (shared by copy constructor /
 * assignment) - confirm, including how the raw pointer members are handled.
 * @param bfm source object (read-only)
 */
665 void assign(
const BinaryFingerprintMethods& bfm);
/**
 * Const validation helper for a selection.
 * NOTE(review): the checks performed are not visible here; presumably verifies that the
 * indices in `selection` are valid for the stored fingerprints - confirm.
 * @param selection indices to validate (read-only)
 * @return bool; presumably true when the input is usable - confirm
 */
673 bool checkInputData(
const std::vector<unsigned int>& selection)
const;
/**
 * Creates thread data from three size parameters.
 * NOTE(review): presumably allocates `thread_data_` and its buffers; the counterpart is
 * destroyThreadData() - confirm.
 * @param blocksize        block size
 * @param dataset_size     data set size
 * @param active_iids_size size related to the active inverted indices
 */
682 void createThreadData(
const unsigned int blocksize,
const unsigned int dataset_size,
const unsigned int active_iids_size);
/// NOTE(review): presumably releases what createThreadData() allocated - confirm.
688 void destroyThreadData();
/**
 * Builds an InvertedIndex from a list of (feature-vector pointer, unsigned int) pairs.
 * NOTE(review): the returned raw pointer is presumably owned by the caller and released via
 * destroyInvertedIndex() - confirm. The meaning of the integer in each pair is not visible.
 * @param members pairs of pointer-to-const feature vector and unsigned int (read-only)
 * @return raw pointer to the created InvertedIndex
 */
696 InvertedIndex* createInvertedIndex(
const std::vector<std::pair<
const std::vector<unsigned short>*,
unsigned int> >& members);
/// NOTE(review): presumably frees a single InvertedIndex created by createInvertedIndex() - confirm.
703 void destroyInvertedIndex(InvertedIndex* ii);
/// Takes the container by non-const reference.
/// NOTE(review): presumably frees every entry and empties `ii_destroy` - confirm.
710 void destroyInvertedIndices(InvertedIndices& ii_destroy);
/**
 * Builds inverted indices for a list of molecules into a target container.
 * NOTE(review): how molecules are partitioned into indices (presumably by blocksize_) is not
 * visible here - confirm.
 * @param molecules pairs of pointer-to-const feature vector and unsigned int (read-only)
 * @param ii_target non-const reference; presumably receives the created indices
 */
718 void createInvertedIndices(
const std::vector<std::pair<
const std::vector<unsigned short>*,
unsigned int> >&
molecules, InvertedIndices& ii_target);
/**
 * Sets the block size (same type as the blocksize_ member).
 * @param blocksize new block size; any side effects on existing indices are not visible here
 */
725 void setBlockSize(
const unsigned short blocksize);
/**
 * Hands out a comparison index through an out-parameter.
 * NOTE(review): presumably used by worker threads to fetch the next unit of work, returning
 * false when none is left - confirm, including how concurrent calls are synchronised.
 * @param index non-const reference; presumably receives the next index
 */
760 bool getNextComparisonIndex(
LongSize& index);
/**
 * Const predicate over two (similarity, id) pairs.
 * NOTE(review): presumably decides whether candidate b should replace candidate a, using the
 * ids as a tie-breaker - confirm against the implementation.
 * @param a_sim similarity of the first candidate
 * @param b_sim similarity of the second candidate
 * @param a_id  id of the first candidate
 * @param b_id  id of the second candidate
 */
771 bool checkSimilaritySwitch(
const float a_sim,
const float b_sim,
const unsigned int a_id,
const unsigned int b_id)
const;
/**
 * Common-count calculation between two feature lists, result via a scalar out-parameter.
 * NOTE(review): presumably counts features shared by one entry on each side - confirm.
 * @param ii1      first feature list (read-only)
 * @param ii2      second feature list (read-only)
 * @param cc_count non-const reference; presumably receives (or accumulates) the count
 */
780 void calculateCommonCounts_1_1(
const FeatureList* ii1,
const FeatureList* ii2,
unsigned short& cc_count);
/**
 * Common-count calculation between two feature lists, result via a row buffer.
 * NOTE(review): presumably one entry against N entries; the required length of `cc_row` and
 * whether it must be zeroed beforehand are not visible here - confirm.
 * @param ii1    first feature list (read-only)
 * @param ii2    second feature list (read-only)
 * @param cc_row caller-provided buffer written by the call
 */
789 void calculateCommonCounts_1_N(
const FeatureList* ii1,
const FeatureList* ii2,
unsigned short* cc_row);
/**
 * Common-count calculation between two inverted indices, result via a matrix buffer.
 * NOTE(review): presumably M entries against N entries; the required dimensions of
 * `cc_matrix` and whether it must be zeroed beforehand are not visible here - confirm.
 * @param ii_1      first inverted index (read-only)
 * @param ii_2      second inverted index (read-only)
 * @param cc_matrix caller-provided buffer written by the call
 */
798 void calculateCommonCounts_M_N(
const InvertedIndex* ii_1,
const InvertedIndex* ii_2,
unsigned short** cc_matrix);
/**
 * Per-block step of the cutoff search, with access to an output file.
 * NOTE(review): presumably turns the common counts in `cc_matrix` into similarities for the
 * given query/library index pair and writes qualifying pairs to `outfile` - confirm.
 * @param query_index index on the query side
 * @param lib_index   index on the library side
 * @param cc_matrix   common-count buffer
 * @param outfile     output file, passed by non-const reference
 */
808 void cutoffSearchSimilarities(
const unsigned int query_index,
const unsigned int lib_index,
unsigned short** cc_matrix, File& outfile);
/**
 * NOTE(review): presumably the entry point executed by one worker thread of cutoffSearch() - confirm.
 * @param thread_id id of the executing thread
 */
815 void cutoffSearchThread(
const unsigned int thread_id);
/**
 * Pointer-to-member-function type for methods taking two index arguments and a ThreadData
 * pointer; the pairwiseSimilarities* methods declared below share this signature.
 */
824 typedef void (BinaryFingerprintMethods::*PairwiseSimilaritiesBase)(
const unsigned int ii1_index,
const unsigned int ii2_index, ThreadData* t_data);
/// Currently selected member function of the above type.
/// NOTE(review): where it is assigned is not visible in this listing.
825 PairwiseSimilaritiesBase pairwiseSimilaritiesBase;
/**
 * Pairwise-similarity routine matching the PairwiseSimilaritiesBase signature.
 * NOTE(review): presumably tracks nearest neighbours for the two index blocks - confirm.
 * @param ii1_index index of the first block
 * @param ii2_index index of the second block
 * @param t_data    per-thread data used by the call
 */
834 void pairwiseSimilaritiesNearestNeighbours(
const unsigned int ii1_index,
const unsigned int ii2_index, ThreadData* t_data);
/**
 * Pairwise-similarity routine matching the PairwiseSimilaritiesBase signature.
 * NOTE(review): presumably stores the computed similarities in a matrix - confirm.
 * @param ii1_index index of the first block
 * @param ii2_index index of the second block
 * @param t_data    per-thread data used by the call
 */
843 void pairwiseSimilaritiesStoredMatrix(
const unsigned int ii1_index,
const unsigned int ii2_index, ThreadData* t_data);
/**
 * Pairwise-similarity routine matching the PairwiseSimilaritiesBase signature.
 * NOTE(review): presumably the variant used by connectedComponents() - confirm.
 * @param ii1_index index of the first block
 * @param ii2_index index of the second block
 * @param t_data    per-thread data used by the call
 */
853 void pairwiseSimilaritiesConnectedComponents(
const unsigned int ii1_index,
const unsigned int ii2_index, ThreadData* t_data);
/**
 * Pairwise-similarity routine matching the PairwiseSimilaritiesBase signature.
 * NOTE(review): presumably the variant used by calculateSelectionMedoid() - confirm.
 * @param ii1_index index of the first block
 * @param ii2_index index of the second block
 * @param t_data    per-thread data used by the call
 */
862 void pairwiseSimilaritiesMedoids(
const unsigned int ii1_index,
const unsigned int ii2_index, ThreadData* t_data);
/**
 * Driver for pairwise similarity calculation over a selection.
 * NOTE(review): presumably launches the worker threads and dispatches through
 * pairwiseSimilaritiesBase - confirm.
 * @param selection indices of the entries to compare (read-only)
 * @param nn_data   non-const reference of (index, float) pairs, presumably filled by the call
 * @return bool; success/failure semantics are not visible here
 */
871 bool pairwiseSimilarities(
const std::vector<unsigned int>& selection, std::vector<std::pair<unsigned int, float> >& nn_data);
/**
 * NOTE(review): presumably the entry point executed by one worker thread of
 * pairwiseSimilarities() - confirm.
 * @param thread_id id of the executing thread
 */
878 void pairwiseSimilaritiesThread(
const unsigned int thread_id);
/**
 * Similarity calculation between two inverted indices using caller-provided buffers.
 * NOTE(review): how this "Actives" variant differs from calculateParallelSimilarities() is
 * not visible in this listing; buffer dimensions are likewise not visible - confirm.
 * @param ii1        first inverted index (read-only)
 * @param ii2        second inverted index (read-only)
 * @param cc_matrix  common-count buffer
 * @param sim_matrix double-precision buffer, presumably receiving the similarities
 */
888 void calculateParallelSimilaritiesActives(
const InvertedIndex* ii1,
const InvertedIndex* ii2,
unsigned short** cc_matrix,
double** sim_matrix);
/**
 * NOTE(review): presumably the per-thread entry point that drives
 * calculateParallelSimilaritiesActives() - confirm.
 * @param thread_id id of the executing thread
 */
895 void calculateParallelSimilaritiesActivesThread(
const unsigned int thread_id);
/**
 * Similarity calculation between two inverted indices using caller-provided buffers.
 * NOTE(review): buffer dimensions and where results are finally stored are not visible in
 * this listing - confirm.
 * @param ii1        first inverted index (read-only)
 * @param ii2        second inverted index (read-only)
 * @param cc_matrix  common-count buffer
 * @param sim_matrix double-precision buffer, presumably receiving the similarities
 */
905 void calculateParallelSimilarities(
const InvertedIndex* ii1,
const InvertedIndex* ii2,
unsigned short** cc_matrix,
double** sim_matrix);
/**
 * NOTE(review): presumably the per-thread entry point that drives
 * calculateParallelSimilarities() - confirm.
 * @param thread_id id of the executing thread
 */
912 void calculateParallelSimilaritiesThread(
const unsigned int thread_id);
/**
 * Average-linkage similarity update involving a merged cluster and another cluster.
 * NOTE(review): both clusters are passed as pointer-to-const, so the result is presumably
 * written to member storage rather than to the clusters - confirm.
 * @param merged_cluster the merged cluster (read-only)
 * @param current        the other cluster (read-only)
 */
921 void similarityUpdateAverageLinkage(
const Cluster* merged_cluster,
const Cluster* current);
/**
 * NOTE(review): presumably the per-thread entry point that drives
 * similarityUpdateAverageLinkage() for a share of the clusters - confirm.
 * @param thread_id      id of the executing thread
 * @param merged_cluster the merged cluster (read-only)
 */
928 void similarityUpdateAverageLinkageThread(
const unsigned int thread_id,
const Cluster* merged_cluster);
/**
 * Similarity-sum calculation from a row of common counts.
 * NOTE(review): whether `sim_sum` is overwritten or accumulated into is not visible here -
 * confirm before relying on its initial value.
 * @param ii1     first inverted index (read-only)
 * @param ii2     second inverted index (read-only)
 * @param cc_row  common-count row (read-only)
 * @param sim_sum non-const reference receiving the result
 */
938 void clusterSimilaritySum_1_N(
const InvertedIndex* ii1,
const InvertedIndex* ii2,
const unsigned short* cc_row,
double& sim_sum);
/**
 * Similarity-sum calculation from a matrix of common counts.
 * NOTE(review): whether `sim_sum` is overwritten or accumulated into is not visible here -
 * confirm before relying on its initial value.
 * @param ii1       first inverted index (read-only)
 * @param ii2       second inverted index (read-only)
 * @param cc_matrix common-count buffer
 * @param sim_sum   non-const reference receiving the result
 */
948 void clusterSimilaritySum_M_N(
const InvertedIndex* ii1,
const InvertedIndex* ii2,
unsigned short** cc_matrix,
double& sim_sum);
/**
 * NOTE(review): presumably a per-thread entry point that fills a similarity matrix from
 * existing clusters - confirm.
 * @param thread_id id of the executing thread
 */
955 void similarityMatrixFromClustersThread(
const unsigned int thread_id);
/// Takes a reference to a Cluster pointer.
/// NOTE(review): presumably sets `root` to the root of the resulting cluster tree - confirm.
963 void averageLinkageParallel(Cluster*& root);
/// Takes a reference to a Cluster pointer.
/// NOTE(review): by its name presumably the core loop of a nearest-neighbour-chain
/// clustering, setting `root` to the final cluster - confirm.
971 void NNChainCore(Cluster*& root);
/**
 * Cluster selection writing into a map from unsigned key to index vector.
 * NOTE(review): "KGS" presumably refers to a Kelley-Gardner-Sutcliffe style level selection -
 * confirm against the implementation.
 * @param cluster_selection non-const reference, presumably filled with cluster id -> members
 */
979 void clusterSelectionKGS(std::map<
unsigned int, std::vector<unsigned int> >& cluster_selection);
/**
 * Enumerates the members of a cluster under a given id.
 * NOTE(review): where the enumeration is recorded is not visible in this listing.
 * @param cl         cluster to enumerate (non-const pointer)
 * @param cluster_id id associated with the members
 */
987 void enumerateClusterMembers(Cluster* cl,
unsigned int cluster_id);
/// NOTE(review): presumably updates current_nn_ / current_nn_sim_ with the next nearest
/// neighbour of the chain tip - confirm.
993 void nextNearestNeighbour();
/// NOTE(review): presumably moves the current nearest neighbour onto the nearest-neighbour
/// chain - confirm.
999 void moveNearestNeighbour();
/**
 * Merges two clusters and returns a Cluster pointer.
 * NOTE(review): presumably the returned cluster is newly created and becomes the predecessor
 * of `c1` and `c2`; ownership of the result is not visible here - confirm.
 * @param c1      first cluster
 * @param c2      second cluster
 * @param sim_sum similarity sum associated with the merge
 * @return pointer to the merged cluster
 */
1009 Cluster* mergeClusters(Cluster* c1, Cluster* c2,
double sim_sum);
/// Returns a raw Cluster pointer.
/// NOTE(review): presumably a newly allocated cluster; who frees it is not visible here.
1016 Cluster* createCluster();
/// NOTE(review): presumably changes clustering_method_ (e.g. to a stored-matrix mode) during
/// clustering - confirm; the trigger condition is not visible here.
1021 void switchStorageMethod();
/// NOTE(review): presumably releases the temporary state built up during clustering - confirm.
1027 void finalizeClustering();
1031 #endif // BALL_STRUCTURE_BINARYFINGERPRINTMETHODS_H
static const String BLOCKSIZE
static const String STORE_NN
static const String SIM_CUTOFF
static const String PRECISION
static const unsigned short BLOCKSIZE
static const bool STORE_NN
BALL_EXPORT MoleculeList molecules(const AtomContainer &fragment, bool selected_only=false)
static const int VERBOSITY
BALL_ULONG64_TYPE LongSize
static const float PRECISION
static const unsigned int N_THREADS
static const String VERBOSITY
static const String MAX_CLUSTERS
static const unsigned int MAX_CLUSTERS
static const float SIM_CUTOFF
static const String N_THREADS
This class provides efficient similarity calculation functionality for 2D binary fingerprints.