BALL  1.4.79
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
QSARData.h
Go to the documentation of this file.
1 // -*- Mode: C++; tab-width: 2; -*-
2 // vi: set ts=2:
3 //
4 //
5 
6 #ifndef QSARH
7 #define QSARH
8 
9 #include <iostream>
10 #include <BALL/KERNEL/system.h>
11 #include <BALL/FORMAT/SDFile.h>
12 #include <BALL/FORMAT/PDBFile.h>
13 #include <BALL/FORMAT/HINFile.h>
14 #include <BALL/FORMAT/MOLFile.h>
15 #include <vector>
16 #include <list>
17 #include <set>
18 #include <map>
19 #include <math.h>
20 #include <sstream>
21 #include <fstream>
22 #include <limits>
23 #include <fstream>
28 #include <BALL/COMMON/exception.h>
29 #include <cstring>
30 
31 #ifndef STATISTICS
32 #include <BALL/QSAR/statistics.h>
33 #endif
34 
35 #ifndef QSAR_EXCEPTION
36 #include <BALL/QSAR/exception.h>
37 #endif
38 
39 #include <BALL/CONCEPT/timeStamp.h>
40 
41 // #ifndef MODEL
42 // #include "Model.h"
43 // #endif
44 
45 namespace BALL
46 {
47  class MolecularSimilarity;
48 
49  namespace QSAR
50  {
 // A Column is a sequence of double values.
 // NOTE(review): `vector` is used unqualified here, so a using-declaration for
 // std::vector must be in scope via one of the included headers — confirm.
51  typedef vector<double> Column;
 // A VMatrix is a vector of Columns, i.e. a two-level (nested) vector of doubles.
52  typedef vector<Column> VMatrix;
53 
// NOTE(review): the class header line is absent from this listing (the listing
// numbers jump from 52 to 56). The constructor below is named QSARData, so this
// is presumably the body of class QSARData — confirm against the real header.
56  {
57  public:
58 
 // Default constructor.
59  QSARData();
60 
 // Destructor (declared non-virtual here).
61  ~QSARData();
62 
 // Const query returning a bool; presumably reports whether the descriptor
 // data has been centered — confirm in the implementation.
66  bool isDataCentered() const;
67 
 // Const query returning a bool; presumably reports whether the response
 // values have been centered — confirm in the implementation.
69  bool isResponseCentered() const;
70 
 // Takes the name of an SD file by value and returns a raw pointer to a
 // vector of Strings.
 // NOTE(review): ownership of the returned pointer is not visible from this
 // declaration — confirm whether the caller must delete it.
75  vector<String>* readPropertyNames(String sd_file);
76 
 // Reads an SD file given only its path; see the overloads below for the
 // variants that take activity selections and option flags.
80  void readSDFile(const char* file);
81 
 // Overload taking a multiset of ints (`act`) by non-const reference,
 // presumably the IDs of the properties to use as activities — confirm.
 // Flag defaults: useExDesc=true, append=false, translate_class_labels=false.
87  void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
88 
 // Overload selecting activities by name (set of Strings) instead of by int ID.
 // Flag defaults: useExDesc=true, append=false, translate_class_labels=false,
 // calc_phychem_properties=true, calc_topological_properties=true.
89  void readSDFile(const char* file, std::set<String>& activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1);
90 
 // Non-const, no arguments, no return value; presumably prints the stored
 // matrix — the output destination is not visible here.
92  void displayMatrix();
93 
 // center_Y defaults to false; presumably controls whether the response
 // values are centered in addition to the descriptors — confirm.
96  void centerData(bool center_Y=0);
97 
99  void scaleAllDescriptors();
100 
 // Const accessors returning unsigned counts.
102  unsigned int getNoSubstances() const;
103 
105  unsigned int getNoDescriptors() const;
106 
 // Reads a CSV file. `sep` defaults to ","; appendDescriptors and
 // translate_class_labels default to false. no_y, xlabels and ylabels have no
 // defaults and must always be supplied.
 // NOTE(review): presumably no_y is the number of response columns and
 // xlabels/ylabels say whether label rows/columns are present — confirm.
114  void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
115 
 // Two overloads: one takes a vector of Strings by value, the other a single
 // String. NOTE(review): the meaning and format of the strings is not visible
 // in this declaration — confirm in the implementation.
117  void manipulateY(std::vector<String> v);
118 
121  void manipulateY(String v);
122 
 // Takes the threshold values by value (the vector is copied on each call).
125  void discretizeY(std::vector<double> thresholds);
126 
127  void transformX(std::vector<String> v);
128 
 // Returns a vector of raw QSARData pointers.
 // NOTE(review): ownership of the returned objects is not expressed in the
 // type — confirm whether the caller is responsible for deleting them.
130  std::vector<QSARData*> partitionInputData(int p);
131 
 // Const: saving does not modify this object's declared state.
133  void saveToFile(string filename) const;
134 
136  void readFromFile(string filename);
137 
 // Const; returns raw QSARData pointers (same ownership question as
 // partitionInputData — confirm).
140  std::vector<QSARData*> generateExternalSet(double fraction) const;
141 
 // Const; response_id defaults to 0. Returns raw QSARData pointers
 // (ownership not visible here — confirm).
146  std::vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
147 
 // Const accessors that take an int index `s` and return a raw pointer to a
 // vector of doubles.
 // NOTE(review): whether these point into internal storage or to a fresh
 // allocation the caller must delete is not visible here — confirm.
149  std::vector<double>* getSubstance(int s) const;
150 
152  std::vector<double>* getActivity(int s) const;
153 
155  unsigned int getNoResponseVariables() const;
156 
 // Returns a pointer-to-const, so callers cannot modify the names through it.
157  const std::vector<string>* getSubstanceNames() const;
158 
 // Const check with no arguments; see the overload below for the variant
 // that takes a file and activity IDs.
160  bool checkforDiscreteY() const;
161 
162 
164  bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
165 
167  void setDataFolder(const char* folder);
168 
 // Both thresholds are taken by non-const reference.
 // NOTE(review): it is not visible here whether they are modified — confirm
 // before passing values the caller still needs unchanged.
171  void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
172 
 // Const; similar_descriptor_IDs is a non-const reference to a list of
 // (uint, String) pairs, presumably filled in as the output — confirm.
 // `uint` is not a standard C++ type and must come from an included header.
178  void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
180 
181 
182  protected:
183 
187 
 // Helpers operating on a Molecule passed by non-const reference.
191  void calculateBALLDescriptors(Molecule& m);
192 
194  void calculateTopologicalDescriptors(Molecule& mol, MolecularSimilarity& molsim, const std::map<String,int>& descriptor_map);
195 
 // useExDesc and resize both default to true.
197  void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1, bool resize=1);
198 
201  void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
202 
203  void removeInvalidSubstances(std::multiset<int>& inv);
204 
 // Reads from the given input file stream into `mat`, using a
 // single-character separator and explicit line/column counts.
206  void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
207 
210  void checkActivityIDs(std::multiset<int>& act, int no_properties);
211 
 // backtransformation defaults to false.
214  void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
215 
 // Const; writes the given matrix to the supplied output stream.
217  void printMatrix(const VMatrix& mat, std::ostream& out) const;
219 
 // NOTE(review): the cross-reference index at the end of this listing names
 // VMatrix members descriptor_matrix_ (line 224), descriptor_transformations_
 // (line 230) and y_transformations_ (line 233), but their declarations do not
 // appear in this listing — they were presumably lost in extraction.
225 
228 
231 
234 
 // Vector of strings; presumably one name per descriptor column — confirm.
236  vector<string> column_names_;
237 
 // Vector of strings; getSubstanceNames() returns the same element type.
239  vector<string> substance_names_;
240 
 // Multisets of int; removeInvalidDescriptors() and removeInvalidSubstances()
 // take the same container type as their argument.
242  std::multiset<int> invalidDescriptors_;
243 
244  std::multiset<int> invalidSubstances_;
245 
247 
 // Maps a String to an int; presumably class-label name to numeric ID, which
 // would tie in with the translate_class_labels flags above — confirm.
249  std::map<String,int> class_names_;
251 
252 
253 
 // These classes are granted access to the protected members above.
255  friend class RegressionValidation;
256  friend class Validation;
257  friend class Model;
258  friend class FitModel;
259  friend class FeatureSelection;
260 
261  };
262 
263  }
264 }
265 
266 #endif // QSARH
std::map< String, int > class_names_
Definition: QSARData.h:249
vector< string > substance_names_
Definition: QSARData.h:239
vector< string > column_names_
Definition: QSARData.h:236
VMatrix y_transformations_
Definition: QSARData.h:233
vector< Column > VMatrix
Definition: QSARData.h:52
vector< double > Column
Definition: QSARData.h:51
VMatrix descriptor_transformations_
Definition: QSARData.h:230
std::multiset< int > invalidDescriptors_
Definition: QSARData.h:242
VMatrix descriptor_matrix_
Definition: QSARData.h:224
#define BALL_EXPORT
Definition: COMMON/global.h:50
std::multiset< int > invalidSubstances_
Definition: QSARData.h:244