BALL  1.4.2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
QSARData.h
Go to the documentation of this file.
1 /* QSARData.h
2  *
3  * Copyright (C) 2009 Marcel Schumann
4  *
5  * This file is part of QuEasy -- A Toolbox for Automated QSAR Model
6  * Construction and Validation.
7  * QuEasy is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 3 of the License, or (at
10  * your option) any later version.
11  *
12  * QuEasy is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 // -*- Mode: C++; tab-width: 2; -*-
22 // vi: set ts=2:
23 //
24 //
25 
26 #ifndef QSARH
27 #define QSARH
28 
29 #include <iostream>
30 #include <BALL/KERNEL/system.h>
31 #include <BALL/FORMAT/SDFile.h>
32 #include <BALL/FORMAT/PDBFile.h>
33 #include <BALL/FORMAT/HINFile.h>
34 #include <BALL/FORMAT/MOLFile.h>
35 #include <vector>
36 #include <list>
37 #include <set>
38 #include <map>
39 #include <math.h>
40 #include <sstream>
41 #include <fstream>
42 #include <limits>
43 #include <fstream>
48 #include <BALL/COMMON/exception.h>
49 #include <cstring>
50 
51 #ifndef STATISTICS
52 #include <BALL/QSAR/statistics.h>
53 #endif
54 
55 #ifndef QSAR_EXCEPTION
56 #include <BALL/QSAR/exception.h>
57 #endif
58 
59 #include <gsl/gsl_randist.h>
60 #include <gsl/gsl_cdf.h>
61 
62 #include <BALL/CONCEPT/timeStamp.h>
63 
64 // #ifndef MODEL
65 // #include "Model.h"
66 // #endif
67 
68 namespace BALL
69 {
70  namespace QSAR
71  {
// Column: alias for a vector of double values.
// NOTE(review): vector is used unqualified here; it presumably comes into scope through one of the included headers - confirm.
72  typedef vector<double> Column;
// VMatrix: alias for a vector of Column objects, i.e. a nested container of doubles.
// NOTE(review): whether one Column holds one descriptor or one substance is not visible in this file - confirm against the implementation.
73  typedef vector<Column> VMatrix;
74 
// Body of class QSARData (the class head line is not part of this listing).
// The class declares file readers (SD, CSV, own format), data transformations,
// splitting helpers and accessors for QSAR input data.
// NOTE(review): the original doc comments were stripped from this rendering; the notes
// below state only what the signatures show, and anything inferred from names is marked as such.
77  {
78  public:
79 
// Default constructor.
80  QSARData();
81 
// Destructor (non-virtual as declared here).
82  ~QSARData();
83 
// Const query returning bool; presumably reports whether the descriptor data has been centered - confirm.
87  bool isDataCentered() const;
88 
// Const query returning bool; presumably reports whether the response values have been centered - confirm.
90  bool isResponseCentered() const;
91 
// Takes the SD file name by value and returns a raw pointer to a vector of String.
// NOTE(review): ownership of the returned vector is not visible from this declaration - confirm who deletes it.
96  vector<String>* readPropertyNames(String sd_file);
97 
// Single-argument overload: takes only the file path.
101  void readSDFile(const char* file);
102 
// Overload taking the file path, a multiset of ints (act) by non-const reference and three flags.
// Defaults: useExDesc=1 (true), append=0 (false), translate_class_labels=0 (false).
// NOTE(review): act presumably holds the IDs of the properties to be used as activities - confirm.
108  void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
109 
// Takes the molecule by non-const reference.
// NOTE(review): whether m is modified is not visible here - confirm.
113  void calculateBALLDescriptors(Molecule& m);
114 
// Non-const, no parameters; presumably prints the stored data matrix - confirm output target.
116  void displayMatrix();
117 
// center_Y defaults to 0 (false); presumably controls whether responses are centered as well - confirm.
120  void centerData(bool center_Y=0);
121 
// Non-const, no parameters; presumably rescales every descriptor - confirm the scaling used.
123  void scaleAllDescriptors();
124 
// Const accessor returning unsigned int; presumably the number of substances held - confirm.
126  unsigned int getNoSubstances() const;
127 
// Const accessor returning unsigned int; presumably the number of descriptors held - confirm.
129  unsigned int getNoDescriptors() const;
130 
// Reads a CSV file. sep defaults to a single comma; appendDescriptors and
// translate_class_labels default to 0 (false).
// NOTE(review): no_y is presumably the number of response columns, and xlabels / ylabels
// presumably state whether label rows / columns are present - confirm.
138  void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
139 
// Overload taking a vector of String by value (the vector is copied on each call).
// NOTE(review): presumably one transformation expression per response variable - confirm.
141  void manipulateY(vector<String> v);
142 
// Overload taking a single String by value.
145  void manipulateY(String v);
146 
// Takes the thresholds by value.
// NOTE(review): presumably maps continuous responses to classes using these thresholds - confirm.
149  void discretizeY(vector<double> thresholds);
150 
// Takes a vector of String by value.
// NOTE(review): presumably one transformation expression per descriptor - confirm.
151  void transformX(vector<String> v);
152 
// Returns a vector of raw QSARData pointers by value.
// NOTE(review): ownership of the returned objects is not visible here; meaning of p presumably
// the number of partitions - confirm both.
154  vector<QSARData*> partitionInputData(int p);
155 
// Const; takes the file name by value.
157  void saveToFile(string filename) const;
158 
// Non-const counterpart of saveToFile; takes the file name by value.
160  void readFromFile(string filename);
161 
// Const; returns a vector of raw QSARData pointers by value.
// NOTE(review): ownership of the returned objects and the valid range of fraction are not visible here - confirm.
164  vector<QSARData*> generateExternalSet(double fraction) const;
165 
// Const; response_id defaults to 0. Returns a vector of raw QSARData pointers by value.
// NOTE(review): ownership of the returned objects is not visible here - confirm.
170  vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
171 
// Const; returns a raw pointer to a vector of double for index s.
// NOTE(review): whether the result is newly allocated (caller deletes) or internal storage is not visible here - confirm.
173  vector<double>* getSubstance(int s) const;
174 
// Const; returns a raw pointer to a vector of double for index s.
// NOTE(review): same ownership question as getSubstance - confirm.
176  vector<double>* getActivity(int s) const;
177 
// Const accessor returning unsigned int; presumably the number of response variables - confirm.
179  unsigned int getNoResponseVariables() const;
180 
// Const; returns a pointer to a const vector of string.
// NOTE(review): presumably points at substance_names_ - confirm.
181  const vector<string>* getSubstanceNames() const;
182 
// Const, no parameters; presumably checks the already loaded responses - confirm.
184  bool checkforDiscreteY() const;
185 
186 
// Const overload taking a file path and a multiset of ints by non-const reference.
// NOTE(review): presumably checks the given file instead of the loaded data - confirm.
188  bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
189 
// Takes a folder path as a C string.
191  void setDataFolder(const char* folder);
192 
// Both thresholds are taken by non-const reference.
// NOTE(review): whether the thresholds are written back to the caller is not visible here - confirm.
195  void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
196 
// Const; results are delivered through the list passed by non-const reference,
// whose elements are pairs of (uint, String).
// NOTE(review): whether the list is cleared first or appended to is not visible here - confirm.
202  void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
204 
205 
206  protected:
207 
// useExDesc defaults to 1 (true); the molecule is taken by const reference.
212  void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1);
213 
// Takes a multiset of ints by non-const reference; presumably the indices to remove - confirm.
216  void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
217 
// Takes a multiset of ints by non-const reference; presumably the indices to remove - confirm.
218  void removeInvalidSubstances(std::multiset<int>& inv);
219 
// Fills mat from the input file stream in.
// NOTE(review): the parameter name seperator is spelled this way in the source (separator);
// lines and col presumably give the matrix dimensions to read - confirm.
221  void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
222 
// NOTE(review): presumably validates the IDs in act against no_properties; failure behaviour is not visible here - confirm.
225  void checkActivityIDs(std::multiset<int>& act, int no_properties);
226 
// backtransformation defaults to 0 (false); source is a pointer to a const QSARData.
229  void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
230 
// Const; writes mat to the given output stream.
232  void printMatrix(const VMatrix& mat, std::ostream& out) const;
// NOTE(review): the listing numbers jump from 232 to 251 with only empty lines in between;
// member declarations may be missing from this rendering - check the real header.
234 
240 
243 
246 
249 
// Vector of strings; presumably one name per column - confirm.
251  vector<string> column_names_;
252 
// Vector of strings; presumably one name per substance - confirm.
254  vector<string> substance_names_;
255 
// Multiset of ints; presumably indices of descriptors flagged as invalid - confirm.
257  std::multiset<int> invalidDescriptors_;
258 
// Multiset of ints; presumably indices of substances flagged as invalid - confirm.
259  std::multiset<int> invalidSubstances_;
260 
262 
// Map from String to int; presumably class label name to numeric class ID - confirm.
264  std::map<String,int> class_names_;
266 
267 
268 
// The following classes are granted access to the non-public members of this class.
270  friend class RegressionValidation;
271  friend class Validation;
272  friend class Model;
273  friend class FitModel;
274  friend class FeatureSelection;
275 
276  };
277 
278  }
279 }
280 
281 #endif // QSARH