00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #ifndef QSARH
00027 #define QSARH
00028
00029 #include <iostream>
00030 #include <BALL/KERNEL/system.h>
00031 #include <BALL/FORMAT/SDFile.h>
00032 #include <BALL/FORMAT/PDBFile.h>
00033 #include <BALL/FORMAT/HINFile.h>
00034 #include <BALL/FORMAT/MOLFile.h>
00035 #include <vector>
00036 #include <list>
00037 #include <set>
00038 #include <map>
00039 #include <math.h>
00040 #include <sstream>
00041 #include <fstream>
00042 #include <limits>
00043 #include <fstream>
00044 #include <BALL/QSAR/simpleDescriptors.h>
00045 #include <BALL/QSAR/connectivityDescriptors.h>
00046 #include <BALL/QSAR/partialChargeDescriptors.h>
00047 #include <BALL/QSAR/surfaceDescriptors.h>
00048 #include <BALL/COMMON/exception.h>
00049 #include <string.h>
00050
00051 #ifndef STATISTICS
00052 #include <BALL/QSAR/statistics.h>
00053 #endif
00054
00055 #ifndef QSAR_EXCEPTION
00056 #include <BALL/QSAR/exception.h>
00057 #endif
00058
00059 #include <gsl/gsl_randist.h>
00060 #include <gsl/gsl_cdf.h>
00061
00062 #include <BALL/CONCEPT/timeStamp.h>
00063
00064
00065
00066
00067
00068 namespace BALL
00069 {
00070 namespace QSAR
00071 {
/** A sequence of double values; the element type of VMatrix.
 *  `vector` is used unqualified, so it relies on a using-declaration made
 *  visible by one of the headers included above. */
00072 typedef vector<double> Column;
/** A matrix stored as a vector of Column objects.
 *  NOTE(review): the name suggests that the outer index selects a column and
 *  the inner index a row -- confirm against the implementation. */
00073 typedef vector<Column> VMatrix;
00074
/**
 * Data container for QSAR input: a descriptor matrix, a response matrix (Y),
 * per-matrix transformation records, and the names of columns and substances.
 * Declares routines for reading SD and CSV files, centering/scaling,
 * transforming, splitting and saving/loading the data.
 *
 * Only declarations are visible in this header. Unless a statement follows
 * directly from a signature, the descriptions below are marked as assumptions
 * (NOTE(review)/presumably) and must be confirmed against the implementation.
 */
00076 class BALL_EXPORT QSARData
00077 {
00078 public:
00079
/** Default constructor. */
00080 QSARData();
00081
/** Destructor (non-virtual). */
00082 ~QSARData();
00083
/** Returns whether the descriptor data is centered.
 *  NOTE(review): presumably true after centerData() has been applied -- confirm. */
00087 bool isDataCentered() const;
00088
/** Returns whether the response variables are centered.
 *  NOTE(review): presumably related to centerData(true) -- confirm. */
00090 bool isResponseCentered() const;
00091
/** Reads property names from the given SD file.
 *  @param sd_file path of the SD file (passed by value)
 *  @return pointer to a vector of names; ownership of the returned object is
 *          not visible from this declaration -- TODO confirm who deletes it. */
00096 vector<String>* readPropertyNames(String sd_file);
00097
/** Reads an SD file, given only its path.
 *  NOTE(review): how activities are chosen in this overload is not visible here. */
00101 void readSDFile(const char* file);
00102
/** Reads an SD file with explicit control over activities and descriptors.
 *  @param file path of the SD file
 *  @param act IDs passed by non-const reference; presumably the indices of the
 *         properties to be used as activities (response variables) -- confirm
 *  @param useExDesc defaults to 1; presumably "use external descriptors", i.e.
 *         properties stored in the file -- confirm
 *  @param append defaults to 0; presumably appends to already loaded data -- confirm
 *  @param translate_class_labels defaults to 0; presumably maps textual class
 *         labels to integers (cf. class_names_) -- confirm */
00108 void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
00109
/** Calculates descriptors for the given molecule.
 *  @param m the molecule (non-const reference, so it may be modified)
 *  NOTE(review): presumably uses the BALL descriptor classes whose headers are
 *  included by this file (simple, connectivity, partial charge, surface). */
00113 void calculateBALLDescriptors(Molecule& m);
00114
/** Displays a matrix; presumably descriptor_matrix_, for debugging -- confirm. */
00116 void displayMatrix();
00117
/** Centers the data.
 *  @param center_Y defaults to 0; presumably also centers the response
 *         variables when set -- confirm. */
00120 void centerData(bool center_Y=0);
00121
/** Scales all descriptors.
 *  NOTE(review): the target range/scale is not visible from this declaration. */
00123 void scaleAllDescriptors();
00124
/** Returns the number of substances (presumably rows of the data set). */
00126 unsigned int getNoSubstances() const;
00127
/** Returns the number of descriptors (presumably columns of descriptor_matrix_). */
00129 unsigned int getNoDescriptors() const;
00130
/** Reads data from a CSV file.
 *  @param file path of the CSV file
 *  @param no_y presumably the number of response-variable columns -- confirm
 *  @param xlabels presumably whether the file has a header row of column names -- confirm
 *  @param ylabels presumably whether each row starts with a substance name -- confirm
 *  @param sep separator string, defaults to ","
 *  @param appendDescriptors defaults to 0; presumably adds the columns read to
 *         the existing descriptors -- confirm
 *  @param translate_class_labels defaults to 0; presumably maps textual class
 *         labels to integers (cf. class_names_) -- confirm */
00138 void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
00139
/** Manipulates Y according to a list of strings.
 *  @param v presumably one expression per response variable -- confirm */
00141 void manipulateY(vector<String> v);
00142
/** Manipulates Y according to a single string.
 *  @param v presumably one expression applied to the response -- confirm */
00145 void manipulateY(String v);
00146
/** Discretizes Y using the given thresholds (passed by value).
 *  NOTE(review): the mapping from thresholds to class values is not visible here. */
00149 void discretizeY(vector<double> thresholds);
00150
/** Transforms X (presumably the descriptor matrix) according to a list of strings.
 *  @param v presumably one expression per descriptor -- confirm */
00151 void transformX(vector<String> v);
00152
/** Partitions the input data.
 *  @param p presumably the number of partitions -- confirm
 *  @return vector of raw QSARData pointers; ownership is not visible from this
 *          declaration -- TODO confirm that the caller must delete them. */
00154 vector<QSARData*> partitionInputData(int p);
00155
/** Saves this object to the named file; does not modify the object (const). */
00157 void saveToFile(string filename) const;
00158
/** Reads this object's data from the named file.
 *  NOTE(review): presumably the counterpart of saveToFile() -- confirm. */
00160 void readFromFile(string filename);
00161
/** Generates an external set from this data without modifying it (const).
 *  @param fraction presumably the share of substances placed in the external set -- confirm
 *  @return vector of raw QSARData pointers; contents, order and ownership are
 *          not visible from this declaration -- TODO confirm. */
00164 vector<QSARData*> generateExternalSet(double fraction) const;
00165
/** Splits this data without modifying it (const).
 *  @param no_test_splits presumably the total number of test splits -- confirm
 *  @param current_test_split_id presumably selects which split is the test part -- confirm
 *  @param response_id defaults to 0; presumably the response variable that
 *         guides the split -- confirm
 *  @return vector of raw QSARData pointers; ownership not visible here -- TODO confirm. */
00170 vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
00171
/** Returns a pointer to the values of substance s.
 *  NOTE(review): presumably its descriptor values; ownership of the returned
 *  vector and any back-transformation are not visible here -- confirm. */
00173 vector<double>* getSubstance(int s) const;
00174
/** Returns a pointer to the activity values of substance s.
 *  NOTE(review): ownership of the returned vector is not visible here -- confirm. */
00176 vector<double>* getActivity(int s) const;
00177
/** Returns the number of response variables. */
00179 unsigned int getNoResponseVariables() const;
00180
/** Returns a pointer to a constant vector of substance names
 *  (presumably the address of substance_names_ -- confirm). */
00181 const vector<string>* getSubstanceNames() const;
00182
/** Checks for discrete Y values in the stored data.
 *  NOTE(review): the exact criterion for "discrete" is not visible here. */
00184 bool checkforDiscreteY() const;
00185
00186
/** Checks for discrete Y values using a file and a set of activity IDs.
 *  @param file path of the input file (format not visible here -- confirm)
 *  @param activity_IDs IDs passed by non-const reference; presumably the
 *         properties to be treated as activities -- confirm */
00188 bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
00189
/** Sets the data folder (presumably stored in data_folder_ -- confirm). */
00191 void setDataFolder(const char* folder);
00192
/** Removes highly correlated compounds.
 *  Both thresholds are taken by non-const reference.
 *  NOTE(review): whether they are modified is not visible here -- confirm. */
00195 void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
00196
/** Collects descriptors similar to the given one, without modifying this object (const).
 *  @param descriptor_ID the descriptor to compare against
 *  @param correlation presumably the minimum correlation to count as similar -- confirm
 *  @param similar_descriptor_IDs output list of (uint, String) pairs;
 *         presumably descriptor index and name -- confirm */
00202 void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
00204
00205
00206 protected:
00207
/** Sets the descriptor names from a molecule.
 *  @param m the molecule (read-only)
 *  @param activity_IDs IDs passed by non-const reference; presumably the
 *         properties that are activities and thus not descriptors -- confirm
 *  @param useExDesc defaults to 1; see the readSDFile() parameter of the same name */
00212 void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1);
00213
/** Removes the descriptors whose IDs are given.
 *  NOTE(review): presumably removes columns from descriptor_matrix_ -- confirm. */
00216 void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
00217
/** Removes the substances whose IDs are given.
 *  NOTE(review): presumably removes the matching entries from all matrices -- confirm. */
00218 void removeInvalidSubstances(std::multiset<int>& inv);
00219
/** Reads a matrix from an input file stream.
 *  @param mat destination matrix
 *  @param in stream to read from
 *  @param seperator field separator character (parameter name spelled as in the original)
 *  @param lines presumably the number of lines to read -- confirm
 *  @param col presumably the number of columns per line -- confirm */
00221 void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
00222
/** Checks the given activity IDs against the number of properties.
 *  NOTE(review): the reaction to invalid IDs (exception or otherwise) is not visible here. */
00225 void checkActivityIDs(std::multiset<int>& act, int no_properties);
00226
/** Inserts substance s of another QSARData object into this one.
 *  @param source object to copy from (read-only)
 *  @param s index of the substance within source
 *  @param backtransformation defaults to 0; presumably undoes centering or
 *         transformation of the copied values -- confirm */
00229 void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
00230
/** Writes the given matrix to an output stream; does not modify this object (const). */
00232 void printMatrix(const VMatrix& mat, std::ostream& out) const;
00234
/** Descriptor values (see VMatrix for the storage layout). */
00239 VMatrix descriptor_matrix_;
00240
/** Response (activity) values. */
00242 VMatrix Y_;
00243
/** Transformation data for the descriptors.
 *  NOTE(review): presumably mean/stddev recorded by centering -- confirm. */
00245 VMatrix descriptor_transformations_;
00246
/** Transformation data for the response variables.
 *  NOTE(review): presumably mean/stddev recorded by centering -- confirm. */
00248 VMatrix y_transformations_;
00249
/** Names of the columns (presumably one per descriptor -- confirm). */
00251 vector<string> column_names_;
00252
/** Names of the substances. */
00254 vector<string> substance_names_;
00255
/** IDs of descriptors marked as invalid. */
00257 std::multiset<int> invalidDescriptors_;
00258
/** IDs of substances marked as invalid. */
00259 std::multiset<int> invalidSubstances_;
00260
/** Data folder (presumably the value set by setDataFolder() -- confirm). */
00261 String data_folder_;
00262
/** Map from class name to integer.
 *  NOTE(review): presumably filled when translate_class_labels is set -- confirm. */
00264 std::map<String,int> class_names_;
00266
00267
00268
/* The classes below are friends and may access the protected members directly. */
00269 friend class ClassificationValidation;
00270 friend class RegressionValidation;
00271 friend class Validation;
00272 friend class Model;
00273 friend class FitModel;
00274 friend class FeatureSelection;
00275
00276 };
00277
00278 }
00279 }
00280
00281 #endif // QSARH