00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #ifndef QSARH
00027 #define QSARH
00028
00029 #include <iostream>
00030 #include <BALL/KERNEL/system.h>
00031 #include <BALL/FORMAT/SDFile.h>
00032 #include <BALL/FORMAT/PDBFile.h>
00033 #include <BALL/FORMAT/HINFile.h>
00034 #include <BALL/FORMAT/MOLFile.h>
00035 #include <vector>
00036 #include <list>
00037 #include <set>
00038 #include <map>
00039 #include <math.h>
00040 #include <sstream>
00041 #include <fstream>
00042 #include <limits>
00043 #include <fstream>
00044 #include <BALL/QSAR/simpleDescriptors.h>
00045 #include <BALL/QSAR/connectivityDescriptors.h>
00046 #include <BALL/QSAR/partialChargeDescriptors.h>
00047 #include <BALL/QSAR/surfaceDescriptors.h>
00048 #include <BALL/COMMON/exception.h>
00049 #include <string.h>
00050
00051 #ifndef STATISTICS
00052 #include <BALL/QSAR/statistics.h>
00053 #endif
00054
00055 #ifndef QSAR_EXCEPTION
00056 #include <BALL/QSAR/exception.h>
00057 #endif
00058
00059 #include <gsl/gsl_randist.h>
00060 #include <gsl/gsl_cdf.h>
00061
00062 #include <BALL/CONCEPT/timeStamp.h>
00063
00064
00065
00066
00067
00068 namespace BALL
00069 {
00070 namespace QSAR
00071 {
/** A sequence of double values; used as the element type of VMatrix.
 *  NOTE(review): `vector` is used unqualified here, so this header relies on
 *  a using-declaration/directive coming from one of the included headers. */
00072 typedef vector<double> Column;
/** A matrix represented as a vector of Column objects.
 *  NOTE(review): whether a Column holds one descriptor or one substance is
 *  not visible from this header -- confirm against the implementation. */
00073 typedef vector<Column> VMatrix;
00074
/**
 * Holds the data used for QSAR modelling: a descriptor matrix, a response
 * matrix (Y_), matrices recording transformations of both, and the names of
 * columns and substances (see the protected data members below).
 *
 * Only declarations are visible in this header. Behavioural notes on the
 * individual members are derived from names and signatures and are hedged
 * wherever the declaration alone cannot confirm them.
 */
00076 class BALL_EXPORT QSARData
00077 {
00078 public:
00079
/** Default constructor. */
00080 QSARData();
00081
/** Destructor (declared non-virtual). */
00082 ~QSARData();
00083
/** Presumably reports whether the descriptor data has been centered
 *  (cf. centerData()) -- confirm against the implementation.
 *  Not declared const. */
00087 bool isDataCentered();
00088
/** Presumably reports whether the response values have been centered
 *  (cf. the center_Y flag of centerData()) -- confirm against the
 *  implementation. Not declared const. */
00090 bool isResponseCentered();
00091
/** Presumably collects the property names found in the given SD file.
 *  @param sd_file file to inspect (passed by value)
 *  @return pointer to a vector of names; NOTE(review): ownership of the
 *          returned vector is not visible from this declaration -- confirm
 *          whether the caller has to delete it. */
00096 vector<String>* readPropertyNames(String sd_file);
00097
/** Reads input from an SD file, taking only the file path.
 *  NOTE(review): which properties are treated as activities in this
 *  overload is not visible here -- confirm.
 *  @param file path of the SD file */
00101 void readSDFile(const char* file);
00102
/** Reads input from an SD file.
 *  @param file path of the SD file
 *  @param act presumably the IDs of the properties to be used as response
 *         variables (taken by non-const reference, so it may be modified)
 *  @param useExDesc defaults to true; presumably selects whether properties
 *         contained in the file are used as external descriptors -- confirm
 *  @param append defaults to false; presumably appends to already loaded
 *         data instead of replacing it -- confirm
 *  @param translate_class_labels defaults to false; presumably maps textual
 *         class labels to integer IDs (cf. class_names_) -- confirm */
00108 void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
00109
/** Presumably calculates the BALL descriptors for one molecule and stores
 *  them in this object -- confirm against the implementation.
 *  @param m molecule to process (non-const reference) */
00113 void calculateBALLDescriptors(Molecule& m);
00114
/** Presumably prints the stored data matrix; the output target is not
 *  visible from this declaration. */
00116 void displayMatrix();
00117
/** Centers the data.
 *  @param center_Y defaults to false; presumably selects whether the
 *         response values are centered as well -- confirm */
00120 void centerData(bool center_Y=0);
00121
/** Presumably scales every descriptor column; the scaling scheme is not
 *  visible from this declaration. */
00123 void scaleAllDescriptors();
00124
/** @return the number of substances (presumably rows of input data). */
00126 unsigned int getNoSubstances() const;
00127
/** @return the number of descriptors. */
00129 unsigned int getNoDescriptors() const;
00130
/** Reads input from a character-separated text file.
 *  @param file path of the file
 *  @param no_y presumably the number of response-variable columns -- confirm
 *  @param xlabels presumably whether the file contains descriptor labels
 *  @param ylabels presumably whether the file contains substance labels
 *  @param sep separator string, defaults to ","
 *  @param appendDescriptors defaults to false; presumably adds the columns
 *         read to the already loaded descriptors -- confirm
 *  @param translate_class_labels defaults to false; presumably maps textual
 *         class labels to integer IDs (cf. class_names_) -- confirm */
00138 void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
00139
/** Presumably transforms the response variables using one expression per
 *  response variable -- confirm the expected format of the strings.
 *  @param v expressions (vector passed by value) */
00141 void manipulateY(vector<String> v);
00142
/** Overload taking a single String; presumably applies the same expression
 *  to all response variables -- confirm.
 *  @param v expression (passed by value) */
00145 void manipulateY(String v);
00146
/** Presumably converts the response values into discrete classes using the
 *  given thresholds -- confirm boundary handling in the implementation.
 *  @param thresholds threshold values (vector passed by value) */
00149 void discretizeY(vector<double> thresholds);
00150
/** Presumably the descriptor counterpart of manipulateY(vector<String>)
 *  -- confirm.
 *  @param v expressions (vector passed by value) */
00151 void transformX(vector<String> v);
00152
/** Presumably splits the data into p partitions.
 *  @param p presumably the number of partitions -- confirm
 *  @return vector of raw QSARData pointers; NOTE(review): ownership is not
 *          visible here -- confirm whether the caller must delete them. */
00154 vector<QSARData*> partitionInputData(int p);
00155
/** Writes this object's data to the given file.
 *  @param filename target file (string passed by value) */
00157 void saveToFile(string filename);
00158
/** Restores this object's data from the given file (counterpart of
 *  saveToFile()).
 *  @param filename source file (string passed by value) */
00160 void readFromFile(string filename);
00161
/** Presumably splits the data so that the given fraction forms an external
 *  validation set; declared const, so this object is not modified.
 *  @param fraction presumably a value in [0,1] -- confirm
 *  @return vector of raw QSARData pointers; NOTE(review): the order of the
 *          returned sets and their ownership are not visible here. */
00164 vector<QSARData*> generateExternalSet(double fraction) const;
00165
/** @param s index of the substance
 *  @return pointer to a vector of doubles, presumably the descriptor values
 *          of substance s; NOTE(review): ownership of the returned vector
 *          is not visible here -- confirm. */
00167 vector<double>* getSubstance(int s) const;
00168
/** @param s index of the substance
 *  @return pointer to a vector of doubles, presumably the response values
 *          of substance s; NOTE(review): ownership of the returned vector
 *          is not visible here -- confirm. */
00170 vector<double>* getActivity(int s) const;
00171
/** @return the number of response variables. */
00173 unsigned int getNoResponseVariables() const;
00174
/** @return pointer to a const vector of substance names (presumably
 *          substance_names_ -- confirm). */
00175 const vector<string>* getSubstanceNames() const;
00176
/** Presumably checks whether the stored response variables contain only
 *  discrete values -- confirm. */
00178 bool checkforDiscreteY() const;
00179
00180
/** Overload that takes a file and a set of activity IDs; presumably performs
 *  the same check on the file's contents instead of the stored data.
 *  @param file path of the input file
 *  @param activity_IDs IDs of the activities to check (non-const reference) */
00182 bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
00183
/** Sets the data folder (presumably stored in data_folder_).
 *  @param folder folder path */
00185 void setDataFolder(const char* folder);
00186
/** Presumably removes compounds whose correlation to other compounds
 *  exceeds the threshold.
 *  @param compound_cor_threshold correlation threshold for compounds
 *  @param feature_cor_threshold correlation threshold for features
 *  NOTE(review): both thresholds are taken by non-const reference; whether
 *  they are written to is not visible from this declaration. */
00189 void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
00190
/** Presumably collects the descriptors correlated to the given one.
 *  @param descriptor_ID ID of the reference descriptor
 *  @param correlation presumably the minimal correlation required -- confirm
 *  @param similar_descriptor_IDs output list of (uint, String) pairs,
 *         presumably descriptor ID and name */
00195 void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs);
00197
00198
00199 protected:
00200
/** Presumably fills column_names_ from the given molecule.
 *  @param m molecule to take the names from
 *  @param activity_IDs IDs of the activity properties (non-const reference)
 *  @param useExDesc defaults to true; cf. readSDFile() */
00205 void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1);
00206
/** Presumably removes the descriptors with the given IDs.
 *  @param invalidDescriptors IDs to remove (non-const reference) */
00209 void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
00210
/** Presumably removes the substances with the given IDs.
 *  @param inv IDs to remove (non-const reference) */
00211 void removeInvalidSubstances(std::multiset<int>& inv);
00212
/** Reads a matrix from an input stream.
 *  @param mat matrix to fill
 *  @param in stream to read from
 *  @param seperator separator character
 *  @param lines presumably the number of lines to read
 *  @param col presumably the number of columns per line */
00214 void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
00215
/** Presumably validates the given activity IDs against the number of
 *  available properties; how invalid IDs are reported is not visible here.
 *  @param act activity IDs (non-const reference)
 *  @param no_properties number of properties */
00218 void checkActivityIDs(std::multiset<int>& act, int no_properties);
00219
/** Presumably copies substance s from another QSARData object into this one.
 *  @param source object to copy from
 *  @param s index of the substance in source
 *  @param backtransformation defaults to false; presumably undoes the
 *         transformations of source while copying -- confirm */
00222 void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
00223
/** Writes a matrix to an output stream.
 *  @param mat matrix to print
 *  @param out stream to write to */
00225 void printMatrix(VMatrix& mat, std::ostream& out);
00227
/** Descriptor values. NOTE(review): row/column orientation is not visible
 *  from this header. */
00232 VMatrix descriptor_matrix_;
00233
/** Response (activity) values. */
00235 VMatrix Y_;
00236
/** Presumably the parameters of the transformations applied to the
 *  descriptors (cf. centerData()) -- confirm. */
00238 VMatrix descriptor_transformations_;
00239
/** Presumably the parameters of the transformations applied to the
 *  response values -- confirm. */
00241 VMatrix y_transformations_;
00242
/** Names of the columns (presumably descriptor names). */
00244 vector<string> column_names_;
00245
/** Names of the substances. */
00247 vector<string> substance_names_;
00248
/** IDs of descriptors considered invalid (cf. removeInvalidDescriptors()). */
00250 std::multiset<int> invalidDescriptors_;
00251
/** IDs of substances considered invalid (cf. removeInvalidSubstances()). */
00252 std::multiset<int> invalidSubstances_;
00253
/** Data folder (cf. setDataFolder()). */
00254 String data_folder_;
00255
/** Maps a String to an int; presumably class-label name to class ID
 *  (cf. translate_class_labels) -- confirm. */
00257 std::map<String,int> class_names_;
00259
00260
00261
/* The classes below are granted access to the protected members above. */
00262 friend class ClassificationValidation;
00263 friend class RegressionValidation;
00264 friend class Validation;
00265 friend class Model;
00266 friend class FitModel;
00267 friend class FeatureSelection;
00268
00269 };
00270
00271 }
00272 }
00273
00274 #endif // QSARH