00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026 #ifndef QSARH
00027 #define QSARH
00028
00029 #include <iostream>
00030 #include <vector>
00031 #include <list>
00032 #include <set>
00033 #include <map>
00034 #include <math.h>
00035 #include <sstream>
00036 #include <fstream>
00037 #include <limits>
00038 #include <fstream>
00039 #include <BALL/COMMON/exception.h>
00040 #include <string.h>
00041
00042 #ifndef STATISTICS
00043 #include <BALL/QSAR/statistics.h>
00044 #endif
00045
00046 #ifndef QSAR_EXCEPTION
00047 #include <BALL/QSAR/exception.h>
00048 #endif
00049
00050 #include <gsl/gsl_randist.h>
00051 #include <gsl/gsl_cdf.h>
00052
00053 #include <BALL/CONCEPT/timeStamp.h>
00054
00055
00056
00057
00058
00059 namespace BALL
00060 {
00061 namespace ML
00062 {
// Column is an alias for a vector of double values; VMatrix is an alias for a
// vector of Column, i.e. a nested vector<vector<double>>.
// NOTE(review): `vector` is used unqualified; it relies on a using-declaration
// or using-directive that is not visible in this listing — confirm.
// NOTE(review): whether a Column holds one descriptor across all substances or
// one substance across all descriptors cannot be told from here — confirm
// against the implementation.
00063 typedef vector<double> Column;
00064 typedef vector<Column> VMatrix;
00065
// MLData: class declaring storage for a descriptor matrix, a response matrix
// (Y_), transformation tables for both, column and substance names, and a set
// of operations for reading (SD file, CSV, own file format), preprocessing,
// splitting and saving that data.
//
// Only declarations are visible here. Every statement below that goes beyond
// what the signatures show is marked as an assumption to be confirmed against
// the implementation file.
//
// NOTE(review): the leading five-digit number on each line is a listing line
// number carried in the text, not a C++ token.
// NOTE(review): several members use the type `QSARData`, which is not declared
// anywhere in the visible text, while this class is named `MLData` — confirm
// where QSARData comes from and whether it is meant to be this class.
// NOTE(review): `vector`, `string`, `String`, `Molecule` and `uint` are used
// unqualified and are not declared by anything visible in this listing —
// confirm which headers / using-declarations provide them.
00067 class BALL_EXPORT MLData
00068 {
00069 public:
00070
// Default constructor (takes no arguments).
00071 MLData();
00072
// Destructor. It is declared non-virtual.
00073 ~MLData();
00074
// Const query returning a bool.
// Presumably reports whether the descriptor data has been centered (compare
// centerData()) — confirm.
00078 bool isDataCentered() const;
00079
// Const query returning a bool.
// Presumably reports whether the response values have been centered (compare
// the center_Y parameter of centerData()) — confirm.
00081 bool isResponseCentered() const;
00082
// Takes an SD file name (by value) and returns a pointer to a vector of String.
// NOTE(review): the raw pointer return does not express ownership — confirm
// whether the caller is responsible for deleting the result.
00087 vector<String>* readPropertyNames(String sd_file);
00088
// Single-argument overload taking only the file path as a C string.
// NOTE(review): how this overload chooses the response properties is not
// visible here — confirm.
00092 void readSDFile(const char* file);
00093
// Overload taking the file path plus:
//   act                    - multiset of int passed by non-const reference, so
//                            the callee is able to modify it; presumably the
//                            IDs of the properties to use as activities —
//                            confirm.
//   useExDesc              - defaults to 1 (true).
//   append                 - defaults to 0 (false).
//   translate_class_labels - defaults to 0 (false).
// NOTE(review): the exact meaning of the three flags is not visible in this
// header — confirm against the implementation.
00099 void readSDFile(const char* file, std::multiset<int>& act, bool useExDesc=1, bool append=0, bool translate_class_labels=0);
00100
// Takes a Molecule by non-const reference.
// Presumably computes descriptor values for that molecule — confirm, including
// where the results are stored.
00104 void calculateBALLDescriptors(Molecule& m);
00105
// Non-const member taking no arguments and returning nothing.
// Presumably prints the stored matrix — confirm the output destination.
00107 void displayMatrix();
00108
// center_Y defaults to 0 (false).
// Presumably centers the descriptor data and, if center_Y is set, the response
// values as well — confirm.
00111 void centerData(bool center_Y=0);
00112
// Non-const member taking no arguments and returning nothing.
// NOTE(review): the kind of scaling applied is not visible here — confirm.
00114 void scaleAllDescriptors();
00115
// Const query returning an unsigned int; presumably the number of substances
// currently stored — confirm.
00117 unsigned int getNoSubstances() const;
00118
// Const query returning an unsigned int; presumably the number of descriptors
// currently stored — confirm.
00120 unsigned int getNoDescriptors() const;
00121
// Parameters:
//   file                   - file path as a C string.
//   no_y                   - int; presumably the number of response columns in
//                            the file — confirm.
//   xlabels, ylabels       - bool flags without defaults; presumably state
//                            whether label rows/columns are present — confirm.
//   sep                    - separator string, defaults to ",".
//   appendDescriptors      - defaults to 0 (false).
//   translate_class_labels - defaults to 0 (false).
00129 void readCSVFile(const char* file, int no_y, bool xlabels, bool ylabels, const char* sep=",", bool appendDescriptors=0, bool translate_class_labels=0);
00130
// Overload taking a vector of String by value.
// NOTE(review): the expected content of the strings (and whether one entry per
// response variable is required) is not visible here — confirm.
00132 void manipulateY(vector<String> v);
00133
// Overload taking a single String by value.
// NOTE(review): the relation to the vector overload is not visible here —
// confirm.
00136 void manipulateY(String v);
00137
// Takes a vector of double thresholds by value.
// Presumably maps response values to discrete classes using these thresholds —
// confirm, including the required ordering of the thresholds.
00140 void discretizeY(vector<double> thresholds);
00141
// Takes a vector of String by value.
// NOTE(review): presumably the descriptor-side counterpart of manipulateY —
// confirm.
00142 void transformX(vector<String> v);
00143
// Non-const member taking an int and returning a vector of QSARData pointers
// by value.
// NOTE(review): the meaning of p and the ownership of the returned pointers are
// not visible here — confirm.
00145 vector<QSARData*> partitionInputData(int p);
00146
// Const member taking the file name by value.
// NOTE(review): the file format is not visible here — presumably the one
// consumed by readFromFile(); confirm.
00148 void saveToFile(string filename) const;
00149
// Non-const member taking the file name by value.
00151 void readFromFile(string filename);
00152
// Const member taking a double and returning a vector of QSARData pointers by
// value.
// NOTE(review): the valid range of fraction, the number and order of the
// returned objects and their ownership are not visible here — confirm.
00155 vector<QSARData*> generateExternalSet(double fraction) const;
00156
// Const member returning a vector of QSARData pointers by value.
//   no_test_splits        - int.
//   current_test_split_id - int.
//   response_id           - int, defaults to 0.
// NOTE(review): the splitting scheme and ownership of the returned pointers are
// not visible here — confirm.
00161 vector<QSARData*> evenSplit(int no_test_splits, int current_test_split_id, int response_id=0) const;
00162
// Const member returning a pointer to a vector of double for index s.
// NOTE(review): whether the result is newly allocated (caller deletes) or
// points into internal storage is not visible here — confirm.
00164 vector<double>* getSubstance(int s) const;
00165
// Const member returning a pointer to a vector of double for index s.
// NOTE(review): same ownership question as getSubstance() — confirm.
00167 vector<double>* getActivity(int s) const;
00168
// Const query returning an unsigned int; presumably the number of response
// variables stored in Y_ — confirm.
00170 unsigned int getNoResponseVariables() const;
00171
// Const member returning a pointer to a const vector of string.
// Presumably points at substance_names_ — confirm.
00172 const vector<string>* getSubstanceNames() const;
00173
// Const query taking no arguments and returning a bool.
// Presumably checks the response values already stored in this object —
// confirm.
00175 bool checkforDiscreteY() const;
00176
// Const overload taking a file path and a multiset of int by non-const
// reference.
// Presumably performs the check on the given file's activity properties instead
// of the stored data — confirm.
00178 bool checkforDiscreteY(const char* file, std::multiset<int>& activity_IDs) const;
00179
// Takes a folder path as a C string; presumably stored in data_folder_ —
// confirm.
00181 void setDataFolder(const char* folder);
00182
// Both thresholds are passed by non-const reference, so the callee is able to
// modify the caller's variables.
// NOTE(review): whether they are actually written to is not visible here —
// confirm.
00185 void removeHighlyCorrelatedCompounds(double& compound_cor_threshold, double& feature_cor_threshold);
00186
// Const member. similar_descriptor_IDs is passed by non-const reference and is
// a list of (uint, String) pairs, which makes it the channel for results.
// NOTE(review): whether the list is cleared first, and what the pair members
// mean, is not visible here — confirm.
00192 void getSimilarDescriptors(int descriptor_ID, double correlation, std::list<std::pair<uint,String> >& similar_descriptor_IDs) const;
00194
// Members below are accessible to this class and to classes derived from it.
00195 protected:
00196
// Takes a Molecule by const reference, a multiset of int by non-const
// reference, and useExDesc, which defaults to 1 (true).
// Presumably fills column_names_ — confirm.
00201 void setDescriptorNames(const Molecule& m, std::multiset<int>& activity_IDs, bool useExDesc=1);
00202
// Takes a multiset of int by non-const reference.
// NOTE(review): whether the entries are column indices is not visible here —
// confirm.
00205 void removeInvalidDescriptors(std::multiset<int>& invalidDescriptors);
00206
// Takes a multiset of int by non-const reference.
// NOTE(review): whether the entries are substance indices is not visible here —
// confirm.
00207 void removeInvalidSubstances(std::multiset<int>& inv);
00208
// mat is passed by non-const reference and in is an input file stream, so mat
// is the destination and in the source.
// The separator is a single char here, unlike the C-string sep of
// readCSVFile(). lines and col are unsigned; presumably the dimensions to
// read — confirm.
00210 void readMatrix(VMatrix& mat, std::ifstream& in, char seperator, unsigned int lines, unsigned int col);
00211
// Takes a multiset of int by non-const reference and an int.
// Presumably validates the IDs against the number of available properties —
// confirm, including what happens on failure.
00214 void checkActivityIDs(std::multiset<int>& act, int no_properties);
00215
// Takes a pointer to a const QSARData, an int index s, and backtransformation,
// which defaults to 0 (false).
// Presumably copies substance s from source into this object — confirm.
00218 void insertSubstance(const QSARData* source, int s, bool backtransformation=0);
00219
// Const member writing to the given output stream; mat is taken by const
// reference.
00221 void printMatrix(const VMatrix& mat, std::ostream& out) const;
00223
// Nested vector of doubles holding the descriptor values.
// NOTE(review): the orientation is not visible here — confirm.
00228 VMatrix descriptor_matrix_;
00229
// Nested vector of doubles; presumably the response (activity) values —
// confirm.
00231 VMatrix Y_;
00232
// Nested vector of doubles; presumably the parameters of the transformations
// applied to the descriptors — confirm.
00234 VMatrix descriptor_transformations_;
00235
// Nested vector of doubles; presumably the parameters of the transformations
// applied to the responses — confirm.
00237 VMatrix y_transformations_;
00238
// Vector of string; presumably one name per descriptor column — confirm.
00240 vector<string> column_names_;
00241
// Vector of string; presumably one name per substance — confirm.
00243 vector<string> substance_names_;
00244
// Multiset of int; presumably IDs of descriptors considered invalid — confirm.
00246 std::multiset<int> invalidDescriptors_;
00247
// Multiset of int; presumably IDs of substances considered invalid — confirm.
00248 std::multiset<int> invalidSubstances_;
00249
// String; presumably the folder set through setDataFolder() — confirm.
00250 String data_folder_;
00251
// Map from String to int; presumably class label name to numeric class ID, used
// when translate_class_labels is set — confirm.
00253 std::map<String,int> class_names_;
00255 };
00256
00257 }
00258 }
00259
00260 #endif // QSARH