BALL  1.4.79
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Macros Groups Pages
Public Member Functions | Friends | List of all members
BALL::QSAR::QSARData Class Reference

#include <BALL/QSAR/QSARData.h>

Inheritance diagram for BALL::QSAR::QSARData:
BALL::Rescoring::RescoreQSARData

Public Member Functions

 QSARData ()
 
 ~QSARData ()
 
Predicates
bool isDataCentered () const
 
bool isResponseCentered () const
 

Protected Attributes

Attributes
VMatrix descriptor_matrix_
 
VMatrix Y_
 
VMatrix descriptor_transformations_
 
VMatrix y_transformations_
 
vector< string > column_names_
 
vector< string > substance_names_
 
std::multiset< int > invalidDescriptors_
 
std::multiset< int > invalidSubstances_
 
String data_folder_
 
std::map< String, int > class_names_
 

Friends

class ClassificationValidation
 
class RegressionValidation
 
class Validation
 
class Model
 
class FitModel
 
class FeatureSelection
 

Accessors

vector< String > * readPropertyNames (String sd_file)
 
void readSDFile (const char *file)
 
void readSDFile (const char *file, std::multiset< int > &act, bool useExDesc=1, bool append=0, bool translate_class_labels=0)
 
void readSDFile (const char *file, std::set< String > &activity_names, bool useExDesc=1, bool append=0, bool translate_class_labels=0, bool calc_phychem_properties=1, bool calc_topological_properties=1)
 
void displayMatrix ()
 
void centerData (bool center_Y=0)
 
void scaleAllDescriptors ()
 
unsigned int getNoSubstances () const
 
unsigned int getNoDescriptors () const
 
void readCSVFile (const char *file, int no_y, bool xlabels, bool ylabels, const char *sep=",", bool appendDescriptors=0, bool translate_class_labels=0)
 
void manipulateY (std::vector< String > v)
 
void manipulateY (String v)
 
void discretizeY (std::vector< double > thresholds)
 
void transformX (std::vector< String > v)
 
std::vector< QSARData * > partitionInputData (int p)
 
void saveToFile (string filename) const
 
void readFromFile (string filename)
 
std::vector< QSARData * > generateExternalSet (double fraction) const
 
std::vector< QSARData * > evenSplit (int no_test_splits, int current_test_split_id, int response_id=0) const
 
std::vector< double > * getSubstance (int s) const
 
std::vector< double > * getActivity (int s) const
 
unsigned int getNoResponseVariables () const
 
const std::vector< string > * getSubstanceNames () const
 
bool checkforDiscreteY () const
 
bool checkforDiscreteY (const char *file, std::multiset< int > &activity_IDs) const
 
void setDataFolder (const char *folder)
 
void removeHighlyCorrelatedCompounds (double &compound_cor_threshold, double &feature_cor_threshold)
 
void getSimilarDescriptors (int descriptor_ID, double correlation, std::list< std::pair< uint, String > > &similar_descriptor_IDs) const
 
void calculateBALLDescriptors (Molecule &m)
 
void calculateTopologicalDescriptors (Molecule &mol, MolecularSimilarity &molsim, const std::map< String, int > &descriptor_map)
 
void setDescriptorNames (const Molecule &m, std::multiset< int > &activity_IDs, bool useExDesc=1, bool resize=1)
 
void removeInvalidDescriptors (std::multiset< int > &invalidDescriptors)
 
void removeInvalidSubstances (std::multiset< int > &inv)
 
void readMatrix (VMatrix &mat, std::ifstream &in, char seperator, unsigned int lines, unsigned int col)
 
void checkActivityIDs (std::multiset< int > &act, int no_properties)
 
void insertSubstance (const QSARData *source, int s, bool backtransformation=0)
 
void printMatrix (const VMatrix &mat, std::ostream &out) const
 

Detailed Description

QSAR

Definition at line 55 of file QSARData.h.

Constructor & Destructor Documentation

BALL::QSAR::QSARData::QSARData ( )
BALL::QSAR::QSARData::~QSARData ( )

Member Function Documentation

void BALL::QSAR::QSARData::calculateBALLDescriptors ( Molecule &  m)
protected

Calculates descriptors for one molecule and saves them into one new line of descriptor_matrix

void BALL::QSAR::QSARData::calculateTopologicalDescriptors ( Molecule &  mol,
MolecularSimilarity &  molsim,
const std::map< String, int > &  descriptor_map 
)
protected

Calculates topological descriptors based on functional groups counts done by SMARTS matching

void BALL::QSAR::QSARData::centerData ( bool  center_Y = 0)

centers each descriptor to mean of 0 and stddev of 1

Parameters
center_Y: if ==1, activity values are also centered. Obviously this should NOT be used for classification experiments!
void BALL::QSAR::QSARData::checkActivityIDs ( std::multiset< int > &  act,
int  no_properties 
)
protected

checks whether the given list of activity IDs contains any values <0 or values that are larger than the number of properties in the current input file.
If such values are found, an Exception of type InvalidActivityID is thrown.

bool BALL::QSAR::QSARData::checkforDiscreteY ( ) const

checks whether the response variables contain only discrete values. This can be used to check whether the current input data set is suitable for a ClassificationModel

bool BALL::QSAR::QSARData::checkforDiscreteY ( const char *  file,
std::multiset< int > &  activity_IDs 
) const

checks whether the response variables of a specified file contain only discrete values.

void BALL::QSAR::QSARData::discretizeY ( std::vector< double >  thresholds)

Discretize the response values. If the response variable(s) of this data object have been normalized, the given thresholds will be automatically normalized accordingly.

Parameters
thresholds: d thresholds for the d+1 classes that are to be created
void BALL::QSAR::QSARData::displayMatrix ( )

show descriptor_matrix on stdout

std::vector<QSARData*> BALL::QSAR::QSARData::evenSplit ( int  no_test_splits,
int  current_test_split_id,
int  response_id = 0 
) const

Split this data set into a training set and a test set. In contrast to generateExternalSet(), compounds for the test set are not randomly selected. Instead, this data set is first sorted according to response values (in order to ensure equal response value ranges) and then split regularly into training and test set.

Parameters
no_test_splits: the total number of splits you want to create by successive calls of this function
current_test_split_id: the split to be produced, with 0<=current_test_split_id<no_test_splits
std::vector<QSARData*> BALL::QSAR::QSARData::generateExternalSet ( double  fraction) const

generates a training and an external validation set from the current QSARData object

Parameters
fraction: the fraction of the current compounds that should be used as external validation set (by random drawing)
std::vector<double>* BALL::QSAR::QSARData::getActivity ( int  s) const

returns a pointer to a new vector containing the UNcentered response values for the s'th substance of the current data set

unsigned int BALL::QSAR::QSARData::getNoDescriptors ( ) const

returns the number of descriptors

unsigned int BALL::QSAR::QSARData::getNoResponseVariables ( ) const

returns the number of response variables

unsigned int BALL::QSAR::QSARData::getNoSubstances ( ) const

returns the number of substances

void BALL::QSAR::QSARData::getSimilarDescriptors ( int  descriptor_ID,
double  correlation,
std::list< std::pair< uint, String > > &  similar_descriptor_IDs 
) const

Find all descriptors of the current data set that have a correlation of at least 'correlation' to the specified feature

Parameters
descriptor_ID: the ID of the descriptor for which similar features should be searched
correlation: the desired minimal correlation
similar_descriptor_IDs: list to which the IDs of the found descriptors will be saved as pairs of descriptor ID and descriptor name
std::vector<double>* BALL::QSAR::QSARData::getSubstance ( int  s) const

returns a pointer to a new vector containing the UNcentered descriptor values for the s'th substance of the current data set

const std::vector<string>* BALL::QSAR::QSARData::getSubstanceNames ( ) const
void BALL::QSAR::QSARData::insertSubstance ( const QSARData *  source,
int  s,
bool  backtransformation = 0 
)
protected

appends compound no <s> taken from the given source to the data of this object.

Parameters
backtransformation: if set to true, all features of the compound are back-transformed after adding them to this object.
bool BALL::QSAR::QSARData::isDataCentered ( ) const

tells whether the features have been centered

bool BALL::QSAR::QSARData::isResponseCentered ( ) const

tells whether the response variables have been centered

void BALL::QSAR::QSARData::manipulateY ( std::vector< String >  v)

for testing purposes only: change Y-matrix according to the given equations

void BALL::QSAR::QSARData::manipulateY ( String  v)

for testing purposes only: change Y-matrix according to the given equation

Parameters
v: string containing the equation, e.g. "x1+x3*5+x10^2"
std::vector<QSARData*> BALL::QSAR::QSARData::partitionInputData ( int  p)

partitions the input data into p QSARData objects of (approx.) equal size.

void BALL::QSAR::QSARData::printMatrix ( const VMatrix &  mat,
std::ostream &  out 
) const
protected

prints a vector-based matrix to a file

void BALL::QSAR::QSARData::readCSVFile ( const char *  file,
int  no_y,
bool  xlabels,
bool  ylabels,
const char *  sep = ",",
bool  appendDescriptors = 0,
bool  translate_class_labels = 0 
)

Read input from a csv file.
This file should contain all descriptor values in the first columns and the activity values in the last no_y columns.

Parameters
no_y: the number of activities, i.e. the number of columns containing activity values
xlabels: if ==1, names of descriptors are read from the first line of the table
ylabels: if ==1, names of substances are read from the first column of the table
sep: the character used to separate the cells of the table
appendDescriptors: if set to 1, descriptors will be read from the file and appended as new columns to the current descriptor_matrix
void BALL::QSAR::QSARData::readFromFile ( string  filename)

reconstructs a QSARData object from a text file

void BALL::QSAR::QSARData::readMatrix ( VMatrix &  mat,
std::ifstream &  in,
char  seperator,
unsigned int  lines,
unsigned int  col 
)
protected

reconstructs a vector based matrix from a file

vector<String>* BALL::QSAR::QSARData::readPropertyNames ( String  sd_file)

reads the names of the properties from the first molecule in the given sd-file

void BALL::QSAR::QSARData::readSDFile ( const char *  file)

Fetches input from one sd-file containing all structures and from one file containing the activities of all structures sorted in ascending order.
The latter file is assumed to have the same name as the first one, with only the extension changed to ".txt"

Parameters
file: the sd-file containing the input
void BALL::QSAR::QSARData::readSDFile ( const char *  file,
std::multiset< int > &  act,
bool  useExDesc = 1,
bool  append = 0,
bool  translate_class_labels = 0 
)

Fetches input from one sd-file containing all structures. The activity value for each molecule is taken from its property in the sd-file.

Parameters
act: contains the numbers of the properties that are activity-values
file: the sd-file containing the input
useExDesc: if set to 1, descriptors read from the sd-file will be used in addition to those calculated by BALL internally
append: if set to 1, the substances read from the sd-file will be appended as new lines to the current descriptor_matrix
void BALL::QSAR::QSARData::readSDFile ( const char *  file,
std::set< String > &  activity_names,
bool  useExDesc = 1,
bool  append = 0,
bool  translate_class_labels = 0,
bool  calc_phychem_properties = 1,
bool  calc_topological_properties = 1 
)
void BALL::QSAR::QSARData::removeHighlyCorrelatedCompounds ( double &  compound_cor_threshold,
double &  feature_cor_threshold 
)

removes compounds whose absolute correlation coefficient to another compound is larger than compound_cor_threshold

Parameters
feature_cor_threshold: Only features that do not have a correlation larger than this value to another feature are used to calculate the similarity of compounds (=instances).
void BALL::QSAR::QSARData::removeInvalidDescriptors ( std::multiset< int > &  invalidDescriptors)
protected

removes columns of invalid descriptors from descriptor_matrix

Parameters
invalidDescriptors: list containing the IDs of the columns to be deleted
void BALL::QSAR::QSARData::removeInvalidSubstances ( std::multiset< int > &  inv)
protected
void BALL::QSAR::QSARData::saveToFile ( string  filename) const

saves the current QSARData object to a text file

void BALL::QSAR::QSARData::scaleAllDescriptors ( )

scales each descriptor to stddev of 1

void BALL::QSAR::QSARData::setDataFolder ( const char *  folder)

allows setting the data folder necessary for the computation of descriptors without using the BALL_DATA_PATH environment variable, which is useful for standalone applications

void BALL::QSAR::QSARData::setDescriptorNames ( const Molecule &  m,
std::multiset< int > &  activity_IDs,
bool  useExDesc = 1,
bool  resize = 1 
)
protected

writes the names of all external descriptors into column_names

void BALL::QSAR::QSARData::transformX ( std::vector< String >  v)

Friends And Related Function Documentation

friend class ClassificationValidation
friend

Definition at line 254 of file QSARData.h.

friend class FeatureSelection
friend

Definition at line 259 of file QSARData.h.

friend class FitModel
friend

Definition at line 258 of file QSARData.h.

friend class Model
friend

Definition at line 257 of file QSARData.h.

friend class RegressionValidation
friend

Definition at line 255 of file QSARData.h.

friend class Validation
friend

Definition at line 256 of file QSARData.h.

Member Data Documentation

std::map<String,int> BALL::QSAR::QSARData::class_names_
protected

in case of classification data sets with non-numeric class labels, this member maps the names of the individual classes to their assigned id.

Definition at line 249 of file QSARData.h.

vector<string> BALL::QSAR::QSARData::column_names_
protected

names of all descriptors

Definition at line 236 of file QSARData.h.

String BALL::QSAR::QSARData::data_folder_
protected

Definition at line 246 of file QSARData.h.

VMatrix BALL::QSAR::QSARData::descriptor_matrix_
protected

matrix containing the values of each descriptor for each substance

Definition at line 224 of file QSARData.h.

VMatrix BALL::QSAR::QSARData::descriptor_transformations_
protected

2xm dimensional matrix (m=no of descriptors) containing mean and stddev of each transformed descriptor

Definition at line 230 of file QSARData.h.

std::multiset<int> BALL::QSAR::QSARData::invalidDescriptors_
protected

contains the numbers of external descriptors for which invalid values (e.g. strings instead of numerical values) were encountered in some molecules

Definition at line 242 of file QSARData.h.

std::multiset<int> BALL::QSAR::QSARData::invalidSubstances_
protected

Definition at line 244 of file QSARData.h.

vector<string> BALL::QSAR::QSARData::substance_names_
protected

names of all substances

Definition at line 239 of file QSARData.h.

VMatrix BALL::QSAR::QSARData::Y_
protected

matrix containing the experimentally determined results (active/non-active) for each substance. Different activities are saved column-wise.

Definition at line 227 of file QSARData.h.

VMatrix BALL::QSAR::QSARData::y_transformations_
protected

2xc dimensional matrix (c=no of activities) containing mean and stddev of each transformed activity

Definition at line 233 of file QSARData.h.