OpenMS  2.5.0
FASTAContainer.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2020.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Chris Bielow $
32 // $Authors: Chris Bielow $
33 // --------------------------------------------------------------------------
34 
35 #pragma once
36 
43 
44 #include <functional>
45 #include <fstream>
46 #include <map>
47 #include <memory>
48 #include <utility>
49 #include <vector>
50 
51 #include <boost/regex.hpp>
52 
53 namespace OpenMS
54 {
55 
56  struct TFI_File;
57  struct TFI_Vector;
58 
81 template<typename TBackend>
82 class FASTAContainer; // prototype
83 
92 template<>
93 class FASTAContainer<TFI_File>
94 {
95 public:
96  FASTAContainer() = delete;
97 
99  FASTAContainer(const String& FASTA_file)
100  : f_(),
101  offsets_(),
102  data_fg_(),
103  data_bg_(),
104  chunk_offset_(0)
105  {
106  f_.readStart(FASTA_file);
107  }
108 
110  size_t getChunkOffset() const
111  {
112  return chunk_offset_;
113  }
114 
122  {
123  chunk_offset_ += data_fg_.size();
124  data_fg_.swap(data_bg_);
125  data_bg_.clear(); // just in case someone calls activateCache() multiple times...
126  return !data_fg_.empty();
127  }
128 
135  bool cacheChunk(int suggested_size)
136  {
137  data_bg_.clear();
138  data_bg_.reserve(suggested_size);
140  for (int i = 0; i < suggested_size; ++i)
141  {
142  std::streampos spos = f_.position();
143  if (!f_.readNext(p)) break;
144  data_bg_.push_back(std::move(p));
145  offsets_.push_back(spos);
146  }
147  return !data_bg_.empty();
148  }
149 
151  size_t chunkSize() const
152  {
153  return data_fg_.size();
154  }
155 
163  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
164  {
165  return data_fg_[pos];
166  }
167 
181  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos)
182  {
183  // check if position is currently cached...
184  if (chunk_offset_ <= pos && pos < chunk_offset_ + chunkSize())
185  {
186  protein = data_fg_[pos - chunk_offset_];
187  return true;
188  }
189  // read anew from disk...
190  if (pos >= offsets_.size())
191  {
192  throw Exception::IndexOverflow(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, pos, offsets_.size());
193  }
194  std::streampos spos = f_.position(); // save old position
195  if (!f_.setPosition(offsets_[pos])) return false;
196  bool r = f_.readNext(protein);
197  f_.setPosition(spos); // restore old position
198  return r;
199  }
200 
202  bool empty() const
203  { // trusting the FASTA file can be read...
204  return f_.atEnd() && offsets_.empty();
205  }
206 
208  void reset()
209  {
210  f_.setPosition(0);
211  offsets_.clear();
212  data_fg_.clear();
213  data_bg_.clear();
214  chunk_offset_ = 0;
215  }
216 
217 
223  size_t size() const
224  {
225  return offsets_.size();
226  }
227 
228 private:
230  std::vector<std::streampos> offsets_;
231  std::vector<FASTAFile::FASTAEntry> data_fg_;
232  std::vector<FASTAFile::FASTAEntry> data_bg_;
233  size_t chunk_offset_;
234 };
235 
242 template<>
243 class FASTAContainer<TFI_Vector>
244 {
245 public:
246  FASTAContainer() = delete;
247 
253  FASTAContainer(const std::vector<FASTAFile::FASTAEntry>& data)
254  : data_(data)
255  {
256  }
257 
259  size_t getChunkOffset() const
260  {
261  return 0;
262  }
263 
269  {
270  if (!activate_count_)
271  {
272  activate_count_ = 1;
273  return true;
274  }
275  return false;
276  }
277 
281  bool cacheChunk(int /*suggested_size*/)
282  {
283  if (!cache_count_)
284  {
285  cache_count_ = 1;
286  return true;
287  }
288  return false;
289  }
290 
295  size_t chunkSize() const
296  {
297  return data_.size();
298  }
299 
301  const FASTAFile::FASTAEntry& chunkAt(size_t pos) const
302  {
303  return data_[pos];
304  }
305 
307  bool readAt(FASTAFile::FASTAEntry& protein, size_t pos) const
308  {
309  protein = data_[pos];
310  return true;
311  }
312 
314  bool empty() const
315  {
316  return data_.empty();
317  }
318 
320  size_t size() const
321  {
322  return data_.size();
323  }
324 
326  void reset()
327  {
328  activate_count_ = 0;
329  cache_count_ = 0;
330  }
331 
332 private:
333  const std::vector<FASTAFile::FASTAEntry>& data_;
334  int activate_count_ = 0;
335  int cache_count_ = 0;
336 };
337 
342 {
343 public:
344  struct Result
345  {
346  bool success; //< did >=40% of proteins have the *same* prefix or suffix
347  String name; //< on success, what was the decoy string?
348  bool is_prefix; //< on success, was it a prefix or suffix
349  };
350 
358  template<typename T>
360  {
361  // common decoy strings in FASTA files
362  // note: decoy prefixes/suffices must be provided in lower case
363  const std::vector<std::string> affixes{ "decoy", "dec", "reverse", "rev", "reversed", "__id_decoy", "xxx", "shuffled", "shuffle", "pseudo", "random" };
364 
365  // map decoys to counts of occurrences as prefix/suffix
366  DecoyStringToAffixCount decoy_count;
367  // map case insensitive strings back to original case (as used in fasta)
368  CaseInsensitiveToCaseSensitiveDecoy decoy_case_sensitive;
369 
370  // setup prefix- and suffix regex strings
371  const std::string regexstr_prefix = std::string("^(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)";
372  const std::string regexstr_suffix = std::string("(") + ListUtils::concatenate<std::string>(affixes, "_*|") + "_*)$";
373 
374  // setup regexes
375  const boost::regex pattern_prefix(regexstr_prefix);
376  const boost::regex pattern_suffix(regexstr_suffix);
377 
378  int all_prefix_occur(0), all_suffix_occur(0), all_proteins_count(0);
379 
380  constexpr size_t PROTEIN_CACHE_SIZE = 4e5;
381 
382  while (true)
383  {
384  proteins.cacheChunk(PROTEIN_CACHE_SIZE);
385  if (!proteins.activateCache()) break;
386 
387  auto prot_count = (SignedSize)proteins.chunkSize();
388  all_proteins_count += prot_count;
389 
390  boost::smatch sm;
391  for (SignedSize i = 0; i < prot_count; ++i)
392  {
393  String seq = proteins.chunkAt(i).identifier;
394 
395  String seq_lower = seq;
396  seq_lower.toLower();
397 
398  // search for prefix
399  bool found_prefix = boost::regex_search(seq_lower, sm, pattern_prefix);
400  if (found_prefix)
401  {
402  std::string match = sm[0];
403  all_prefix_occur++;
404 
405  // increase count of observed prefix
406  decoy_count[match].first++;
407 
408  // store observed (case sensitive and with special characters)
409  std::string seq_decoy = StringUtils::prefix(seq, match.length());
410  decoy_case_sensitive[match] = seq_decoy;
411  }
412 
413  // search for suffix
414  bool found_suffix = boost::regex_search(seq_lower, sm, pattern_suffix);
415  if (found_suffix)
416  {
417  std::string match = sm[0];
418  all_suffix_occur++;
419 
420  // increase count of observed suffix
421  decoy_count[match].second++;
422 
423  // store observed (case sensitive and with special characters)
424  std::string seq_decoy = StringUtils::suffix(seq, match.length());
425  decoy_case_sensitive[match] = seq_decoy;
426  }
427  }
428  }
429 
430  // DEBUG ONLY: print counts of found decoys
431  for (auto &a : decoy_count) OPENMS_LOG_DEBUG << a.first << "\t" << a.second.first << "\t" << a.second.second << std::endl;
432 
433  // less than 40% of proteins are decoys -> won't be able to determine a decoy string and its position
434  // return default values
435  if (all_prefix_occur + all_suffix_occur < 0.4 * all_proteins_count)
436  {
437  OPENMS_LOG_ERROR << "Unable to determine decoy string (not enough occurrences; <40%)!" << std::endl;
438  return {false, "?", true};
439  }
440 
441  if (all_prefix_occur == all_suffix_occur)
442  {
443  OPENMS_LOG_ERROR << "Unable to determine decoy string (prefix and suffix occur equally often)!" << std::endl;
444  return {false, "?", true};
445  }
446 
447  // Decoy prefix occurred at least 80% of all prefixes + observed in at least 40% of all proteins -> set it as prefix decoy
448  for (const auto& pair : decoy_count)
449  {
450  const std::string & case_insensitive_decoy_string = pair.first;
451  const std::pair<int, int>& prefix_suffix_counts = pair.second;
452  double freq_prefix = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_prefix_occur);
453  double freq_prefix_in_proteins = static_cast<double>(prefix_suffix_counts.first) / static_cast<double>(all_proteins_count);
454 
455  if (freq_prefix >= 0.8 && freq_prefix_in_proteins >= 0.4)
456  {
457  if (prefix_suffix_counts.first != all_prefix_occur)
458  {
459  OPENMS_LOG_WARN << "More than one decoy prefix observed!" << std::endl;
460  OPENMS_LOG_WARN << "Using most frequent decoy prefix (" << (int)(freq_prefix * 100) << "%)" << std::endl;
461  }
462 
463  return { true, decoy_case_sensitive[case_insensitive_decoy_string], true};
464  }
465  }
466 
467  // Decoy suffix occurred at least 80% of all suffixes + observed in at least 40% of all proteins -> set it as suffix decoy
468  for (const auto& pair : decoy_count)
469  {
470  const std::string& case_insensitive_decoy_string = pair.first;
471  const std::pair<int, int>& prefix_suffix_counts = pair.second;
472  double freq_suffix = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_suffix_occur);
473  double freq_suffix_in_proteins = static_cast<double>(prefix_suffix_counts.second) / static_cast<double>(all_proteins_count);
474 
475  if (freq_suffix >= 0.8 && freq_suffix_in_proteins >= 0.4)
476  {
477  if (prefix_suffix_counts.second != all_suffix_occur)
478  {
479  OPENMS_LOG_WARN << "More than one decoy suffix observed!" << std::endl;
480  OPENMS_LOG_WARN << "Using most frequent decoy suffix (" << (int)(freq_suffix * 100) << "%)" << std::endl;
481  }
482 
483  return { true, decoy_case_sensitive[case_insensitive_decoy_string], false};
484  }
485  }
486 
487  OPENMS_LOG_ERROR << "Unable to determine decoy string and its position. Please provide a decoy string and its position as parameters." << std::endl;
488  return {false, "?", true};
489  }
490 
491 private:
492  using DecoyStringToAffixCount = std::map<std::string, std::pair<int, int>>;
493  using CaseInsensitiveToCaseSensitiveDecoy = std::map<std::string, std::string>;
494 };
495 
496 } // namespace OpenMS
497 
OpenMS::DecoyHelper
Helper class for calculcations on decoy proteins.
Definition: FASTAContainer.h:341
OpenMS::FASTAContainer< TFI_Vector >::readAt
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos) const
fast access to an entry
Definition: FASTAContainer.h:307
FASTAFile.h
OpenMS::FASTAContainer< TFI_Vector >::cacheChunk
bool cacheChunk(int)
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:281
OpenMS::DecoyHelper::findDecoyString
static Result findDecoyString(FASTAContainer< T > &proteins)
Heuristic to determine the decoy string given a set of protein names.
Definition: FASTAContainer.h:359
OpenMS::FASTAContainer< TFI_Vector >::size
size_t size() const
calls size() on underlying vector
Definition: FASTAContainer.h:320
OpenMS::DecoyHelper::Result::success
bool success
Definition: FASTAContainer.h:346
OpenMS::FASTAContainer< TFI_File >::activateCache
bool activateCache()
Swaps in the background cache of entries, read previously via cacheChunk()
Definition: FASTAContainer.h:121
LogStream.h
OpenMS::FASTAContainer< TFI_File >::getChunkOffset
size_t getChunkOffset() const
how many entries were read and got swapped out already
Definition: FASTAContainer.h:110
OpenMS::FASTAContainer< TFI_File >::readAt
bool readAt(FASTAFile::FASTAEntry &protein, size_t pos)
Retrieve a FASTA entry at global position pos (must not be behind the currently active chunk,...
Definition: FASTAContainer.h:181
OpenMS::FASTAContainer< TFI_File >::reset
void reset()
resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning
Definition: FASTAContainer.h:208
OpenMS::FASTAContainer< TFI_File >::chunkSize
size_t chunkSize() const
number of entries in active cache
Definition: FASTAContainer.h:151
Exception.h
OpenMS::DecoyHelper::Result
Definition: FASTAContainer.h:344
OPENMS_LOG_ERROR
#define OPENMS_LOG_ERROR
Macro to be used if non-fatal error are reported (processing continues)
Definition: LogStream.h:455
OpenMS::FASTAContainer< TFI_File >::cacheChunk
bool cacheChunk(int suggested_size)
Prefetch a new cache in the background, with up to suggestedSize entries (or fewer upon reaching EOF)
Definition: FASTAContainer.h:135
OpenMS::FASTAContainer
template parameter for vector-based FASTA access
Definition: FASTAContainer.h:82
OpenMS::FASTAContainer< TFI_File >::data_fg_
std::vector< FASTAFile::FASTAEntry > data_fg_
active (foreground) data
Definition: FASTAContainer.h:231
OpenMS::DecoyHelper::Result::is_prefix
bool is_prefix
Definition: FASTAContainer.h:348
int
OpenMS::FASTAContainer< TFI_File >::FASTAContainer
FASTAContainer(const String &FASTA_file)
C'tor with FASTA filename.
Definition: FASTAContainer.h:99
OpenMS::FASTAContainer< TFI_File >::size
size_t size() const
NOT the number of entries in the FASTA file, but merely the number of already read entries (since we ...
Definition: FASTAContainer.h:223
OpenMS::FASTAContainer< TFI_Vector >::chunkAt
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
fast access to chunked (i.e. all) entries
Definition: FASTAContainer.h:301
OpenMS::FASTAContainer< TFI_Vector >::FASTAContainer
FASTAContainer(const std::vector< FASTAFile::FASTAEntry > &data)
C'tor for already existing data (by reference).
Definition: FASTAContainer.h:253
OpenMS::FASTAContainer< TFI_Vector >::getChunkOffset
size_t getChunkOffset() const
always 0, since this specialization requires/supports no chunking
Definition: FASTAContainer.h:259
ListUtils.h
OpenMS::FASTAContainer< TFI_Vector >::empty
bool empty() const
calls empty() on underlying vector
Definition: FASTAContainer.h:314
OpenMS::FASTAContainer< TFI_Vector >::reset
void reset()
required for template parameters!
Definition: FASTAContainer.h:326
OpenMS::FASTAContainer< TFI_Vector >::chunkSize
size_t chunkSize() const
active data spans the full range, i.e. size of container
Definition: FASTAContainer.h:295
OPENMS_LOG_DEBUG
#define OPENMS_LOG_DEBUG
Macro for general debugging information.
Definition: LogStream.h:470
OpenMS::FASTAContainer< TFI_File >::offsets_
std::vector< std::streampos > offsets_
internal byte offsets into FASTA file for random access reading of previous entries.
Definition: FASTAContainer.h:230
OpenMS::DecoyHelper::CaseInsensitiveToCaseSensitiveDecoy
std::map< std::string, std::string > CaseInsensitiveToCaseSensitiveDecoy
Definition: FASTAContainer.h:493
OpenMS::FASTAFile::FASTAEntry
FASTA entry type (identifier, description and sequence)
Definition: FASTAFile.h:76
OpenMS::StringUtils::prefix
static String prefix(const String &this_s, size_t length)
Definition: StringUtils.h:339
OpenMS::FASTAContainer< TFI_Vector >::activateCache
bool activateCache()
no-op (since data is already fully available as vector)
Definition: FASTAContainer.h:268
StringUtils.h
OpenMS::String::toLower
String & toLower()
Converts the string to lowercase.
OpenMS::String
A more convenient string class.
Definition: String.h:58
OpenMS::SignedSize
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:134
OpenMS::FASTAContainer< TFI_File >::chunkAt
const FASTAFile::FASTAEntry & chunkAt(size_t pos) const
Retrieve a FASTA entry at cache position pos (fast)
Definition: FASTAContainer.h:163
OpenMS::StringUtils::suffix
static String suffix(const String &this_s, size_t length)
Definition: StringUtils.h:348
OpenMS::FASTAContainer< TFI_Vector >::data_
const std::vector< FASTAFile::FASTAEntry > & data_
reference to existing data
Definition: FASTAContainer.h:333
OpenMS
Main OpenMS namespace.
Definition: FeatureDeconvolution.h:46
OpenMS::FASTAContainer< TFI_File >::empty
bool empty() const
is the FASTA file empty?
Definition: FASTAContainer.h:202
OPENMS_LOG_WARN
#define OPENMS_LOG_WARN
Macro if a warning, a piece of information which should be read by the user, should be logged.
Definition: LogStream.h:460
OpenMS::FASTAContainer< TFI_File >::chunk_offset_
size_t chunk_offset_
number of entries before the current chunk
Definition: FASTAContainer.h:233
String.h
OpenMS::FASTAContainer< TFI_File >::f_
FASTAFile f_
FASTA file connection.
Definition: FASTAContainer.h:229
OpenMS::FASTAFile
This class serves for reading in and writing FASTA files.
Definition: FASTAFile.h:64
OpenMS::DecoyHelper::DecoyStringToAffixCount
std::map< std::string, std::pair< int, int > > DecoyStringToAffixCount
Definition: FASTAContainer.h:492
OpenMS::Exception::IndexOverflow
Int overflow exception.
Definition: Exception.h:254
OpenMS::FASTAContainer< TFI_File >::data_bg_
std::vector< FASTAFile::FASTAEntry > data_bg_
prefetched (background) data; will become the next active data
Definition: FASTAContainer.h:232
OpenMS::DecoyHelper::Result::name
String name
Definition: FASTAContainer.h:347