// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
// SPDX-License-Identifier: BSD-3-Clause
//
// --------------------------------------------------------------------------
// $Maintainer: Timo Sachsenberg $
// $Authors: Timo Sachsenberg $
// --------------------------------------------------------------------------

#include <OpenMS/CHEMISTRY/AASequence.h>
#include <OpenMS/CHEMISTRY/ProteaseDigestion.h>
#include <OpenMS/CHEMISTRY/DecoyGenerator.h>
#include <OpenMS/CONCEPT/Macros.h>

#include <chrono>
#include <algorithm>

using namespace OpenMS;

// Default constructor: seeds the internal shuffler with the current tick count of the
// high-resolution clock. A later call to setSeed() replaces this seed.
DecoyGenerator::DecoyGenerator()
{
  const auto ticks_since_epoch = std::chrono::high_resolution_clock::now().time_since_epoch().count();
  shuffler_.seed(static_cast<UInt64>(ticks_since_epoch));
}

// Re-seeds the internal shuffler with the given seed value.
void DecoyGenerator::setSeed(UInt64 seed)
{
  this->shuffler_.seed(seed);
}

// Returns a new sequence built from the unmodified residue string of `protein`
// read back to front.
AASequence DecoyGenerator::reverseProtein(const AASequence& protein) const
{
  OPENMS_PRECONDITION(!protein.isModified(), "Decoy generation only supports unmodified proteins.")
  const String forward = protein.toUnmodifiedString();

  // fill the result by walking the target string from its last to its first residue
  String backward;
  backward.assign(forward.rbegin(), forward.rend());
  return AASequence::fromString(backward);
}

// Creates a pseudo-reversed decoy of `protein`.
//
// The protein is digested with `protease` (fully specific, no missed cleavages). Every
// peptide except the last one is reversed with its final residue (the enzymatic cutting
// site) left in place; the last peptide is reversed completely. The pieces are
// concatenated in digest order and returned as a new sequence.
//
// @param protein   unmodified target protein (checked via OPENMS_PRECONDITION)
// @param protease  name of the enzyme handed to ProteaseDigestion::setEnzyme
// @return the pseudo-reversed sequence; an empty sequence if the digest yields no peptides
AASequence DecoyGenerator::reversePeptides(const AASequence& protein, const String& protease) const
{
  OPENMS_PRECONDITION(!protein.isModified(), "Decoy generation only supports unmodified proteins.");
  std::vector<AASequence> peptides;
  ProteaseDigestion ed;
  ed.setMissedCleavages(0); // important as we want to reverse between all cutting sites
  ed.setEnzyme(protease);
  ed.setSpecificity(EnzymaticDigestion::SPEC_FULL);
  ed.digest(protein, peptides);

  // Without this guard the access to the last peptide below would index an empty vector.
  if (peptides.empty())
  {
    return AASequence();
  }

  String pseudo_reversed;
  const Size last_index = peptides.size() - 1;
  for (Size i = 0; i < last_index; ++i)
  {
    std::string s = peptides[i].toUnmodifiedString();
    if (!s.empty())
    {
      // don't reverse enzymatic cutting site (last residue of the peptide)
      std::reverse(s.begin(), s.end() - 1);
    }
    pseudo_reversed += s;
  }
  // the last peptide of a protein is not an enzymatic cutting site so we do a full reverse
  std::string s = peptides[last_index].toUnmodifiedString();
  std::reverse(s.begin(), s.end());
  pseudo_reversed += s;
  return AASequence::fromString(pseudo_reversed);
}

// generate decoy protein sequences
std::vector<AASequence> DecoyGenerator::shuffle(const AASequence& protein, const String& protease, int decoy_factor)
{
  OPENMS_PRECONDITION(!protein.isModified(), "Decoy generation only supports unmodified proteins.");
  
  ProteaseDigestion digestor;
  digestor.setEnzyme(protease);
  digestor.setMissedCleavages(0);  // for decoy generation disable missed cleavages
  digestor.setSpecificity(EnzymaticDigestion::SPEC_FULL);

  std::vector<AASequence> output;
  digestor.digest(protein, output);

  // generate decoy_factor number of complete decoy proteins
  std::vector<AASequence> decoy_proteins;
  for (int variant = 0; variant < decoy_factor; ++variant)
  {
    String decoy_sequence;
    for (const auto & aas : output)
    {
      if (aas.size() <= 2)
      {
        decoy_sequence += aas.toUnmodifiedString();
        continue;
      }

      // Important: create DecoyGenerator instance per peptide with same seed
      // Otherwise same peptides end up creating different decoys -> much more decoys than targets
      // But: we add variant to seed to get different decoys in multiple decoy generation
      DecoyGenerator dg;
      dg.setSeed(4711 + variant); // + variant to get different decoys in multiple decoy generation
      decoy_sequence += dg.shufflePeptides(aas, protease).toUnmodifiedString();
    }
    decoy_proteins.push_back(AASequence::fromString(decoy_sequence));
  }
  
  return decoy_proteins;
}

// Creates a shuffled decoy of `protein`.
//
// The protein is digested with `protease` (fully specific, no missed cleavages). Every
// peptide except the last one is shuffled with its final residue (the enzymatic cutting
// site) kept in place; the last peptide is shuffled completely. For each peptide up to
// `max_attempts` shuffles are tried and the one with the lowest SequenceIdentity_() to the
// target is kept. Results are stored in td_cache_ and reused for peptides seen before.
//
// @param protein       unmodified target protein (checked via OPENMS_PRECONDITION)
// @param protease      name of the enzyme handed to ProteaseDigestion::setEnzyme
// @param max_attempts  upper bound on the number of shuffles tried per peptide
// @return the concatenated shuffled peptides; an empty sequence if the digest yields no peptides
AASequence DecoyGenerator::shufflePeptides(
        const AASequence& protein,
        const String& protease,
        const int max_attempts)
{  
  OPENMS_PRECONDITION(!protein.isModified(), "Decoy generation only supports unmodified proteins.");

  std::vector<AASequence> peptides;
  ProteaseDigestion ed;
  ed.setMissedCleavages(0); // important as we want to shuffle between all cutting sites
  ed.setEnzyme(protease);
  ed.setSpecificity(EnzymaticDigestion::SPEC_FULL);
  ed.digest(protein, peptides);

  // Without this guard the access to the last peptide below would index an empty vector.
  if (peptides.empty())
  {
    return AASequence();
  }

  String protein_shuffled;
  const Size last_index = peptides.size() - 1;
  for (Size pep_idx = 0; pep_idx < last_index; ++pep_idx)
  {
    const std::string peptide_string = peptides[pep_idx].toUnmodifiedString();

    // add from cache if available
    // NOTE(review): the cache is keyed by the peptide string only and is shared with the
    // full shuffle of the last peptide further down, so an entry written by one path can be
    // returned by the other - confirm this is intended.
    bool cached(false);
    #pragma omp critical (td_cache_)
    {
      auto it = td_cache_.find(peptide_string);
      if (it != td_cache_.end())
      {
        protein_shuffled += it->second; // add if cached
        cached = true;
      }
    }
    if (cached) continue;

    String peptide_string_shuffled = peptide_string;
    // end of the shuffled range: excludes the last residue (enzymatic cutting site)
    const auto last = peptide_string_shuffled.empty() ? peptide_string_shuffled.end()
                                                      : peptide_string_shuffled.end() - 1;
    double lowest_identity(1.0);
    String lowest_identity_string(peptide_string_shuffled);
    for (int attempt = 0; attempt < max_attempts; ++attempt) // try to find sequence with low identity
    {
      shuffler_.portable_random_shuffle(peptide_string_shuffled.begin(), last);

      double identity = SequenceIdentity_(peptide_string_shuffled, peptide_string);
      if (identity < lowest_identity)
      {
        lowest_identity = identity;
        lowest_identity_string = peptide_string_shuffled;

        if (identity <= (1.0/peptide_string_shuffled.size() + 1e-6)) 
        {
          break; // found perfect shuffle (only 1 (=cutting site) of all AAs match)
        }
      }
    }
    protein_shuffled += lowest_identity_string;
    #pragma omp critical (td_cache_)
    {
      td_cache_[peptide_string] = lowest_identity_string;
    }
  }

  // the last peptide of a protein is not an enzymatic cutting site so we do a full shuffle
  const std::string peptide_string = peptides[last_index].toUnmodifiedString();
  bool cached(false);
  #pragma omp critical (td_cache_)
  {
    auto it = td_cache_.find(peptide_string);
    if (it != td_cache_.end())
    {
      protein_shuffled += it->second; // add if cached
      cached = true;
    }
  }
  if (cached) return AASequence::fromString(protein_shuffled);

  String peptide_string_shuffled = peptide_string;
  double lowest_identity(1.0);
  String lowest_identity_string(peptide_string_shuffled);
  for (int attempt = 0; attempt < max_attempts; ++attempt) // try to find sequence with low identity
  {
    shuffler_.portable_random_shuffle(peptide_string_shuffled.begin(), peptide_string_shuffled.end());
    double identity = SequenceIdentity_(peptide_string_shuffled, peptide_string);
    if (identity < lowest_identity)
    {
      lowest_identity = identity;
      lowest_identity_string = peptide_string_shuffled;
      if (identity == 0)
      {
        break; // found best shuffle (no position matches the target or its reverse)
      }
    }
  }
  protein_shuffled += lowest_identity_string;
  #pragma omp critical (td_cache_)
  {
    td_cache_[peptide_string] = lowest_identity_string;
  }
  return AASequence::fromString(protein_shuffled);
}

// static
double DecoyGenerator::SequenceIdentity_(const String& decoy, const String& target)
{
  int match = 0;
  for (Size i = 0; i < target.size(); ++i)
  {
    if (target[i] == decoy[i]) { ++match; }
  }
  double identity = (double) match / target.size();

  // also compare against reverse
  match = 0;
  for (int i = (int)target.size() - 1; i >= 0; --i)
  {
    int j = (int)target.size() - 1 - i;
    if (target[j] == decoy[i]) { ++match; }
  }
  double rev_identity = (double) match / target.size();
   
  return std::max(identity, rev_identity);
}


