OpenMS
Loading...
Searching...
No Matches
ProFormaParser.h
Go to the documentation of this file.
1// Copyright (c) 2002-present, OpenMS Inc. -- EKU Tuebingen, ETH Zurich, and FU Berlin
2// SPDX-License-Identifier: BSD-3-Clause
3//
4// --------------------------------------------------------------------------
5// $Maintainer: Timo Sachsenberg $
6// $Authors: Timo Sachsenberg $
7// --------------------------------------------------------------------------
8
9#pragma once
10
15#include <OpenMS/OpenMSConfig.h>
16
17#include <optional>
18#include <string>
19#include <string_view>
20#include <vector>
21
22namespace OpenMS
23{
24
25 // Forward declarations
26 class AASequence;
27 class MSSpectrum;
28
// Recursive descent parser for ProForma v2 peptidoform notation.
//
// The public interface consists only of static functions; the constructor is
// private, so callers never create a parser object themselves.
//
// NOTE(review): this is a rendered listing. The leading number on each line is
// the line number in the original header. Lines that show a number and nothing
// else appear to mark places where declarations of the original header are not
// reproduced here - consult the real header before relying on this text as the
// complete class definition.
57 class OPENMS_DLLAPI ProFormaParser
58 {
59 public:
// Parse a ProForma string into a Peptidoform AST.
72 static Peptidoform parse(const String& input);
73
// Parse a ProForma string into a PeptidoformIon AST.
86 static PeptidoformIon parseIon(const String& input);
87
// Convert a Peptidoform AST back to ProForma string notation.
// mode selects the write mode; it defaults to ProFormaWriteMode::LOSSLESS.
95 static String toString(const Peptidoform& pf,
96 ProFormaWriteMode mode = ProFormaWriteMode::LOSSLESS);
97
// Convert a PeptidoformIon AST back to ProForma string notation.
// mode selects the write mode; it defaults to ProFormaWriteMode::LOSSLESS.
105 static String toString(const PeptidoformIon& pfi,
106 ProFormaWriteMode mode = ProFormaWriteMode::LOSSLESS);
107
108 // ---- AASequence Conversion Methods ----
109
120
// Convert a Peptidoform to an OpenMS AASequence.
// NOTE(review): the opening line of this declaration is missing from the
// listing; the two lines below are its parameter list. The parameters match
// the static member toAASequence, which returns an AASequence.
// policy defaults to AASequenceConversionPolicy::FAIL_ON_LOSS.
132 const Peptidoform& pf,
133 AASequenceConversionPolicy policy = AASequenceConversionPolicy::FAIL_ON_LOSS);
134
145
156
// Get a list of all issues that would arise during AASequence conversion.
166 static std::vector<ConversionIssue> getAASequenceConversionIssues(const Peptidoform& pf);
167
168 // ---- Mass Calculation Methods ----
169
// Check if mass can be calculated for a Peptidoform.
181 static bool canCalculateMass(const Peptidoform& pf);
182
// Check if mass can be calculated for a PeptidoformIon.
192 static bool canCalculateMass(const PeptidoformIon& pfi);
193
// Get issues preventing mass calculation for a Peptidoform.
202 static std::vector<ConversionIssue> getMassCalculationIssues(const Peptidoform& pf);
203
// Get issues preventing mass calculation for a PeptidoformIon.
213 static std::vector<ConversionIssue> getMassCalculationIssues(const PeptidoformIon& pfi);
214
// Calculate monoisotopic mass of a Peptidoform.
// NOTE(review): the section below is titled "Non-throwing variants", which
// suggests this function may throw when the mass cannot be calculated - confirm.
231 static double getMonoWeight(const Peptidoform& pf);
232
// Calculate monoisotopic mass of a PeptidoformIon.
244 static double getMonoWeight(const PeptidoformIon& pfi);
245
// Calculate m/z for a PeptidoformIon at its specified charge state.
255 static double getMZ(const PeptidoformIon& pfi);
256
// Calculate m/z for a Peptidoform at a given charge state.
265 static double getMZ(const Peptidoform& pf, int charge);
266
267 // ---- Non-throwing variants (single-pass, efficient) ----
268
// Try to calculate monoisotopic mass of a Peptidoform (non-throwing).
// The result is wrapped in std::optional<double>.
278 static std::optional<double> tryGetMonoWeight(const Peptidoform& pf);
279
// Try to calculate monoisotopic mass of a Peptidoform with diagnostic
// information; issues_out is a caller-supplied vector of ConversionIssue.
// NOTE(review): whether issues_out is cleared or appended to is not visible here.
289 static std::optional<double> tryGetMonoWeight(const Peptidoform& pf,
290 std::vector<ConversionIssue>& issues_out);
291
// Try to calculate monoisotopic mass of a PeptidoformIon (non-throwing).
299 static std::optional<double> tryGetMonoWeight(const PeptidoformIon& pfi);
300
// Try to calculate monoisotopic mass of a PeptidoformIon with diagnostics
// reported through issues_out.
308 static std::optional<double> tryGetMonoWeight(const PeptidoformIon& pfi,
309 std::vector<ConversionIssue>& issues_out);
310
// Try to calculate m/z for a Peptidoform at the given charge (non-throwing).
318 static std::optional<double> tryGetMZ(const Peptidoform& pf, int charge);
319
// Try to calculate m/z for a Peptidoform at the given charge, with diagnostics
// reported through issues_out.
328 static std::optional<double> tryGetMZ(const Peptidoform& pf, int charge,
329 std::vector<ConversionIssue>& issues_out);
330
// Try to calculate m/z for a PeptidoformIon (non-throwing).
337 static std::optional<double> tryGetMZ(const PeptidoformIon& pfi);
338
// Try to calculate m/z for a PeptidoformIon, with diagnostics reported through
// issues_out.
346 static std::optional<double> tryGetMZ(const PeptidoformIon& pfi,
347 std::vector<ConversionIssue>& issues_out);
348
349 // ---- Theoretical Spectrum Generation ----
350
// Check if a theoretical spectrum can be generated for a Peptidoform.
360 static bool canGenerateSpectrum(const Peptidoform& pf);
361
// Check if a theoretical spectrum can be generated for a PeptidoformIon.
371 static bool canGenerateSpectrum(const PeptidoformIon& pfi);
372
// Get issues preventing spectrum generation for a Peptidoform.
379 static std::vector<ConversionIssue> getSpectrumGenerationIssues(const Peptidoform& pf);
380
// Get issues preventing spectrum generation for a PeptidoformIon.
387 static std::vector<ConversionIssue> getSpectrumGenerationIssues(const PeptidoformIon& pfi);
388
// Generate a theoretical MS/MS spectrum for a Peptidoform.
// NOTE(review): the opening line of this declaration is missing from the
// listing; the lines below are its parameter list. The parameters match the
// static member generateSpectrum, which returns an MSSpectrum.
// Defaults: charges 1..1, ion types "by", no losses, meta info added.
408 const Peptidoform& pf,
409 int min_charge = 1,
410 int max_charge = 1,
411 const std::string& ion_types = "by",
412 bool add_losses = false,
413 bool add_metainfo = true);
414
// Generate a theoretical MS/MS spectrum for a PeptidoformIon.
// NOTE(review): as above, the opening line of this generateSpectrum overload
// is missing from the listing; only its parameter list is shown.
435 const PeptidoformIon& pfi,
436 int min_charge = 1,
437 int max_charge = 1,
438 const std::string& ion_types = "by",
439 bool add_losses = false,
440 bool add_metainfo = true);
441
442 private:
// Private constructor - use the static methods instead.
444 explicit ProFormaParser(std::string_view input);
445
446 // ---- High-level parsing methods ----
447
450
453
// NOTE(review): no description is available for this member. By its name and
// signature it parses a Peptidoform together with a charge state; the exact
// effect of is_chimeric_context is not visible here - confirm in the implementation.
456 Peptidoform parsePeptidoformWithCharge_(bool is_chimeric_context);
457
// Parse global modifications: < ... >
459 std::vector<GlobalModEntry> parseGlobalMods_();
460
463
466
469
// Parse unlocalised modifications: [mod]?
471 std::vector<UnlocalisedMod> parseUnlocalisedMods_();
472
// Parse labile modifications: {mod}
474 std::vector<LabileModification> parseLabileModifications_();
475
// Parse the amino acid sequence with modifications.
477 std::vector<SequenceSection> parseSequence_();
478
481
484
487
// Parse terminal modifications: [mod1][mod2]...
489 std::vector<Modification> parseTerminalMods_();
490
491 // ---- Modification parsing ----
492
// Parse a modification list: [mod1, mod2, ...]
494 std::vector<Modification> parseModificationList_();
495
498
// Parse a single modification tag (no alternatives); the second element of
// the returned pair is an optional Label.
500 std::pair<ModificationTag, std::optional<Label>> parseModificationTagWithLabel_();
501
504
507
510
513
516
519
522
525
528
531
532 // ---- Charge state parsing ----
533
// Parse charge state: /2, /+2, /[Na:z+1]
535 std::optional<ChargeState> parseChargeState_();
536
// Parse adduct ions: [Na:z+1, H:z+1]
538 std::vector<AdductIon> parseAdductIons_();
539
542
543 // ---- Helper methods ----
544
547
550
553
556
559
562
// Check if at end of input.
564 bool isAtEnd_();
565
// Throw a parse error at the current position; never returns.
567 [[noreturn]] void error_(ProFormaErrorCode code, const char* message);
568
// Throw a parse error at a specific position (pos); never returns.
570 [[noreturn]] void errorAt_(ProFormaErrorCode code, size_t pos, const char* message);
571
// Parse a CV database prefix from the identifier id; the result is wrapped in
// std::optional<CvDatabase>.
573 std::optional<CvDatabase> parseCvDatabasePrefix_(const std::string_view& id);
574
// Check if the character c is a valid amino acid.
576 static bool isAminoAcid_(char c);
577
580
583
586
587 // ---- Member variables ----
588
// Line 590 of the original header (not shown in this listing) declares
// ProFormaTokenizer tokenizer_, the tokenizer for lexical analysis.
591
// The original input string (for error messages).
593 std::string input_;
594
// Line 596 of the original header (not shown in this listing) declares
// ProFormaTokenizer::Token current_token_, the cached current token.
597
// NOTE(review): presumably records whether current_token_ holds a valid cached
// token - confirm against the implementation. Starts out false.
599 bool has_current_ = false;
600 };
601
602} // namespace OpenMS
Representation of a peptide/protein sequence.
Definition AASequence.h:88
The representation of a 1D spectrum.
Definition MSSpectrum.h:44
Recursive descent parser for ProForma v2 peptidoform notation.
Definition ProFormaParser.h:58
MassDelta parseMassDelta_()
Parse a mass delta: +15.9949, Obs:+79.978.
void error_(ProFormaErrorCode code, const char *message)
Throw a parse error at the current position.
bool isAtEnd_()
Check if at end of input.
AmbiguousRegion parseAmbiguousRegion_()
Parse an ambiguous region: (?XY)
static bool canCalculateMass(const PeptidoformIon &pfi)
Check if mass can be calculated for a PeptidoformIon.
CvAccession parseCvAccession_()
Parse a CV accession: UNIMOD:35, MOD:00046.
ProFormaTokenizer tokenizer_
The tokenizer for lexical analysis.
Definition ProFormaParser.h:590
std::vector< Modification > parseModificationList_()
Parse a modification list: [mod1, mod2, ...].
static String toString(const Peptidoform &pf, ProFormaWriteMode mode=ProFormaWriteMode::LOSSLESS)
Convert a Peptidoform AST back to ProForma string notation.
AdductIon parseAdductIon_()
Parse a single adduct ion: Na:z+1.
SequenceElement parseSequenceElement_()
Parse a single sequence element (amino acid + mods)
bool looksLikeModificationTagContent_()
Check if the current position could start a modification tag content.
GlobalModification parseGlobalModification_()
Parse global modification with locations: <[mod]@locations>
std::optional< ChargeState > parseChargeState_()
Parse charge state: /2, /+2, /[Na:z+1].
static Peptidoform parse(const String &input)
Parse a ProForma string into a Peptidoform AST.
Label parseLabel_()
Parse a label: #XL1, #BRANCH, #g1(0.90)
static void resolveModifications(Peptidoform &pf)
Resolve all modifications in a Peptidoform using ModificationsDB.
static std::optional< double > tryGetMonoWeight(const PeptidoformIon &pfi, std::vector< ConversionIssue > &issues_out)
Try to calculate monoisotopic mass of PeptidoformIon with diagnostics.
FormulaTag parseFormulaTag_()
Parse a formula tag: Formula:C12H20O2.
ModifiedRange parseModifiedRange_()
Parse a modified range: (XYZ)[mod].
Modification parseModification_()
Parse a single modification (may have alternatives with |)
static std::optional< double > tryGetMonoWeight(const PeptidoformIon &pfi)
Try to calculate monoisotopic mass of a PeptidoformIon (non-throwing)
static double getMZ(const Peptidoform &pf, int charge)
Calculate m/z for a Peptidoform at a given charge state.
static std::optional< double > tryGetMZ(const Peptidoform &pf, int charge, std::vector< ConversionIssue > &issues_out)
Try to calculate m/z for a Peptidoform with diagnostics.
static std::optional< double > tryGetMonoWeight(const Peptidoform &pf)
Try to calculate monoisotopic mass of a Peptidoform (non-throwing)
static bool isAminoAcid_(char c)
Check if a character is a valid amino acid code.
bool match_(ProFormaTokenizer::TokenType type)
Check if current token matches expected type, consume if true.
static double getMZ(const PeptidoformIon &pfi)
Calculate m/z for a PeptidoformIon at its specified charge state.
static bool isRepresentableAsAASequence(const Peptidoform &pf)
Check if a Peptidoform can be fully represented as an AASequence.
std::vector< UnlocalisedMod > parseUnlocalisedMods_()
Parse unlocalised modifications: [mod]?
ProFormaTokenizer::Token advance_()
Consume and return the current token.
GlycanComposition parseGlycanComposition_()
Parse a glycan composition: Glycan:HexNAc1Hex2.
std::vector< GlobalModEntry > parseGlobalMods_()
Parse global modifications: < ... >
std::pair< ModificationTag, std::optional< Label > > parseModificationTagWithLabel_()
Parse a single modification tag (no alternatives)
static bool canCalculateMass(const Peptidoform &pf)
Check if mass can be calculated for a Peptidoform.
bool hasNTerminalModPattern_()
Check if current position has N-terminal modification pattern ([mod]-)
Peptidoform parsePeptidoform_()
Parse a single Peptidoform (one chain)
static std::optional< double > tryGetMonoWeight(const Peptidoform &pf, std::vector< ConversionIssue > &issues_out)
Try to calculate monoisotopic mass with diagnostic information.
ProFormaTokenizer::Token current_token_
Current token (cached)
Definition ProFormaParser.h:596
std::optional< CvDatabase > parseCvDatabasePrefix_(const std::string_view &id)
Parse a CV database prefix from identifier.
IsotopeReplacement parseIsotopeReplacement_()
Parse isotope replacement: <13C>, <15N>, <D>
static std::optional< double > tryGetMZ(const PeptidoformIon &pfi)
Try to calculate m/z for a PeptidoformIon (non-throwing)
Peptidoform parsePeptidoformWithCharge_(bool is_chimeric_context)
static std::vector< ConversionIssue > getSpectrumGenerationIssues(const Peptidoform &pf)
Get issues preventing spectrum generation for a Peptidoform.
ProFormaTokenizer::Token current_()
Get the current token.
static MSSpectrum generateSpectrum(const PeptidoformIon &pfi, int min_charge=1, int max_charge=1, const std::string &ion_types="by", bool add_losses=false, bool add_metainfo=true)
Generate a theoretical MS/MS spectrum for a PeptidoformIon.
ProFormaTokenizer::Token expect_(ProFormaTokenizer::TokenType type, const char *expected_desc)
Expect a specific token type, throw error if not found.
InfoTag parseInfoTag_()
Parse an info tag: INFO:text.
static bool canGenerateSpectrum(const Peptidoform &pf)
Check if a theoretical spectrum can be generated for a Peptidoform.
static PeptidoformIon parseIon(const String &input)
Parse a ProForma string into a PeptidoformIon AST.
void errorAt_(ProFormaErrorCode code, size_t pos, const char *message)
Throw a parse error at a specific position.
std::vector< SequenceSection > parseSequence_()
Parse the amino acid sequence with modifications.
static double getMonoWeight(const Peptidoform &pf)
Calculate monoisotopic mass of a Peptidoform.
static bool canGenerateSpectrum(const PeptidoformIon &pfi)
Check if a theoretical spectrum can be generated for a PeptidoformIon.
NamedMod parseNamedMod_()
Parse a named modification: Oxidation, U:Oxidation.
static double getMonoWeight(const PeptidoformIon &pfi)
Calculate monoisotopic mass of a PeptidoformIon.
static std::vector< ConversionIssue > getMassCalculationIssues(const Peptidoform &pf)
Get issues preventing mass calculation for a Peptidoform.
ProFormaTokenizer::Token peek_()
Look at the next token without consuming.
static std::vector< ConversionIssue > getSpectrumGenerationIssues(const PeptidoformIon &pfi)
Get issues preventing spectrum generation for a PeptidoformIon.
NamedMod parseNamedMod_(char cv_hint)
Parse a named modification with a known CV hint prefix.
static Peptidoform fromAASequence(const AASequence &seq)
Create a Peptidoform from an OpenMS AASequence.
std::vector< LabileModification > parseLabileModifications_()
Parse labile modifications: {mod}.
ModificationTag parseModificationTag_()
Parse a modification tag.
PositionConstraint parsePositionConstraint_()
Parse a position constraint: Position:MKC.
PeptidoformIon parsePeptidoformIon_()
Parse a complete PeptidoformIon (multiple chains + charge)
std::vector< Modification > parseTerminalMods_()
Parse terminal modifications: [mod1][mod2]...
static String toString(const PeptidoformIon &pfi, ProFormaWriteMode mode=ProFormaWriteMode::LOSSLESS)
Convert a PeptidoformIon AST back to ProForma string notation.
static std::optional< double > tryGetMZ(const Peptidoform &pf, int charge)
Try to calculate m/z for a Peptidoform (non-throwing)
static AASequence toAASequence(const Peptidoform &pf, AASequenceConversionPolicy policy=AASequenceConversionPolicy::FAIL_ON_LOSS)
Convert a Peptidoform to an OpenMS AASequence.
static std::vector< ConversionIssue > getAASequenceConversionIssues(const Peptidoform &pf)
Get a list of all issues that would arise during AASequence conversion.
static MSSpectrum generateSpectrum(const Peptidoform &pf, int min_charge=1, int max_charge=1, const std::string &ion_types="by", bool add_losses=false, bool add_metainfo=true)
Generate a theoretical MS/MS spectrum for a Peptidoform.
GlobalModEntry parseGlobalModEntry_()
Parse a single global modification entry.
ProFormaParser(std::string_view input)
Private constructor - use static methods.
std::string input_
The original input string (for error messages)
Definition ProFormaParser.h:593
static std::optional< double > tryGetMZ(const PeptidoformIon &pfi, std::vector< ConversionIssue > &issues_out)
Try to calculate m/z for a PeptidoformIon with diagnostics.
bool check_(ProFormaTokenizer::TokenType type)
Check if current token matches expected type.
ProFormaTokenizer createLookahead_() const
Create a lookahead tokenizer positioned at the current logical position.
std::vector< AdductIon > parseAdductIons_()
Parse adduct ions: [Na:z+1, H:z+1].
static std::vector< ConversionIssue > getMassCalculationIssues(const PeptidoformIon &pfi)
Get issues preventing mass calculation for a PeptidoformIon.
Tokenizer for ProForma v2 peptidoform notation.
Definition ProFormaTokenizer.h:54
TokenType
Token types produced by the tokenizer.
Definition ProFormaTokenizer.h:58
A more convenient string class.
Definition String.h:34
std::variant< CvAccession, NamedMod, MassDelta, FormulaTag, GlycanComposition, InfoTag, PositionConstraint > ModificationTag
Variant type representing any modification tag content.
Definition ProFormaData.h:262
std::variant< IsotopeReplacement, GlobalModification > GlobalModEntry
Variant type for global modification entries.
Definition ProFormaData.h:458
ProFormaErrorCode
Error codes for programmatic handling of ProForma parse errors.
Definition ProFormaError.h:25
ProFormaWriteMode
Write mode for ProForma string serialization.
Definition ProFormaData.h:88
AASequenceConversionPolicy
Conversion policy for transforming Peptidoform to AASequence.
Definition ProFormaData.h:35
Adduct ion specification for charge state.
Definition ProFormaData.h:472
Ambiguous amino acid region.
Definition ProFormaData.h:340
Controlled vocabulary accession for a modification.
Definition ProFormaData.h:121
Chemical formula with optional charge.
Definition ProFormaData.h:181
Global modification applied to specific locations.
Definition ProFormaData.h:424
Glycan composition specification.
Definition ProFormaData.h:199
Info tag for arbitrary text annotations.
Definition ProFormaData.h:216
Isotope replacement for stable isotope labeling.
Definition ProFormaData.h:441
A modification with one or more alternative tags.
Definition ProFormaData.h:306
Modified sequence range with shared modifications.
Definition ProFormaData.h:356
Named modification with optional CV prefix hint.
Definition ProFormaData.h:137
A single peptidoform (one peptide chain)
Definition ProFormaData.h:509
A peptidoform ion (one or more chains with optional charge)
Definition ProFormaData.h:532
Position constraint specifying allowed residues for a modification.
Definition ProFormaData.h:233
A single amino acid with its modifications.
Definition ProFormaData.h:325
Main OpenMS namespace.
Definition openswathalgo/include/OpenMS/OPENSWATHALGO/DATAACCESS/ISpectrumAccess.h:19
Label for cross-links, branches, or ambiguous grouping.
Definition ProFormaData.h:276
Mass delta modification with optional source hint.
Definition ProFormaData.h:153
A single token from the input stream.
Definition ProFormaTokenizer.h:89