#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=anomalous-backslash-in-string
import re
[docs]
def parseNotes(element):
    """
    Collect the "KEY: value" lines of an SBML element's notes in a dictionary.

    Each line of the notes string is expected to look like
    ``<html:p>KEY: value</html:p>``. The text between the first ``>`` and the
    next ``<`` is split on its FIRST ``:`` only, so values that contain
    colons themselves (URLs, prefixed ids, ...) are kept whole.
    ex:
    <notes>
    <html:body>
    <html:p>BIOCYC: |Alkylphosphonates|</html:p>
    <html:p>CHEBI: 60983</html:p>
    </html:body>
    </notes>
    output: {'BIOCYC': [' |Alkylphosphonates|'], 'CHEBI': [' 60983']}

    Spaces in a key are replaced by ``_``. Values are stored exactly as
    written (surrounding whitespace included). Lines without a tag pair,
    without a ``:`` or whose value is empty / made only of spaces are ignored.
    When several lines share the same key, their values are accumulated in
    the list in order of appearance.

    Parameters
    ----------
    element: libsbml.element
        an element from libsbml (only ``getNotesString()`` is called on it)

    Returns
    -------
    dict:
        the dictionary of notes, ``{key: [value, ...]}``
    """
    notes_dict = {}
    for raw_line in element.getNotesString().splitlines():
        # raw_line = <html:p>BIOCYC: |Alkylphosphonates|</html:p>
        start = raw_line.find(">") + 1
        if start == 0:
            # no opening tag on this line
            continue
        end = raw_line.find("<", start)
        if end == -1:
            # no closing tag after the text (ex: '<notes>', '<html:body>')
            continue
        # content = BIOCYC: |Alkylphosphonates|
        key, separator, val = raw_line[start:end].partition(":")
        if not separator:
            # not a 'KEY: value' line
            continue
        key = key.replace(" ", "_")
        # skip values that are empty or made only of spaces
        if val.strip(" "):
            notes_dict.setdefault(key, []).append(val)
    return notes_dict
[docs]
def parseGeneAssoc(GeneAssocStr):
    """
    Given a grammar of 'and', 'or' and '(' ')'. Extracts genes ids to a list.
    '(geneX and geneY) or geneW' => ['geneX', 'geneY', 'geneW']

    The operators are recognised whatever their case ('and', 'AND', 'Or', ...)
    as long as they are surrounded by whitespace (spaces, tabs or newlines).
    Parentheses and any remaining whitespace are removed from each gene id.
    Empty tokens (ex: produced by a dangling operator) are dropped.

    Parameters
    ----------
    GeneAssocStr: str
        the string containing genes ids

    Returns
    -------
    list:
        the list of unique ids, in order of first appearance
    """
    # split on the boolean operators; they must be delimited by whitespace so
    # that ids merely containing 'and' / 'or' (ex: 'android') are left intact
    tokens = re.split(r"\s+(?:and|or)\s+", GeneAssocStr, flags=re.IGNORECASE)
    genes = []
    seen = set()
    for token in tokens:
        # remove '(' or ')' or any whitespace
        gene = re.sub(r"[()\s]", "", token)
        # keep each non-empty id once, preserving the order of the input
        if gene and gene not in seen:
            seen.add(gene)
            genes.append(gene)
    return genes
[docs]
def convert_to_coded_id(uncoded, _type=None, compart=None):
    """
    convert an id to sbml valid format. First add type of id "R" for reaction
    "M" for compound at the start and the compart at the end.
    _type+"_"+uncoded+"_"+compart
    then replace every char that is not an ASCII letter, a digit or '_' by
    its integer ordinal surrounded by '__'
    ex: ('METABOLITE-12', 'M', 'c') => 'M_METABOLITE__45__12_c'

    Parameters
    ----------
    uncoded: str
        the original id to code
    _type: str
        the type of the id (ex: 'R' or 'M'), not added if None
    compart: str
        the compartment of the id (ex: 'c' or 'e'), not added if None

    Returns
    -------
    str:
        the coded id
    """
    # add type and compart
    if _type is not None:
        uncoded = _type + "_" + uncoded
    if compart is not None:
        uncoded += "_" + compart
    # Any char outside [A-Za-z0-9_] is converted using its integer ordinal.
    # The replacement is only made of '_' and digits, so a single pass is
    # enough and already encoded parts are never encoded twice.
    return re.sub(
        r"[^A-Za-z0-9_]",
        lambda banned: "__{0}__".format(ord(banned.group(0))),
        uncoded,
    )
[docs]
def ascii_replace(match):
    """
    recover banned char from the integer ordinal in the reg.match

    Meant to be used as the replacement callback of ``re.sub``: group 1 of
    the match must hold the digits of the ordinal.

    Parameters
    ----------
    match: re.Match
        a match whose group 1 is the integer ordinal (as digits)

    Returns
    -------
    str:
        the char for this ordinal, or the whole matched text unchanged when
        the number is not a valid code point
    """
    try:
        return chr(int(match.group(1)))
    except (ValueError, OverflowError):
        # a number out of chr() range cannot be an encoded char: it is a
        # genuine part of the id, so leave the text as it was
        return match.group(0)
[docs]
def convert_from_coded_id(
    coded, pattern="__", compart_in_id=False, reaction_tag="R", species_tag="M"
):
    """
    convert an id from sbml format to the original id. try to extract the type of
    the id and the compart using strong regular expression
    ex: M_METABOLITE__45__12_c => ('METABOLITE-12', 'M', 'c')

    The id is first decoded (integer ordinals replaced by their char), then
    matched against three layouts, from the strictest to the loosest:
    1. <tag>_<id>_<compart>   (compart: a lowercase letter, optionally followed
       by one lowercase letter or digit)
    2. <tag>_<id>
    3. <id>_<compart>         (compart: everything after the last '_')
    If none matches, the decoded id is returned with no type and no compart.

    Parameters
    ----------
    coded: str
        the encoded id
    pattern: str
        pattern used to delimit integer ordinal (taken literally)
    compart_in_id: bool
        if true: the last _* is not meant to be the compart but is part of the
        id, so it is appended back to the uncoded id (it is still returned as
        compart)
    reaction_tag: str
        First letter used to tag a reaction
    species_tag: str
        First letter used to tag a species

    Returns
    -------
    str:
        the uncoded id
    str, None:
        type of ID (ex: 'M' or 'R')
    str, None:
        compart of the id
    """
    # replace DASH from very old sbmls
    coded = coded.replace("_DASH_", "__")
    # reg ex to find the ordinal used to replace not allowed char; the
    # delimiter is escaped so that it is always matched literally
    codepat = re.compile(r"{0}(\d+){0}".format(re.escape(pattern)))
    # an original id starting with int will start with '_' in sbml. The '_'
    # is kept when it opens an encoded ordinal (ex: '__45__abc' for '-abc'),
    # otherwise removing it would prevent the decoding just below
    if coded.startswith("_") and codepat.match(coded) is None:
        coded = coded[1:]
    # replace ordinal by the not allowed char of sbml
    coded = codepat.sub(ascii_replace, coded)

    # Regular expression for coded ID from:
    # https://github.com/SBRG/bigg_models/wiki/BiGG-Models-ID-Specification-and-Guidelines
    # tags are escaped so that any char stays literal inside the char class
    tags = re.escape(species_tag) + re.escape(reaction_tag)
    layouts = (
        r"(?P<_type>^[{0}]_)(?P<_id>.*)(?P<compart>_[a-z][a-z0-9]?$)".format(tags),
        r"(?P<_type>^[{0}]_)(?P<_id>.*)".format(tags),
        r"(?P<_id>.*)(?P<compart>_.*)",
    )
    # default when no layout matches
    uncoded, _type, compart = coded, None, None
    for layout in layouts:
        found = re.search(layout, coded)
        if found is None:
            continue
        parts = found.groupdict()
        uncoded = parts["_id"]
        # a layout without a given group leaves the matching value to None
        if parts.get("_type") is not None:
            _type = parts["_type"].replace("_", "")
        if parts.get("compart") is not None:
            compart = parts["compart"].replace("_", "")
        # the first (strictest) matching layout wins
        break

    if compart and compart_in_id:
        uncoded += "_" + compart
    return (uncoded, _type, compart)
[docs]
def get_all_decoded_version(element_id, _type):
    """
    Use convert_from_coded_id function to convert an element_id (reaction or
    species). _type defines if the element is a 'reaction' or a 'species'.
    Try different decoding combinations based on old and new sbml id encoding:
    1. classic encoding convention (always tried)
    2. same, with the compart kept as part of the id ('species' and 'reaction')
    3. '_' as ordinal delimiter and 'S' as species tag ('species' only)

    Parameters
    ----------
    element_id: str
        the encoded id
    _type: str
        _type is 'reaction' or 'species'; any other value only gives the
        classic decoding

    Returns
    -------
    list:
        list of decoded ids, without duplicates, in the order of the attempts
    """
    # 1st attempt: decoded id with classic encoding convention.
    candidates = [convert_from_coded_id(element_id)[0]]
    if _type in ("species", "reaction"):
        # 2nd attempt: the trailing '_<compart>' is part of the id
        candidates.append(convert_from_coded_id(element_id, compart_in_id=True)[0])
    if _type == "species":
        # 3rd attempt: non-conventional encoding, '_' delimiter and 'S' tag
        candidates.append(
            convert_from_coded_id(element_id, pattern="_", species_tag="S")[0]
        )
    # keep each decoded version once, preserving the order of the attempts
    all_element_id_decoded = []
    for decoded in candidates:
        if decoded not in all_element_id_decoded:
            all_element_id_decoded.append(decoded)
    return all_element_id_decoded