The Metabolic Disassembler is a program that estimates the starting material of metabolism by decomposing the structure of natural products (secondary metabolites) whose biosynthetic pathway is unknown into "biosynthetic units". Published in BMC Bioinformatics magazine on December 23, 2019.
Metabolic disassembler for understanding and predicting the biosynthetic units of natural products https://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-019-3183-9
There is also an explanation video in Japanese
The code is written in Python and can be found on the GitHub page below. https://github.com/the-metabolic-disassembler/metadisassembler
In this Qiita article, I'd like to add a little explanation in Japanese to the method used in Google Colaboratory posted on GitHub.
It takes about 5 minutes.
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO
logger = getLogger(__name__)
logger.addHandler(StreamHandler())
logger.setLevel(INFO)
def install(
chunk_size=4096,
file_name="Miniconda3-4.7.12-Linux-x86_64.sh",
url_base="https://repo.continuum.io/miniconda/",
conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
rdkit_version=None,
add_python_path=True,
force=False):
"""install rdkit from miniconda
```
import rdkit_installer
rdkit_installer.install()
```
"""
python_path = os.path.join(
conda_path,
"lib",
"python{0}.{1}".format(*sys.version_info),
"site-packages",
)
if add_python_path and python_path not in sys.path:
logger.info("add {} to PYTHONPATH".format(python_path))
sys.path.append(python_path)
if os.path.isdir(os.path.join(python_path, "rdkit")):
logger.info("rdkit is already installed")
if not force:
return
logger.info("force re-install")
url = url_base + file_name
python_version = "{0}.{1}.{2}".format(*sys.version_info)
logger.info("python version: {}".format(python_version))
if os.path.isdir(conda_path):
logger.warning("remove current miniconda")
shutil.rmtree(conda_path)
elif os.path.isfile(conda_path):
logger.warning("remove {}".format(conda_path))
os.remove(conda_path)
logger.info('fetching installer from {}'.format(url))
res = requests.get(url, stream=True)
res.raise_for_status()
with open(file_name, 'wb') as f:
for chunk in res.iter_content(chunk_size):
f.write(chunk)
logger.info('done')
logger.info('installing miniconda to {}'.format(conda_path))
subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
logger.info('done')
logger.info("installing rdkit")
subprocess.check_call([
os.path.join(conda_path, "bin", "conda"),
"install",
"--yes",
"-c", "rdkit",
"python=={}".format(python_version),
"rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
logger.info("done")
import rdkit
logger.info("rdkit-{} installation finished!".format(rdkit.__version__))
install(rdkit_version='2019.09.2.0', force=True)
!pip install metadisassembler
import glob
from IPython.display import Image, display_png
from rdkit.Chem.Draw import IPythonConsole
import metadisassembler as medi
Take C05557 Isopenicillin N as an example.
# Create an instance and input a query molecule
test1 = medi.MetaDisassembler()
test1.input_query('C05557')
test1.cpds[0].mol
Let's disassemble this input compound.
Disassemble
# Disassemble the query molecule
# It takes about 30 sec.
test1.disassemble()
If there are multiple Disassemble results, multiple will be output.
# List output files
sorted(glob.glob('./output/' + test1.name + '/*'))
['./output/C05557/0.png',
'./output/C05557/1.png',
'./output/C05557/10.png',
'./output/C05557/11.png',
'./output/C05557/12.png',
'./output/C05557/2.png',
'./output/C05557/3.png',
'./output/C05557/4.png',
'./output/C05557/5.png',
'./output/C05557/6.png',
'./output/C05557/7.png',
'./output/C05557/8.png',
'./output/C05557/9.png',
'./output/C05557/C05557.mol',
'./output/C05557/result.txt']
The most promising candidates can be illustrated as follows. Different colors indicate different fragments (from different biosynthetic units).
# Display the first image
display_png(Image('./output/' + test1.name + '/0.png'))
To find out which compound the fragment came from:
bu_info = test1.output_matched_bu(result_id=0)
For example, to know the compound from which the most promising candidate fragment is derived
n = 0
print('Biosynthetic Unit IDs:')
print(bu_info[n]['bu_id'])
bu_info[n]['mol']
Biosynthetic Unit IDs:
['C00956_01', 'C01251_04']
The above C00956 and C01251 are the KEGG IDs of the derived compounds.
C00956_01 → C00956
https://www.genome.jp/dbget-bin/www_bget?cpd:C00956 ★
C01251_04 → C01251
https://www.genome.jp/dbget-bin/www_bget?cpd:C01251
If you do not know the KEGG ID of the input compound, or if the compound does not exist in the KEGG database, you can enter it using the SMILES format. Let's use Dihydroclavaminic acid as an example.
# Create an instance and input a query molecule
test2 = medi.MetaDisassembler()
test2.input_query('[H][C@]12CC(=O)N1[C@@H]([C@@H](CCN)O2)C(O)=O')
test2.cpds[0].mol
Let's disassemble this input compound.
# Disassemble the query molecule
# It takes about 2 min.
test2.disassemble()
# List output files
sorted(glob.glob('./output/' + test2.name + '/*'))
['./output/C8H12N2O4/0.png',
'./output/C8H12N2O4/1.png',
'./output/C8H12N2O4/10.png',
'./output/C8H12N2O4/2.png',
'./output/C8H12N2O4/3.png',
'./output/C8H12N2O4/4.png',
'./output/C8H12N2O4/5.png',
'./output/C8H12N2O4/6.png',
'./output/C8H12N2O4/7.png',
'./output/C8H12N2O4/8.png',
'./output/C8H12N2O4/9.png',
'./output/C8H12N2O4/C8H12N2O4.mol',
'./output/C8H12N2O4/result.txt']
Click here for the most promising candidates.
# Display the first image
display_png(Image('./output/' + test2.name + '/0.png'))
The derived compound is
bu_info = test2.output_matched_bu(result_id=0)
n = 0
print('Biosynthetic Unit IDs:')
print(bu_info[n]['bu_id'])
bu_info[n]['mol']
Biosynthetic Unit IDs:
['C00062_01', 'C00077']
https://www.genome.jp/dbget-bin/www_bget?cpd:C00062 ★
https://www.genome.jp/dbget-bin/www_bget?cpd:C00077
Similarly, if you do not know the KEGG ID of the input compound, or if the compound does not exist in the KEGG database, you can enter it using the InChI format. Let's use Curcumin diglucoside as an example.
# Create an instance and input a query molecule
test3 = medi.MetaDisassembler()
test3.input_query('InChI=1S/C33H40O16/c1-44-22-11-16(5-9-20(22)46-32-30(42)28(40)26(38)24(14-34)48-32)3-7-18(36)13-19(37)8-4-17-6-10-21(23(12-17)45-2)47-33-31(43)29(41)27(39)25(15-35)49-33/h3-13,24-36,38-43H,14-15H2,1-2H3/b7-3+,8-4+,18-13-/t24-,25-,26-,27-,28+,29+,30-,31-,32-,33-/m1/s1')
test3.cpds[0].mol
Let's disassemble this input compound.
# Disassemble the query molecule
test3.disassemble()
# List output files
sorted(glob.glob('./output/' + test3.name + '/*'))
['./output/C33H40O16/0.png',
'./output/C33H40O16/1.png',
'./output/C33H40O16/10.png',
'./output/C33H40O16/11.png',
'./output/C33H40O16/12.png',
'./output/C33H40O16/13.png',
'./output/C33H40O16/2.png',
'./output/C33H40O16/3.png',
'./output/C33H40O16/4.png',
'./output/C33H40O16/5.png',
'./output/C33H40O16/6.png',
'./output/C33H40O16/7.png',
'./output/C33H40O16/8.png',
'./output/C33H40O16/9.png',
'./output/C33H40O16/C33H40O16.mol',
'./output/C33H40O16/result.txt']
Click here for the most promising candidates.
# Display the first image
display_png(Image('./output/' + test3.name + '/0.png'))
The derived compound is
bu_info = test3.output_matched_bu(result_id=0)
n = 0
print('Biosynthetic Unit IDs:')
print(bu_info[n]['bu_id'])
bu_info[n]['mol']
Biosynthetic Unit IDs:
['C00029_04', 'C00031_05']
https://www.genome.jp/dbget-bin/www_bget?cpd:C00029
https://www.genome.jp/dbget-bin/www_bget?cpd:C00031 ★
You can also use the ID of the KNApSAcK database, a well-known secondary metabolite database, for input. Let's use C00011250 Fumigaclavine C as an example.
# Create an instance and input a query molecule
test4 = medi.MetaDisassembler()
test4.input_query('C00011250')
test4.cpds[0].mol
Let's disassemble this input compound.
# Disassemble the query molecule
test4.disassemble()
# List output files
sorted(glob.glob('./output/' + test4.name + '/*'))
['./output/C00011250/0.png',
'./output/C00011250/1.png',
'./output/C00011250/2.png',
'./output/C00011250/3.png',
'./output/C00011250/4.png',
'./output/C00011250/5.png',
'./output/C00011250/C00011250.mol',
'./output/C00011250/result.txt']
For this compound, the third candidate was the correct answer. "
# Display the "third" image
display_png(Image('./output/' + test4.name + '/2.png'))
Its derived compound is
bu_info = test4.output_matched_bu(result_id=2)
n = 0
print('Biosynthetic Unit IDs:')
print(bu_info[n]['bu_id'])
bu_info[n]['mol']
Biosynthetic Unit IDs:
['C00078_01', 'C00398']
https://www.genome.jp/dbget-bin/www_bget?cpd:C00078 ★
https://www.genome.jp/dbget-bin/www_bget?cpd:C00398
!zip -r /content/result.zip /content/output
from google.colab import files
files.download('/content/result.zip')
Recommended Posts