"""A client for the ClassyFire API which enables efficient querying with
chemical database files"""
import requests
import csv
import time
import os
import json
# Base URL of the ClassyFire web service; every request path in this module
# is appended to it.
url = "http://classyfire.wishartlab.com"
# Number of structures the file-based helpers send in a single query.
chunk_size = 1000
# Seconds to sleep between polls while a submitted query is not yet "Done".
sleep_interval = 60
def structure_query(compound, label='pyclassyfire'):
    """Submit structures to the ClassyFire service and return the query id.

    The returned id is what the result-fetching functions expect.

    :param compound: The compound structures as line delimited InChIKey or
        SMILES. Optionally a tab-separated id may be prepended to each
        structure.
    :type compound: str
    :param label: A label for the query
    :type label: str
    :return: A query ID number
    :rtype: int
    :raises requests.exceptions.HTTPError: if the service answers with an
        error status

    >>> structure_query('CCC', 'smiles_test')
    >>> structure_query('InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)')

    """
    # The body is a hand-assembled JSON document: label and compound are
    # interpolated verbatim, so they must already be valid inside a JSON
    # string (callers pass line breaks as the two-character escape).
    payload = ('{"label": "%s", "query_input": "%s", '
               '"query_type": "STRUCTURE"}' % (label, compound))
    response = requests.post(url + '/queries.json',
                             data=payload,
                             headers={"Content-Type": "application/json"})
    response.raise_for_status()
    return response.json()['id']
def iupac_query(compound, label='pyclassyfire'):
    """Submit IUPAC compound names to the ClassyFire service and return the
    query id.

    The returned id is what the result-fetching functions expect.

    :param compound: The line delimited compound names. Optionally a
        tab-separated id may be prepended to each compound.
    :type compound: str
    :param label: A label for the query
    :type label: str
    :return: A query ID number
    :rtype: int
    :raises requests.exceptions.HTTPError: if the service answers with an
        error status

    >>> iupac_query('ethane', 'iupac_test')
    >>> iupac_query('C001\\tethane\\nC002\\tethanol', 'iupac_test')

    """
    # The body is a hand-assembled JSON document: label and compound are
    # interpolated verbatim, so they must already be valid inside a JSON
    # string.
    payload = ('{"label": "%s", "query_input": "%s", '
               '"query_type": "IUPAC_NAME"}' % (label, compound))
    response = requests.post(url + '/queries.json',
                             data=payload,
                             headers={"Content-Type": "application/json"})
    response.raise_for_status()
    return response.json()['id']
def get_results(query_id, return_format="json"):
    """Fetch the classification results of a previously submitted query.

    :param query_id: A numeric query id returned at time of query submission
    :type query_id: str
    :param return_format: desired return format. Valid types are json, csv
        or sdf
    :type return_format: str
    :return: The response body, as text in the requested format
    :rtype: str
    :raises requests.exceptions.HTTPError: if the service answers with an
        error status

    >>> get_results('595535', 'csv')
    >>> get_results('595535', 'json')
    >>> get_results('595535', 'sdf')

    """
    # The format selects both the resource extension and the content type.
    endpoint = '%s/queries/%s.%s' % (url, query_id, return_format)
    content_type = "application/%s" % return_format
    response = requests.get(endpoint,
                            headers={"Content-Type": content_type})
    response.raise_for_status()
    return response.text
def get_entity(inchikey, return_format="json"):
    """Fetch the classification of a previously queried structure by its
    InChIKey.

    :param inchikey: An InChIKey for a previously calculated chemical
        structure. A leading ``InChIKey=`` prefix is removed before the
        request is made.
    :type inchikey: str
    :param return_format: desired return format. Valid types are json, csv
        or sdf
    :type return_format: str
    :return: The response body, as text in the requested format
    :rtype: str
    :raises requests.exceptions.HTTPError: if the service answers with an
        error status

    >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'csv')
    >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'json')
    >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'sdf')

    """
    bare_key = inchikey.replace('InChIKey=', '')
    endpoint = '%s/entities/%s.%s' % (url, bare_key, return_format)
    content_type = "application/%s" % return_format
    response = requests.get(endpoint,
                            headers={"Content-Type": content_type})
    response.raise_for_status()
    return response.text
def get_chemont_node(chemontid):
    """Return data for the TaxNode with ID chemontid.

    :param chemontid: the ChemOnt ID of the entity.
    :type chemontid: str
    :return: The response body for the node, as JSON text.
    :rtype: str
    :raises requests.exceptions.HTTPError: if the service answers with an
        error status

    >>> get_chemont_node('CHEMONTID:0004253')

    """
    # The endpoint is addressed with a shortened id: the "CHEMONTID:" prefix
    # is replaced by a single "C".
    node_id = chemontid.replace("CHEMONTID:", "C")
    endpoint = '%s/tax_nodes/%s.json' % (url, node_id)
    response = requests.get(endpoint,
                            headers={"Content-Type": "application/json"})
    response.raise_for_status()
    return response.text
def tabular_query(inpath, structure_key, dialect='excel', outpath=None,
                  outfields=('taxonomy', 'description', 'substituents')):
    """Given a path to a compound set in tabular form (comma or tab
    delimited), annotate all compounds and write the results to an expanded
    table.

    The table is read twice: once to submit the structures in batches of
    ``chunk_size`` and once more to pair each row with its classification
    while the batches are polled every ``sleep_interval`` seconds.

    :param inpath: path to compound file to be annotated
    :type inpath: str
    :param structure_key: column heading which contains the compounds
        InChIKey or SMILES
    :type structure_key: str
    :param dialect: dialect for parsing table (generally 'excel' for csv,
        'excel-tab' for tsv)
    :type dialect: str
    :param outpath: Path to desired output location. When omitted, a
        non-clashing path is derived from ``inpath``.
    :type outpath: str
    :param outfields: Fields to append to table from ClassyFire output
    :type outfields: tuple(string)

    >>> tabular_query('/tabulated_data.tsv', 'structure', 'excel-tab')

    """
    tax_fields = ('kingdom', 'superclass', 'class', 'subclass')
    query_ids = []
    if not outpath:
        outpath = _prevent_overwrite(inpath)

    # Plain text mode already applies universal newlines on Python 3; the
    # explicit 'U' flag was removed in Python 3.11. The context manager
    # closes the file even if a request fails part-way through.
    with open(inpath, 'r') as infile:
        # Pass 1: submit the structures in batches.
        comps = []
        for line in csv.DictReader(infile, dialect=dialect):
            comps.append(line[structure_key])
            if len(comps) == chunk_size:
                # The separator is the two-character escape sequence, not a
                # real line break: structure_query interpolates the text
                # into a JSON string literal.
                query_ids.append(structure_query('\\n'.join(comps)))
                comps = []
        if comps:
            query_ids.append(structure_query('\\n'.join(comps)))
        print('%s queries submitted to ClassyFire API' % len(query_ids))

        # Pass 2: re-read the table and append the requested fields.
        i = 0
        infile.seek(0)
        with open(outpath, 'w') as outfile:
            reader = csv.DictReader(infile, dialect=dialect)
            # fieldnames is None for an empty input file.
            header = list(reader.fieldnames or []) + list(outfields)
            writer = csv.DictWriter(outfile, header, dialect=dialect)
            writer.writeheader()
            while i < len(query_ids):
                result = json.loads(get_results(query_ids[i]))
                if result["classification_status"] == "Done":
                    # Entities come first in zip() so that, once a batch is
                    # exhausted, no extra row is consumed from the reader.
                    # NOTE(review): rows and entities are paired by position;
                    # this presumes the service returns one entity per
                    # submitted structure, in order - confirm.
                    for hit, line in zip(result['entities'], reader):
                        if 'taxonomy' in outfields:
                            hit['taxonomy'] = ";".join(
                                ['%s:%s' % (hit[x]['name'],
                                            hit[x]['chemont_id'])
                                 for x in tax_fields if hit[x]])
                        for field in outfields:
                            if isinstance(hit[field], list):
                                line[field] = ';'.join(hit[field])
                            else:
                                line[field] = hit[field]
                        writer.writerow(line)
                    i += 1
                else:
                    print("%s percent complete"
                          % round(i/len(query_ids)*100))
                    time.sleep(sleep_interval)
def sdf_query(inpath, outpath=None):
    """Given a path to a compound set in a sdf file, annotate all compounds
    and write results as attributes in a sdf file.

    Structures are converted to SMILES, submitted in batches of
    ``chunk_size`` and the batches are polled every ``sleep_interval``
    seconds until each one is done.

    :param inpath: path to compound file to be annotated
    :type inpath: str
    :param outpath: Path to desired output location. When omitted, a
        non-clashing path is derived from ``inpath``.
    :type outpath: str

    >>> sdf_query('/sdf_data.sdf')

    """
    # Imported here so that rdkit is only required for sdf handling.
    from rdkit.Chem import AllChem
    query_ids = []
    if not outpath:
        outpath = _prevent_overwrite(inpath)
    comps = []
    for mol in AllChem.SDMolSupplier(inpath):
        if not mol:
            # Records for which the supplier yields nothing usable are
            # skipped; they must not trigger a flush of an empty batch.
            continue
        comps.append(AllChem.MolToSmiles(mol))
        if len(comps) == chunk_size:
            # The separator is the two-character escape sequence, not a real
            # line break: structure_query interpolates the text into a JSON
            # string literal.
            query_ids.append(structure_query('\\n'.join(comps)))
            comps = []
    if comps:
        query_ids.append(structure_query('\\n'.join(comps)))
    print('%s queries submitted to ClassyFire API' % len(query_ids))
    i = 0
    with open(outpath, 'w') as outfile:
        while i < len(query_ids):
            # The JSON form is fetched only to read the status; the sdf form
            # is what gets written out.
            result = json.loads(get_results(query_ids[i]))
            if result["classification_status"] == "Done":
                outfile.write(get_results(query_ids[i], return_format='sdf'))
                i += 1
            else:
                print("%s percent complete"
                      % round(i / len(query_ids) * 100))
                time.sleep(sleep_interval)
def _prevent_overwrite(write_path, suffix='_annotated'):
"""Prevents overwrite of existing output files by appending a suffix when needed
:param write_path: potential write path
:type write_path: string
:return:
:rtype:
"""
while os.path.exists(write_path):
sp = write_path.split('.')
if len(sp) > 1:
sp[-2] += suffix
write_path = '.'.join(sp)
else:
write_path += suffix
return write_path