Source code for pyclassyfire.client

"""A client for the ClassyFire API which enables efficient querying with 
chemical database files"""

import requests
import csv
import time
import os
import json

url = "http://classyfire.wishartlab.com"
chunk_size = 1000
sleep_interval = 60


[docs]def structure_query(compound, label='pyclassyfire'): """Submit a compound information to the ClassyFire service for evaluation and receive a id which can be used to used to collect results :param compound: The compound structures as line delimited inchikey or smiles. Optionally a tab-separated id may be prepended for each structure. :type compound: str :param label: A label for the query :type label: :return: A query ID number :rtype: int >>> structure_query('CCC', 'smiles_test') >>> structure_query('InChI=1S/C3H4O3/c1-2(4)3(5)6/h1H3,(H,5,6)') """ r = requests.post(url + '/queries.json', data='{"label": "%s", ' '"query_input": "%s", "query_type": "STRUCTURE"}' % (label, compound), headers={"Content-Type": "application/json"}) r.raise_for_status() return r.json()['id']
[docs]def iupac_query(compound, label='pyclassyfire'): """Submit a IUPAC compound name to the ClassyFire service for evaluation and receive a id which can be used to used to collect results :param compound: The line delimited compound names. Optionally a tab-separated id may be prepended for each compound. :type compound: str :param label: A label for the query :type label: :return: A query ID number :rtype: int >>> iupac_query('ethane', 'iupac_test') >>> iupac_query('C001\\tethane\\nC002\\tethanol', 'iupac_test') """ r = requests.post(url + '/queries.json', data='{"label": "%s", ' '"query_input": "%s", "query_type": "IUPAC_NAME"}' % (label, compound), headers={"Content-Type": "application/json"}) r.raise_for_status() return r.json()['id']
[docs]def get_results(query_id, return_format="json"): """Given a query_id, fetch the classification results. :param query_id: A numeric query id returned at time of query submission :type query_id: str :param return_format: desired return format. valid types are json, csv or sdf :type return_format: str :return: query information :rtype: str >>> get_results('595535', 'csv') >>> get_results('595535', 'json') >>> get_results('595535', 'sdf') """ r = requests.get('%s/queries/%s.%s' % (url, query_id, return_format), headers={"Content-Type": "application/%s" % return_format}) r.raise_for_status() return r.text
[docs]def get_entity(inchikey, return_format="json"): """Given a InChIKey for a previously queried structure, fetch the classification results. :param inchikey: An InChIKey for a previously calculated chemical structure :type inchikey: str :param return_format: desired return format. valid types are json, csv or sdf :type return_format: str :return: query information :rtype: str >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'csv') >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'json') >>> get_entity("ATUOYWHBWRKTHZ-UHFFFAOYSA-N", 'sdf') """ inchikey = inchikey.replace('InChIKey=', '') r = requests.get('%s/entities/%s.%s' % (url, inchikey, return_format), headers={ "Content-Type": "application/%s" % return_format}) r.raise_for_status() return r.text
[docs]def get_chemont_node(chemontid): """Return data for the TaxNode with ID chemontid. :param chemontid: the ChemOnt ID of the entity. :type chemontid: str :return: The classification results for the entity as json. :rtype: str >>> get_chemont_node('CHEMONTID:0004253') """ chemontid = chemontid.replace("CHEMONTID:", "C") r = requests.get('%s/tax_nodes/%s.json' % (url, chemontid), headers={"Content-Type": "application/json" }) r.raise_for_status() return r.text
[docs]def tabular_query(inpath, structure_key, dialect='excel', outpath=None, outfields=('taxonomy', 'description', 'substituents')): """Given a path to a compound set in tabular form (comma or tab delimited) annotate all compounds and write results to an expanded table. :param inpath: path to compound file to be annotated :type inpath: str :param structure_key: column heading which contains the compounds InChIKey or SMILES :type structure_key: str :param dialect: dialect for parsing table (generally 'excel' for csv, 'excel-tab' for tsv) :type dialect: str :param outpath: Path to desired output location :type outpath: str :param outfields: Fields to append to table from ClassyFire output :type outfields: tuple(string) >>> tabular_query('/tabulated_data.tsv', 'structure', 'excel-tab') """ tax_fields = ('kingdom', 'superclass', 'class', 'subclass') query_ids = [] infile = open(inpath, 'rU') if not outpath: outpath = _prevent_overwrite(inpath) comps = [] for line in csv.DictReader(infile, dialect=dialect): comps.append(line[structure_key]) if not len(comps) % chunk_size: query_ids.append(structure_query('/n'.join(comps))) comps = [] if comps: query_ids.append(structure_query('\\n'.join(comps))) print('%s queries submitted to ClassyFire API' % len(query_ids)) i = 0 infile.seek(0) with open(outpath, 'w') as outfile: reader = csv.DictReader(infile, dialect=dialect) writer = csv.DictWriter(outfile, reader.fieldnames+list(outfields), dialect=dialect) writer.writeheader() while i < len(query_ids): result = json.loads(get_results(query_ids[i])) if result["classification_status"] == "Done": for hit, line in zip(result['entities'], reader): if 'taxonomy' in outfields: hit['taxonomy'] = ";".join( ['%s:%s' % (hit[x]['name'], hit[x]['chemont_id']) for x in tax_fields if hit[x]]) for field in outfields: if isinstance(hit[field], list): line[field] = ';'.join(hit[field]) else: line[field] = hit[field] writer.writerow(line) i += 1 else: print("%s percent complete" % round(i/len(query_ids)*100)) time.sleep(sleep_interval) infile.close()
[docs]def sdf_query(inpath, outpath=None): """Given a path to a compound set in a sdf file, annotate all compounds and write results as attributes in a sdf file. :param inpath: path to compound file to be annotated :type inpath: str :param outpath: Path to desired output location :type outpath: str >>> sdf_query('/sdf_data.sdf') """ from rdkit.Chem import AllChem query_ids = [] if not outpath: outpath = _prevent_overwrite(inpath) comps = [] for mol in AllChem.SDMolSupplier(inpath): if mol: comps.append(AllChem.MolToSmiles(mol)) if not len(comps) % chunk_size: query_ids.append(structure_query('/n'.join(comps))) comps = [] if comps: query_ids.append(structure_query('\\n'.join(comps))) print('%s queries submitted to ClassyFire API' % len(query_ids)) i = 0 with open(outpath, 'w') as outfile: while i < len(query_ids): result = json.loads(get_results(query_ids[i])) if result["classification_status"] == "Done": outfile.write(get_results(query_ids[i], return_format='sdf')) i += 1 else: print("%s percent complete" % round(i / len(query_ids) * 100)) time.sleep(sleep_interval)
def _prevent_overwrite(write_path, suffix='_annotated'): """Prevents overwrite of existing output files by appending a suffix when needed :param write_path: potential write path :type write_path: string :return: :rtype: """ while os.path.exists(write_path): sp = write_path.split('.') if len(sp) > 1: sp[-2] += suffix write_path = '.'.join(sp) else: write_path += suffix return write_path