Source code for ord_rxn_converter.identifiers_module

# import requirements: 
from ord_schema.proto import dataset_pb2, reaction_pb2
from google.protobuf.message import Message
from rdkit import Chem
from rdkit.Chem import AllChem
from ord_rxn_converter.utility_functions_module import extract_all_enums

# Build the enum lookup table once, at import time, so that every function in
# this module can translate the `type` field of an identifier message into a
# label. The functions below index it first by a qualified enum name (e.g.
# 'ReactionIdentifier.ReactionIdentifierType') and then by the identifier's
# `type` value.
# NOTE(review): the exact structure returned by extract_all_enums is defined in
# utility_functions_module - confirm there.
# TODO - have importable object instead..?
enums_data = extract_all_enums(reaction_pb2)

def extract_reaction_identifiers(identifiers, reactionID: str) -> list:
    """
    Flattens the identifiers of a single reaction into one fixed-order row.

    Every identifier's type is translated to its label through ``enums_data``.
    If the same type occurs more than once, the last occurrence wins.

    Args:
        identifiers (list): A list of `ReactionIdentifier` protobuf messages
            (each is read for `type`, `value`, `details` and `is_mapped`).
        reactionID (str): Unique reaction ID string; copied into the first slot.

    Returns:
        list: A list in the format:
            [reactionID, reaction_smiles, reaction_cxsmiles, rdfile, rinchi,
             reaction_type, unspecified, custom, details_dict, mapped_dict]
            Value slots are None when the reaction carries no identifier of
            that type. `details_dict` and `mapped_dict` map each type label to
            the identifier's `details` and `is_mapped` fields respectively.

    Example:
        >>> from identifiers_module import extract_reaction_identifiers
        >>> row = extract_reaction_identifiers(reaction.identifiers, 'rxn-000001')
        >>> row[0], row[1]
        ('rxn-000001', 'CCO>>CC=O')
    """
    # One mapping per exported field, all keyed by the identifier type label.
    value_by_label = {}
    details_dict = {}
    mapped_dict = {}

    for entry in identifiers:
        label = enums_data['ReactionIdentifier.ReactionIdentifierType'][entry.type]
        value_by_label[label] = entry.value
        details_dict[label] = entry.details
        mapped_dict[label] = entry.is_mapped

    # Order of the value columns between the reaction ID and the two dicts.
    value_columns = (
        'REACTION_SMILES',
        'REACTION_CXSMILES',
        'RDFILE',
        'RINCHI',
        'REACTION_TYPE',
        'UNSPECIFIED',
        'CUSTOM',
    )

    row = [reactionID]
    row.extend(value_by_label.get(column) for column in value_columns)
    row.append(details_dict)
    row.append(mapped_dict)
    return row
from rdkit import Chem
def extract_compound_identifiers(compound_identifiers):
    """
    Extracts compound identifier values and fills in missing key identifiers.

    Identifier types are translated to labels through ``enums_data``. If the
    same type occurs more than once, the last occurrence wins. Missing
    identifiers are then derived with RDKit where possible:

    * INCHI_KEY from INCHI when no InChI key is present;
    * INCHI and INCHI_KEY from SMILES when neither is present;
    * CXSMILES from SMILES when no CXSMILES is present.

    A structure string that RDKit cannot parse is skipped: the corresponding
    derived identifiers are simply left out instead of raising.

    Args:
        compound_identifiers (list): A list of `CompoundIdentifier` protobuf
            messages (each is read for `type` and `value`).

    Returns:
        tuple:
            - str | None: InChI key of the compound, or None if it is neither
              given nor derivable.
            - dict: Dictionary of identifier type labels to their values,
              including any identifiers derived here.

    Example:
        >>> from identifiers_module import extract_compound_identifiers
        >>> compound_identifiers = reaction.inputs['...'].components[0].identifiers
        >>> extract_compound_identifiers(compound_identifiers)
        ('LFQSCWFLJHTTHZ-UHFFFAOYSA-N', {'NAME': 'ethanol', 'SMILES': 'CCO', ...})
    """
    identifier_dict = {}
    for identifier in compound_identifiers:
        identifier_type = enums_data['CompoundIdentifier.CompoundIdentifierType'][identifier.type]
        identifier_dict[identifier_type] = identifier.value

    # get() returns None for identifier types the compound does not carry.
    inchi_key = identifier_dict.get('INCHI_KEY')
    inchi = identifier_dict.get('INCHI')
    smiles = identifier_dict.get('SMILES')
    cxsmiles = identifier_dict.get('CXSMILES')

    # Parsed SMILES is kept so the CXSMILES step does not have to parse again.
    smiles_mol = None

    if inchi_key is None and inchi:
        inchi_mol = Chem.MolFromInchi(inchi)
        # RDKit returns None (rather than raising) for unparsable input.
        if inchi_mol is not None:
            inchi_key = Chem.MolToInchiKey(inchi_mol)
            identifier_dict['INCHI_KEY'] = inchi_key
    elif inchi_key is None and inchi is None and smiles:
        smiles_mol = Chem.MolFromSmiles(smiles)
        if smiles_mol is not None:
            identifier_dict['INCHI'] = Chem.MolToInchi(smiles_mol)
            inchi_key = Chem.MolToInchiKey(smiles_mol)
            identifier_dict['INCHI_KEY'] = inchi_key

    if smiles and cxsmiles is None:
        if smiles_mol is None:
            smiles_mol = Chem.MolFromSmiles(smiles)
        # Guard against an invalid SMILES: MolToCXSmiles(None) would raise.
        if smiles_mol is not None:
            identifier_dict['CXSMILES'] = Chem.MolToCXSmiles(smiles_mol)

    return inchi_key, identifier_dict
def generate_compound_table(compound_identifiers):
    """
    Generates a full set of compound identifiers in a fixed order.

    Identifier types are translated to labels through ``enums_data``. If the
    same type occurs more than once, the last occurrence wins. Missing
    identifiers are then derived with RDKit where possible:

    * INCHI_KEY from INCHI when no InChI key is present;
    * INCHI and INCHI_KEY from SMILES when neither is present;
    * CXSMILES from SMILES when no CXSMILES is present.

    A structure string that RDKit cannot parse is skipped: the derived columns
    stay None instead of the call raising.

    Args:
        compound_identifiers (list): A list of `CompoundIdentifier` protobuf
            messages, typically accessed via
            `reaction.inputs['m1_m2'].components[0].identifiers`.

    Returns:
        list: A list of compound identifier values in this order:
            [inchi_key, smiles, inchi, iupac_name, name, cas_number,
             pubchem_cid, chemspider_id, cxsmiles, unspecified, custom,
             molblock, xyz, uniprot_id, pdb_id, amino_acid_sequence,
             helm, mdl]
            Entries are None for identifiers that are neither given nor
            derivable.

    Example:
        >>> from identifiers_module import generate_compound_table
        >>> compound_identifiers = reaction.inputs['...'].components[0].identifiers
        >>> generate_compound_table(compound_identifiers)
        ['LFQSCWFLJHTTHZ-UHFFFAOYSA-N', 'CCO', 'InChI=1S/C2H6O/...', ...]
    """
    identifier_dict = {}
    for identifier in compound_identifiers:
        identifier_type = enums_data['CompoundIdentifier.CompoundIdentifierType'][identifier.type]
        identifier_dict[identifier_type] = identifier.value
    # TODO - figure out what to do with the identifiers' `details` field
    # (currently not exported).

    smiles = identifier_dict.get('SMILES')
    # Parsed SMILES is kept so the CXSMILES step does not have to parse again.
    smiles_mol = None

    if identifier_dict.get('INCHI_KEY') is None:
        inchi = identifier_dict.get('INCHI')
        if inchi:
            inchi_mol = Chem.MolFromInchi(inchi)
            # RDKit returns None (rather than raising) for unparsable input.
            if inchi_mol is not None:
                identifier_dict['INCHI_KEY'] = Chem.MolToInchiKey(inchi_mol)
        elif inchi is None and smiles:
            smiles_mol = Chem.MolFromSmiles(smiles)
            if smiles_mol is not None:
                identifier_dict['INCHI'] = Chem.MolToInchi(smiles_mol)
                identifier_dict['INCHI_KEY'] = Chem.MolToInchiKey(smiles_mol)

    if smiles and identifier_dict.get('CXSMILES') is None:
        if smiles_mol is None:
            smiles_mol = Chem.MolFromSmiles(smiles)
        # Guard against an invalid SMILES: MolToCXSmiles(None) would raise.
        if smiles_mol is not None:
            identifier_dict['CXSMILES'] = Chem.MolToCXSmiles(smiles_mol)

    # Column order of the returned row; must stay in sync with the docstring.
    column_order = (
        'INCHI_KEY',
        'SMILES',
        'INCHI',
        'IUPAC_NAME',
        'NAME',
        'CAS_NUMBER',
        'PUBCHEM_CID',
        'CHEMSPIDER_ID',
        'CXSMILES',
        'UNSPECIFIED',
        'CUSTOM',
        'MOLBLOCK',
        'XYZ',
        'UNIPROT_ID',
        'PDB_ID',
        'AMINO_ACID_SEQUENCE',
        'HELM',
        'MDL',
    )
    return [identifier_dict.get(column) for column in column_order]