Source code for ord_rxn_converter.identifiers_module

# import requirements: 
from ord_schema.proto import dataset_pb2, reaction_pb2
from google.protobuf.message import Message
from rdkit import Chem
from rdkit.Chem import AllChem
from ord_rxn_converter.utility_functions_module import extract_all_enums

# Build the enum lookup once, at import time, so the functions in this module can share it.
# It is indexed below as enums_data['<Message>.<EnumType>'][<enum value>], and the results
# are used as identifier-type names such as 'SMILES' or 'INCHI_KEY'.
# TODO: expose this as an importable object instead of building it here?
enums_data = extract_all_enums(reaction_pb2)


[docs]
def extract_reaction_identifiers(identifiers, reactionID: str) -> list:
    """
    Flatten the identifiers of one reaction into a fixed-order record.

    Each identifier's numeric type is translated to its enum name through
    ``enums_data``. If the same identifier type occurs more than once, the
    last occurrence wins.

    Args:
        identifiers (list): A list of `ReactionIdentifier` protobuf messages.
        reactionID (str): Unique reaction ID string; copied into the first slot.

    Returns:
        list: A list in the format:
            [reactionID, reaction_smiles, reaction_cxsmiles, rdfile, rinchi,
             reaction_type, unspecified, custom, details_dict, mapped_dict]
            Slots for identifier types that are absent hold ``None``.
            ``details_dict`` and ``mapped_dict`` map each identifier type name
            that is present to its ``details`` and ``is_mapped`` field.

    Example:
        For a reaction carrying a single REACTION_SMILES identifier whose
        details are 'atom-mapped' and whose is_mapped flag is True:

        >>> from identifiers_module import extract_reaction_identifiers
        >>> extract_reaction_identifiers(reaction.identifiers, 'rxn-000001')
        ['rxn-000001', 'CCO>>CC=O', None, None, None,
         None, None, None,
         {'REACTION_SMILES': 'atom-mapped'}, {'REACTION_SMILES': True}]
    """
    # One dictionary per field, all keyed by the identifier type name.
    value_by_type = {}
    details_dict = {}
    mapped_dict = {}

    for entry in identifiers:
        type_name = enums_data['ReactionIdentifier.ReactionIdentifierType'][entry.type]
        value_by_type[type_name] = entry.value
        details_dict[type_name] = entry.details
        mapped_dict[type_name] = entry.is_mapped

    # Order of the value slots that follow reactionID in the output record.
    slot_order = (
        'REACTION_SMILES',
        'REACTION_CXSMILES',
        'RDFILE',
        'RINCHI',
        'REACTION_TYPE',
        'UNSPECIFIED',
        'CUSTOM',
    )

    return [
        reactionID,
        *(value_by_type.get(slot) for slot in slot_order),
        details_dict,
        mapped_dict,
    ]


from rdkit import Chem


[docs]
def extract_compound_identifiers(compound_identifiers):
    """
    Extracts compound identifier values and fills in missing key identifiers.

    Each identifier's numeric type is translated to its enum name through
    ``enums_data``. If the same identifier type occurs more than once, the
    last occurrence wins.

    Missing identifiers are derived with RDKit where possible:

    * INCHI_KEY from INCHI, when an InChI is given but no key is;
    * INCHI and INCHI_KEY from SMILES, when neither is given;
    * CXSMILES from SMILES, when no CXSMILES is given.

    An identifier that RDKit cannot parse is skipped rather than raising: the
    derived entries are simply left out of the dictionary.

    Args:
        compound_identifiers (list): A list of `CompoundIdentifier` protobuf messages.

    Returns:
        tuple:
            - str or None: InChI key of the compound, or ``None`` if it was
              neither supplied nor derivable.
            - dict: Dictionary of identifier type names to their values,
              including any derived entries.

    Example:
        >>> from identifiers_module import extract_compound_identifiers
        >>> compound_identifiers = reaction.inputs['...'].components[0].identifiers
        >>> extract_compound_identifiers(compound_identifiers)
        ('ROSDSFDQCJNGOL-UHFFFAOYSA-N', {'NAME': 'dimethylamine', 'SMILES': 'CNC', ...})
    """
    identifier_dict = {}
    for identifier in compound_identifiers:
        identifier_type = enums_data['CompoundIdentifier.CompoundIdentifierType'][identifier.type]
        identifier_dict[identifier_type] = identifier.value

    # get() returns None for identifier types that were not supplied.
    inchi_key = identifier_dict.get('INCHI_KEY')
    inchi = identifier_dict.get('INCHI')
    smiles = identifier_dict.get('SMILES')
    cxsmiles = identifier_dict.get('CXSMILES')

    if inchi_key is None and inchi:
        rdkit_mol = Chem.MolFromInchi(inchi)
        # RDKit returns None when it cannot parse the input.
        if rdkit_mol is not None:
            inchi_key = Chem.MolToInchiKey(rdkit_mol)
            identifier_dict['INCHI_KEY'] = inchi_key
    elif inchi_key is None and inchi is None and smiles:
        rdkit_mol = Chem.MolFromSmiles(smiles)
        if rdkit_mol is not None:
            identifier_dict['INCHI'] = Chem.MolToInchi(rdkit_mol)
            inchi_key = Chem.MolToInchiKey(rdkit_mol)
            identifier_dict['INCHI_KEY'] = inchi_key

    if smiles and cxsmiles is None:
        rdkit_mol = Chem.MolFromSmiles(smiles)
        # Guard against unparseable SMILES: MolToCXSmiles(None) would raise.
        if rdkit_mol is not None:
            identifier_dict['CXSMILES'] = Chem.MolToCXSmiles(rdkit_mol)

    return inchi_key, identifier_dict



[docs]
def generate_compound_table(compound_identifiers):
    """
    Generates a full set of compound identifiers in a fixed order.

    Each identifier's numeric type is translated to its enum name through
    ``enums_data``. If the same identifier type occurs more than once, the
    last occurrence wins.

    Missing identifiers are derived with RDKit where possible:

    * INCHI_KEY from INCHI, when an InChI is given but no key is;
    * INCHI and INCHI_KEY from SMILES, when neither is given;
    * CXSMILES from SMILES, when no CXSMILES is given.

    An identifier that RDKit cannot parse is skipped rather than raising: the
    corresponding slots in the output stay ``None``.

    Args:
        compound_identifiers (list): A list of `CompoundIdentifier` protobuf messages,
            typically accessed via `reaction.inputs['m1_m2'].components[0].identifiers`.

    Returns:
        list: A list of compound identifier values in this order:
            [inchi_key, smiles, inchi, iupac_name, name, cas_number, pubchem_cid,
             chemspider_id, cxsmiles, unspecified, custom, molblock, xyz, uniprot_id,
             pdb_id, amino_acid_sequence, helm, mdl]
            Slots for identifiers that are neither supplied nor derivable hold ``None``.

    Example:
        >>> from identifiers_module import generate_compound_table
        >>> compound_identifiers = reaction.inputs['...'].components[0].identifiers
        >>> generate_compound_table(compound_identifiers)
        ['LFQSCWFLJHTTHZ-UHFFFAOYSA-N', 'CCO', 'InChI=1S/C2H6O/...', ...]
    """
    identifier_dict = {}
    for identifier in compound_identifiers:
        identifier_type = enums_data['CompoundIdentifier.CompoundIdentifierType'][identifier.type]
        identifier_dict[identifier_type] = identifier.value

    # TODO - figure out what to do with the identifier details (currently not returned)

    inchi_key = identifier_dict.get('INCHI_KEY')
    inchi = identifier_dict.get('INCHI')
    smiles = identifier_dict.get('SMILES')

    if inchi_key is None and inchi:
        rdkit_mol = Chem.MolFromInchi(inchi)
        # RDKit returns None when it cannot parse the input.
        if rdkit_mol is not None:
            identifier_dict['INCHI_KEY'] = Chem.MolToInchiKey(rdkit_mol)
    elif inchi_key is None and inchi is None and smiles:
        rdkit_mol = Chem.MolFromSmiles(smiles)
        if rdkit_mol is not None:
            identifier_dict['INCHI'] = Chem.MolToInchi(rdkit_mol)
            identifier_dict['INCHI_KEY'] = Chem.MolToInchiKey(rdkit_mol)

    if smiles and identifier_dict.get('CXSMILES') is None:
        rdkit_mol = Chem.MolFromSmiles(smiles)
        # Guard against unparseable SMILES: MolToCXSmiles(None) would raise.
        if rdkit_mol is not None:
            identifier_dict['CXSMILES'] = Chem.MolToCXSmiles(rdkit_mol)

    # Fixed column order of the output record.
    column_order = (
        'INCHI_KEY',
        'SMILES',
        'INCHI',
        'IUPAC_NAME',
        'NAME',
        'CAS_NUMBER',
        'PUBCHEM_CID',
        'CHEMSPIDER_ID',
        'CXSMILES',
        'UNSPECIFIED',
        'CUSTOM',
        'MOLBLOCK',
        'XYZ',
        'UNIPROT_ID',
        'PDB_ID',
        'AMINO_ACID_SEQUENCE',
        'HELM',
        'MDL',
    )

    return [identifier_dict.get(column) for column in column_order]