Source code for ord_rxn_converter.dataset_module


# import requirements
from ord_schema.message_helpers import load_message, write_message
from ord_schema.proto import dataset_pb2, reaction_pb2
from google.protobuf.message import Message
import pandas as pd
import re
#function imports
from ord_rxn_converter.metadata_module import extract_dataset_metadata, extract_reaction_metadata
from ord_rxn_converter.identifiers_module import extract_reaction_identifiers, generate_compound_table
from ord_rxn_converter.conditions_module import extract_reaction_conditions
from ord_rxn_converter.inputs_module import extract_input_components, extract_input_addition
from ord_rxn_converter.notes_observations_module import extract_notes_observations
from ord_rxn_converter.outcomes_module import extract_reaction_outcomes
from ord_rxn_converter.setup_module import extract_reaction_setup
from ord_rxn_converter.workups_module import extract_reaction_workups
from ord_rxn_converter.utility_functions_module import extract_all_enums



[docs]
def extract_dataset (filepath, compounds=pd.DataFrame(), persons=pd.DataFrame()):
    """
    Extracts all structured data from an ORD dataset file and organizes it into a dictionary of DataFrames.

    This function loads a dataset from a `.pb` or `.pbtxt` file (compressed or uncompressed), then extracts and organizes
    its reactions and associated metadata into tabular form. Each component of the reaction—identifiers, inputs, conditions, 
    setup, workups, outcomes, and more—is parsed into a separate `pandas.DataFrame`.

    If compound or person tables are provided, they will be updated to include any new compounds or people found during extraction.

    Args:
        filepath (str): Path to the input file (either zipped or unzipped Google Protobuf format).
        compounds (pd.DataFrame, optional): Existing compound table to update or append to. Defaults to an empty DataFrame.
        persons (pd.DataFrame, optional): Existing person table to update or append to. Defaults to an empty DataFrame.

    Returns:
        dict: A dictionary containing the following keys, each mapping to a `pandas.DataFrame`:
        
            - `"dataset_metadata"`: Dataset-level metadata.
            - `"reaction_metadata"`: Reaction-level metadata including provenance and contributor info.
            - `"reaction_identifiers"`: SMILES, InChI, and other identifiers for each reaction.
            - `"input_components"`: Details of each input component (compound, amount, role, etc.).
            - `"input_addition"`: Temporal details for the addition of inputs.
            - `"reaction_setup"`: Setup information including vessels and automation.
            - `"reaction_conditions"`: Environmental and operational reaction conditions.
            - `"reaction_notes"`: Observations and experimental notes.
            - `"reaction_workups"`: Post-reaction processing steps.
            - `"reaction_outcomes"`: Products and analyses of reaction outcomes.
            - `"compound"`: A table of all compounds involved across reactions.
            - `"person"`: A table of contributors extracted from provenance.

    Raises:
        FileNotFoundError: If the `filepath` does not exist.
        ValueError: If the Protobuf file is invalid or does not conform to `dataset_pb2.Dataset`.

    Example:
        >>> from ord_rxn_converter.dataset_module import extract_dataset
        >>> out = extract_dataset("example_dataset.pb")
        >>> out["reaction_metadata"].head()
    """
    enums_data = extract_all_enums(reaction_pb2)

    # 'load_message' to extract dataset from file path?
    dataset = load_message(filepath, dataset_pb2.Dataset)

    #initialize lists for compiling generated output dataframes 
    reaction_metadata = []
    reaction_identifiers = []
    input_components = []  #being weird for now
    input_addition = []
    reaction_setup = []
    reaction_conditions = []
    reaction_notes_observations = []
    reaction_workups = []
    reaction_outcomes = []

    #persons column headers
    persons_cols = ['ORCiD', 'username', 'name', 'organization', 'email']
    #compounds column headers
    compounds_cols = ['InChIKey', 'smiles', 'inchi', 'iupacName', 'name', 'casNumber', 'pubchemCID', 'chemspiderID', 'cxSmiles', 'unspecified', 'custom', 'molblock', 'xyz', 'uniprotID', 'pbdID', 'aminoAcidSequence', 'helm', 'mdl']

    #check that persons cols match expectation
    if persons.columns.tolist() != persons_cols:
        print("Persons column input headers inoperable - creating new DataFrame")
        #init column headers for empty DF
        persons = pd.DataFrame(columns=persons_cols)

    #check that the compounds cols match expectation
    if compounds.columns.tolist() != compounds_cols:
        print("Compounds column input headers inoperable - creating new DataFrame") 
        compounds = pd.DataFrame(columns=compounds_cols)


    # generate dataset metadata table
    dataset_metadata = extract_dataset_metadata(dataset)
    
    # Set a reaction to extract data from: 
    for reaction in dataset.reactions:        
        # extract reactionID
        rxnID = re.split('-', reaction.reaction_id)
        reactionID = f"mds_reaction-{rxnID[1]}" 

        provenance = reaction.provenance 
        # extract reaction metadata (reaction IDs + provenance); 
        if hasattr(reaction, 'provenance') and reaction.provenance:    #check if provenance attribtue exists before calling
            rxn_metadata, person_metadata = extract_reaction_metadata(provenance, reactionID)
            rxn_metadata = [dataset_metadata[0], reactionID] + rxn_metadata    #dataset_metadata[0] is datasetID
            reaction_metadata.append(rxn_metadata)

            #check if person table needs update
            for person in person_metadata:
                if 'ORCiD' in persons.columns and not persons['ORCiD'].str.contains(person[2]).any():
                   #update persons table if it does not exist
                   persons.loc[len(persons)] = person
       
        # extract reaction identifiers and update compound table if needed
        if hasattr(reaction, 'identifiers') and reaction.identifiers:     #check if exists before calling
            reaction_identifiers.append(extract_reaction_identifiers(reaction.identifiers, reactionID))

        # extract reaction inputs, update compound table, extract reaction addition
        if hasattr(reaction, 'inputs') and reaction.inputs:
            #extract reaction inputs
            input_components, compound_identifiers = extract_input_components(reaction.inputs, reactionID)  #append each list extracted output separately
          #  print("PRINTING COMPOUND IDENTIFIERS:", compound_identifiers)
            for identifier in compound_identifiers:
                if 'InChIKey' in compounds.columns and identifier[0] and not compounds['InChIKey'].str.contains(identifier[0]).any():  #check if contains InChIKey
                    #update compounds table if it does not exist
                    #compounds = pd.concat([compounds, identifier], ignore_index=True)
                    compounds.loc[len(compounds)] = identifier


            #extract reaction addition
            input_addition.extend(extract_input_addition(reaction.inputs, reactionID))
       
        # extract reaction setup
        if hasattr(reaction, 'setup') and reaction.setup:
            reaction_setup.append(extract_reaction_setup(reaction.setup, reactionID))

        # extract reaction conditions  
        if hasattr(reaction, 'conditions') and reaction.conditions:
            reaction_conditions.append(extract_reaction_conditions(reaction.conditions, reactionID))
            

        # extract reaction notes & observations
        if hasattr(reaction, 'notes') and hasattr(reaction, 'observations') and reaction.notes and reaction.observations:
            reaction_notes_observations.append(extract_notes_observations(reactionID, reaction.notes, reaction.observations))

        # extract reaction workups
        if hasattr(reaction, 'workups') and reaction.workups:
            reaction_workups.extend(extract_reaction_workups(reaction.workups, reactionID))

        # extract reaction outcomes 
        if hasattr(reaction, 'outcomes') and reaction.outcomes:
            outcomes, outcomes_identifiers = extract_reaction_outcomes(reactionID, reaction.outcomes)
            reaction_outcomes.extend(outcomes)
            for identifier in outcomes_identifiers: 
                if 'InChIKey' in compounds.columns and identifier[0] and not compounds['InChIKey'].str.contains(identifier[0]).any():  #check if contains InChIKey
                #update compounds table if it does not exist
                #compounds = pd.concat([compounds, identifier], ignore_index=True)
                    compounds.loc[len(compounds)] = identifier
            
    
    # return dictionary of dataframes 

    #define column headers for each dataframe
    dataset_cols = ['datasetID', 'ORDdatasetID', 'datasetName', 'datasetDescription']
    reaction_meta_cols = ['datasetID', 'reactionID', 'ORDreactionID', 'experimenter', 
                            'provenanceCity', 'experimentStart', 'doi', 'patent', 'publicationURL', 
                            'recordCreatedTime', 'recordCreatedPerson', 'recordCreatedDetails', 'modifiedTimes', 'modifiedPeople']
    reaction_identifiers_cols = ['reactionID', 'reactionCXSMILES', 'reactionSMILES', 'RDFile', 'RInChI', 
                                'reactionType', 'unspecified', 'custom', 'identifierDetails', 'isMapped']
    input_comps_cols = ['reactionID', 'inputKey', 'compoundIdenfiers', 'amount', 'amountUnit', 'reactionRole',
                        'isLimiting', 'compoundPreparation', 'compoundSource', 'features', 'analyses', 'texture']
    input_addition_cols = ['reactionID', 'inputKey', 'additionOrder', 'additionTime', 'timeUnit', 'additionSpeed', 
                            'additionDuration', 'durationUnit', 'additionDevice', 'additionTemperature', 
                            'temperatureUnit', 'flowRate', 'flowRateUnit', 'texture', 'textureDetails']
    reaction_setup_cols = ['reactionID', 'vessel', 'vesselMaterial', 'vesselVolume', 'volumeUnit', 'vesselPreparations', 
                            'vesselAttachments', 'isAutomated', 'automationPlatform', 'automationCode', 
                            'reactionEnvironment']
    reaction_conds_cols = ['reactionID', 'temperatureConditions', 'pressureConditions', 'stirringConditions', 
                            'illuminationConditions', 'electrochemistryConditions', 'flowConditions', 
                            'reflux', 'pH', 'conditionsAreDynamic', 'conditionDetails']
    reaction_notes_cols = ['reactionID', 'isHeterogeneous', 'formsPrecipitates', 'isExothermic', 'offGasses', 
                            'isSensitiveToMoisture', 'isSensitiveToOxygen', 'isSensitivetoLight', 'safetyNotes', 
                            'procedureDetails', 'observations']
    reaction_workups_cols = ['reactionID', 'workupType', 'workupDetails', 'workupDuration', 'durationUnit', 
                            'inputComponents', 'inputAdditionDetails', 'temperatureConditions', 'keepPhase', 
                            'stirringConditions', 'workupTargetPH', 'isAutomated']
    reaction_outcomes_cols = ['reactionID', 'outcomeKey', 'reactionTime', 'timeUnit', 'outcomeConversion', 'products', 'analyses']

    #create dictionary of dataframes to output
    out = {
        "dataset_metadata" : pd.DataFrame([dataset_metadata], columns=dataset_cols),
        "reaction_metadata" : pd.DataFrame(reaction_metadata, columns=reaction_meta_cols), 
        "reaction_identifiers" : pd.DataFrame(reaction_identifiers, columns=reaction_identifiers_cols) , 
        "input_components" : pd.DataFrame(input_components, columns=input_comps_cols),
        "input_addition" : pd.DataFrame(input_addition, columns=input_addition_cols),
        "reaction_setup" : pd.DataFrame(reaction_setup, columns=reaction_setup_cols), 
        "reaction_conditions" : pd.DataFrame(reaction_conditions, columns=reaction_conds_cols), 
        "reaction_notes" : pd.DataFrame(reaction_notes_observations, columns=reaction_notes_cols),
        "reaction_workups" : pd.DataFrame(reaction_workups, columns=reaction_workups_cols),
        "reaction_outcomes" : pd.DataFrame(reaction_outcomes, columns=reaction_outcomes_cols),
        "compound" : compounds,
        "person" : persons
    }
    return out