# Source code for ord_rxn_converter.dataset_module
# import requirements
from ord_schema.message_helpers import load_message, write_message
from ord_schema.proto import dataset_pb2, reaction_pb2
from google.protobuf.message import Message
import pandas as pd
import re
#function imports
from ord_rxn_converter.metadata_module import extract_dataset_metadata, extract_reaction_metadata
from ord_rxn_converter.identifiers_module import extract_reaction_identifiers, generate_compound_table
from ord_rxn_converter.conditions_module import extract_reaction_conditions
from ord_rxn_converter.inputs_module import extract_input_components, extract_input_addition
from ord_rxn_converter.notes_observations_module import extract_notes_observations
from ord_rxn_converter.outcomes_module import extract_reaction_outcomes
from ord_rxn_converter.setup_module import extract_reaction_setup
from ord_rxn_converter.workups_module import extract_reaction_workups
from ord_rxn_converter.utility_functions_module import extract_all_enums
# [docs]
def extract_dataset(filepath, compounds=pd.DataFrame(), persons=pd.DataFrame()):
    """
    Extract all structured data from an ORD dataset file into a dictionary of DataFrames.

    The dataset is loaded with `load_message` as a `dataset_pb2.Dataset`, then every
    reaction is passed to the per-section extractors (metadata, identifiers, inputs,
    setup, conditions, notes/observations, workups, outcomes). The rows returned for all
    reactions are accumulated and turned into one `pandas.DataFrame` per section.

    If compound or person tables with the expected column headers are provided, they are
    updated in place with any compound or person whose key is not already present. A table
    whose headers do not match is replaced by a new, empty table (a message is printed).

    Args:
        filepath (str): Path to the input dataset file, passed to `load_message`.
        compounds (pd.DataFrame, optional): Existing compound table to update. Defaults to
            an empty DataFrame, which is never mutated because its (empty) headers do not
            match and a fresh table is created instead.
        persons (pd.DataFrame, optional): Existing person table to update. Defaults to an
            empty DataFrame (same remark as for `compounds`).

    Returns:
        dict: A dictionary containing the following keys, each mapping to a `pandas.DataFrame`:
            - `"dataset_metadata"`: Dataset-level metadata.
            - `"reaction_metadata"`: Reaction-level metadata including provenance.
            - `"reaction_identifiers"`: Identifiers for each reaction.
            - `"input_components"`: Input components of every reaction.
            - `"input_addition"`: Addition details of the inputs.
            - `"reaction_setup"`: Setup information.
            - `"reaction_conditions"`: Reaction conditions.
            - `"reaction_notes"`: Notes and observations.
            - `"reaction_workups"`: Workup steps.
            - `"reaction_outcomes"`: Reaction outcomes.
            - `"compound"`: The (possibly updated) compound table.
            - `"person"`: The (possibly updated) person table.

    Raises:
        FileNotFoundError: If the `filepath` does not exist (propagated from
            `load_message`; not raised directly here).
        ValueError: If the file cannot be loaded as a `dataset_pb2.Dataset` (propagated
            from `load_message`), or if a reaction ID does not contain a ``-`` separator.

    Example:
        >>> from ord_rxn_converter.dataset_module import extract_dataset
        >>> out = extract_dataset("example_dataset.pb")
        >>> out["reaction_metadata"].head()
    """
    # NOTE(review): the result is not used anywhere in this function; the call is kept
    # in case it matters elsewhere - confirm before removing.
    enums_data = extract_all_enums(reaction_pb2)

    dataset = load_message(filepath, dataset_pb2.Dataset)

    # Row accumulators, one per output table.
    reaction_metadata = []
    reaction_identifiers = []
    input_components = []
    input_addition = []
    reaction_setup = []
    reaction_conditions = []
    reaction_notes_observations = []
    reaction_workups = []
    reaction_outcomes = []

    # Expected headers of the two shared tables.
    # NOTE: 'pbdID' is kept exactly as spelled; callers may already rely on this header.
    persons_cols = ['ORCiD', 'username', 'name', 'organization', 'email']
    compounds_cols = ['InChIKey', 'smiles', 'inchi', 'iupacName', 'name', 'casNumber', 'pubchemCID',
                      'chemspiderID', 'cxSmiles', 'unspecified', 'custom', 'molblock', 'xyz',
                      'uniprotID', 'pbdID', 'aminoAcidSequence', 'helm', 'mdl']

    # A table with unexpected headers (including the default empty DataFrame) is replaced
    # by a new empty table with the expected headers.
    if persons.columns.tolist() != persons_cols:
        print("Persons column input headers inoperable - creating new DataFrame")
        persons = pd.DataFrame(columns=persons_cols)
    if compounds.columns.tolist() != compounds_cols:
        print("Compounds column input headers inoperable - creating new DataFrame")
        compounds = pd.DataFrame(columns=compounds_cols)

    # Dataset-level metadata; its first element is used below as the datasetID.
    dataset_metadata = extract_dataset_metadata(dataset)

    for reaction in dataset.reactions:
        reactionID = _mds_reaction_id(reaction.reaction_id)

        # Reaction metadata (IDs + provenance) and the people it mentions.
        if reaction.provenance:
            rxn_metadata, person_metadata = extract_reaction_metadata(reaction.provenance, reactionID)
            reaction_metadata.append([dataset_metadata[0], reactionID] + rxn_metadata)
            # NOTE(review): the key is read from index 2 of each person row but compared
            # against the 'ORCiD' column, which is the first table column - confirm the
            # row layout returned by extract_reaction_metadata.
            _append_unseen_rows(persons, person_metadata, 'ORCiD', 2, skip_empty_keys=False)

        # Reaction identifiers.
        if reaction.identifiers:
            reaction_identifiers.append(extract_reaction_identifiers(reaction.identifiers, reactionID))

        # Reaction inputs: components, compounds they reference, and addition details.
        if reaction.inputs:
            components, compound_identifiers = extract_input_components(reaction.inputs, reactionID)
            # Accumulate across reactions; rebinding the list here would keep only the
            # components of the last reaction.
            input_components.extend(components)
            _append_unseen_rows(compounds, compound_identifiers, 'InChIKey', 0, skip_empty_keys=True)
            input_addition.extend(extract_input_addition(reaction.inputs, reactionID))

        # Reaction setup.
        if reaction.setup:
            reaction_setup.append(extract_reaction_setup(reaction.setup, reactionID))

        # Reaction conditions.
        if reaction.conditions:
            reaction_conditions.append(extract_reaction_conditions(reaction.conditions, reactionID))

        # Reaction notes & observations.
        # NOTE(review): both must be truthy, so a reaction lacking either one contributes
        # no row - confirm this is intended.
        if reaction.notes and reaction.observations:
            reaction_notes_observations.append(
                extract_notes_observations(reactionID, reaction.notes, reaction.observations))

        # Reaction workups.
        if reaction.workups:
            reaction_workups.extend(extract_reaction_workups(reaction.workups, reactionID))

        # Reaction outcomes and the compounds they reference.
        if reaction.outcomes:
            outcomes, outcomes_identifiers = extract_reaction_outcomes(reactionID, reaction.outcomes)
            reaction_outcomes.extend(outcomes)
            _append_unseen_rows(compounds, outcomes_identifiers, 'InChIKey', 0, skip_empty_keys=True)

    # Column headers of each output table (spellings are part of the output schema and
    # are kept unchanged).
    dataset_cols = ['datasetID', 'ORDdatasetID', 'datasetName', 'datasetDescription']
    reaction_meta_cols = ['datasetID', 'reactionID', 'ORDreactionID', 'experimenter',
                          'provenanceCity', 'experimentStart', 'doi', 'patent', 'publicationURL',
                          'recordCreatedTime', 'recordCreatedPerson', 'recordCreatedDetails',
                          'modifiedTimes', 'modifiedPeople']
    reaction_identifiers_cols = ['reactionID', 'reactionCXSMILES', 'reactionSMILES', 'RDFile', 'RInChI',
                                 'reactionType', 'unspecified', 'custom', 'identifierDetails', 'isMapped']
    input_comps_cols = ['reactionID', 'inputKey', 'compoundIdenfiers', 'amount', 'amountUnit', 'reactionRole',
                        'isLimiting', 'compoundPreparation', 'compoundSource', 'features', 'analyses', 'texture']
    input_addition_cols = ['reactionID', 'inputKey', 'additionOrder', 'additionTime', 'timeUnit', 'additionSpeed',
                           'additionDuration', 'durationUnit', 'additionDevice', 'additionTemperature',
                           'temperatureUnit', 'flowRate', 'flowRateUnit', 'texture', 'textureDetails']
    reaction_setup_cols = ['reactionID', 'vessel', 'vesselMaterial', 'vesselVolume', 'volumeUnit',
                           'vesselPreparations', 'vesselAttachments', 'isAutomated', 'automationPlatform',
                           'automationCode', 'reactionEnvironment']
    reaction_conds_cols = ['reactionID', 'temperatureConditions', 'pressureConditions', 'stirringConditions',
                           'illuminationConditions', 'electrochemistryConditions', 'flowConditions',
                           'reflux', 'pH', 'conditionsAreDynamic', 'conditionDetails']
    reaction_notes_cols = ['reactionID', 'isHeterogeneous', 'formsPrecipitates', 'isExothermic', 'offGasses',
                           'isSensitiveToMoisture', 'isSensitiveToOxygen', 'isSensitivetoLight', 'safetyNotes',
                           'procedureDetails', 'observations']
    reaction_workups_cols = ['reactionID', 'workupType', 'workupDetails', 'workupDuration', 'durationUnit',
                             'inputComponents', 'inputAdditionDetails', 'temperatureConditions', 'keepPhase',
                             'stirringConditions', 'workupTargetPH', 'isAutomated']
    reaction_outcomes_cols = ['reactionID', 'outcomeKey', 'reactionTime', 'timeUnit', 'outcomeConversion',
                              'products', 'analyses']

    # Assemble the dictionary of output tables.
    out = {
        "dataset_metadata": pd.DataFrame([dataset_metadata], columns=dataset_cols),
        "reaction_metadata": pd.DataFrame(reaction_metadata, columns=reaction_meta_cols),
        "reaction_identifiers": pd.DataFrame(reaction_identifiers, columns=reaction_identifiers_cols),
        "input_components": pd.DataFrame(input_components, columns=input_comps_cols),
        "input_addition": pd.DataFrame(input_addition, columns=input_addition_cols),
        "reaction_setup": pd.DataFrame(reaction_setup, columns=reaction_setup_cols),
        "reaction_conditions": pd.DataFrame(reaction_conditions, columns=reaction_conds_cols),
        "reaction_notes": pd.DataFrame(reaction_notes_observations, columns=reaction_notes_cols),
        "reaction_workups": pd.DataFrame(reaction_workups, columns=reaction_workups_cols),
        "reaction_outcomes": pd.DataFrame(reaction_outcomes, columns=reaction_outcomes_cols),
        "compound": compounds,
        "person": persons,
    }
    return out


def _mds_reaction_id(ord_reaction_id):
    """Build the ``mds_reaction-<suffix>`` ID from the second ``-``-separated part of an ORD reaction ID."""
    parts = ord_reaction_id.split('-')
    if len(parts) < 2:
        # Fail with a message that names the offending ID instead of a bare IndexError.
        raise ValueError(
            f"Reaction ID {ord_reaction_id!r} does not contain a '-' separator; "
            "cannot derive the reaction ID suffix"
        )
    return f"mds_reaction-{parts[1]}"


def _append_unseen_rows(table, rows, key_column, key_index, skip_empty_keys):
    """Append to `table` (in place) every row whose `row[key_index]` is not yet a value of `key_column`."""
    # Exact-match membership: a regex/substring test would treat a key as already
    # present when it is merely contained in another key, and would misbehave on keys
    # holding regex metacharacters.
    seen = set(table[key_column])
    for row in rows:
        key = row[key_index]
        if skip_empty_keys and not key:
            continue
        if key in seen:
            continue
        label = len(table)
        table.loc[label] = row
        # Track what was actually stored in the key column, so later checks compare
        # against the column contents exactly as a fresh scan of the table would.
        seen.add(table.at[label, key_column])