# Source code for ord_rxn_converter.metadata_module
# %%
import re
from ord_schema.message_helpers import load_message, write_message
from ord_schema.proto import dataset_pb2, reaction_pb2
from google.protobuf.message import Message
[docs]
def extract_dataset_metadata(dataset):
    """
    Extract key metadata from a loaded ORD dataset message.

    Reads the dataset ID, name and description from ``dataset`` and derives a
    custom "MDS" dataset ID by replacing everything up to the first hyphen of
    the original ID with the ``mds_dataset`` prefix.

    Args:
        dataset (dataset_pb2.Dataset):
            A dataset message loaded via `load_message` from the ORD schema.
            Only its ``dataset_id``, ``name`` and ``description`` attributes
            are read.

    Returns:
        list:
            A list containing the following metadata fields, in this order:
            - `dataset_id` (str): Custom MDS-formatted dataset ID.
            - `ord_dataset_id` (str): Original dataset ID from ORD.
            - `name` (str): Human-readable name of the dataset.
            - `description` (str): Textual description of the dataset.

    Raises:
        IndexError: If ``dataset.dataset_id`` contains no ``-`` separator
            (for example, when it is empty), so no MDS ID can be derived.

    Example:
        >>> from ord_rxn_converter.metadata_module import extract_dataset_metadata
        >>> dataset = load_message("example_dataset.pb", dataset_pb2.Dataset())
        >>> extract_dataset_metadata(dataset)
        ['mds_dataset-000001', 'ord_dataset-000001', '...', '...']
    """
    ord_dataset_id = dataset.dataset_id

    # Split on the FIRST hyphen only and keep the whole remainder, so an ID
    # whose suffix itself contains hyphens is not silently truncated.
    _prefix, separator, suffix = ord_dataset_id.partition("-")
    if not separator:
        # IndexError is kept (rather than ValueError) so the failure type is
        # the same one callers got from the previous `split(...)[1]` lookup.
        raise IndexError(
            f"dataset_id {ord_dataset_id!r} has no '-' separator; "
            "cannot derive an MDS dataset ID"
        )

    mds_dataset_id = f"mds_dataset-{suffix}"
    return [mds_dataset_id, ord_dataset_id, dataset.name, dataset.description]
[docs]
def _person_row(person):
    """Return one contributor row: ``[username, name, orcid, organization, email]``."""
    return [
        person.username,
        person.name,
        person.orcid,
        person.organization,
        person.email,
    ]


def extract_reaction_metadata(provenance, reactionID):
    """
    Extract reaction-level provenance and contributor metadata from a reaction.

    Reads the provenance message of a reaction and returns two things: a flat
    list of reaction-level provenance fields, and one row per contributor
    (the experimenter, the person who created the record, and the person
    behind each modification event).

    Args:
        provenance:
            The provenance message attached to a reaction
            (``reaction.provenance``).
        reactionID (str):
            The unique identifier of the reaction being processed.

    Returns:
        tuple:
            - `provenance_data` (list): Reaction-level metadata, in order:
                - `reactionID`
                - `experimenter_orcid`
                - `city`
                - `experiment_start`
                - `doi`
                - `patent`
                - `publication_url`
                - `created_time`
                - `created_person_orcid`
                - `created_details`
                - `modified_times` (str, comma-separated)
                - `modified_people` (str, comma-separated ORCIDs)
            - `person_metadata` (list of list): Contributor metadata. The
              first row is the experimenter, the second the record creator,
              followed by one row per modification event. Every row has the
              same layout:
              `[username, full_name, orcid, organization, email]`

    Example:
        >>> from ord_rxn_converter.metadata_module import extract_reaction_metadata
        >>> reaction = dataset.reactions[0]
        >>> extract_reaction_metadata(reaction.provenance, "reaction-001")
        (['reaction-001', '0000-0001-...', 'Boston', ...],
         [['jsmith', 'John Smith', '0000-0001-...', ...], ...])
    """
    created = provenance.record_created
    # Materialise once so the modification events are traversed consistently
    # for the rows and for both comma-separated summaries.
    modified_records = list(provenance.record_modified)

    # All contributor rows share one column order. Previously the rows built
    # from modification events put the ORCID first while the experimenter and
    # creator rows put the username first, which mixed layouts in one list.
    person_metadata = [
        _person_row(provenance.experimenter),
        _person_row(created.person),
    ]
    person_metadata.extend(_person_row(record.person) for record in modified_records)

    modified_times = ", ".join(record.time.value for record in modified_records)
    modified_people = ", ".join(record.person.orcid for record in modified_records)

    provenance_data = [
        reactionID,
        provenance.experimenter.orcid,
        provenance.city,
        # NOTE(review): passed through unchanged, unlike the creation time
        # below which uses `.time.value`; if `experiment_start` is a DateTime
        # message rather than a string, `.value` may be intended - confirm.
        provenance.experiment_start,
        provenance.doi,
        provenance.patent,
        provenance.publication_url,
        created.time.value,
        created.person.orcid,
        created.details,
        modified_times,
        modified_people,
    ]
    return provenance_data, person_metadata