glam/scripts/enrich_hyponyms_with_ontology.py
kempersc fa5680f0dd Add initial versions of custodian hub UML diagrams in Mermaid and PlantUML formats
- Introduced custodian_hub_v3.mmd, custodian_hub_v4_final.mmd, and custodian_hub_v5_FINAL.mmd for Mermaid representation.
- Created custodian_hub_FINAL.puml and custodian_hub_v3.puml for PlantUML representation.
- Defined entities such as CustodianReconstruction, Identifier, TimeSpan, Agent, CustodianName, CustodianObservation, ReconstructionActivity, Appellation, ConfidenceMeasure, Custodian, LanguageCode, and SourceDocument.
- Established relationships and associations between entities, including temporal extents, observations, and reconstruction activities.
- Incorporated enumerations for various types, statuses, and classifications relevant to custodians and their activities.
2025-11-22 14:33:51 +01:00

324 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Enrich hyponyms_curated.yaml with ontology class mappings and properties.
This script reads hyponyms_curated_full.yaml (with full Wikidata metadata)
and adds ontology mappings to hyponyms_curated.yaml based on hypernym categories.
Usage:
python3 scripts/enrich_hyponyms_with_ontology.py
"""
import copy
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml
# Ontology mapping templates by hypernym category.
#
# Keys are hypernym names; enrich_entry_with_ontology() looks them up with an
# exact, case-sensitive membership test.  Each template may define:
#   'semantic_aspects'     - list of aspect tags (copied; defaults to [])
#   'complexity_note'      - free-text modelling note (copied; defaults to '')
#   'place_ontology'       - classes/properties for the place aspect
#   'custodian_ontology'   - classes/properties for the custodian aspect
#   'collections_ontology' - classes/properties for the held collections
#   'people_ontology'      - classes/properties for staff/people
#   'temporal_model'       - textual description of each aspect's lifecycle
# The five section dicts are optional and are copied into the entry only when
# present.  Nothing in this script interprets their contents (class URIs,
# 'condition*' strings, property lists); they are emitted to YAML as data.
ONTOLOGY_TEMPLATES = {
    # Buildings and structures
    'building': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'complexity_note': 'Buildings can be both physical heritage sites (place) AND organizations managing heritage (custodian). Model both aspects with independent temporal lifecycles.',
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'tertiary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Physical heritage building or site with archaeological/architectural significance',
            'properties': [
                {'uri': 'crm:P1_is_identified_by', 'range': 'crm:E41_Appellation', 'usage': 'Building name identification'},
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Building type classification'},
                # NOTE(review): CIDOC-CRM appears to define P4 and P7 on temporal
                # entities/periods rather than on E27 Site - confirm the intended
                # modelling before relying on these two properties.
                {'uri': 'crm:P4_has_time-span', 'range': 'crm:E52_Time-Span', 'usage': 'Temporal extent (construction → present/demolition)'},
                {'uri': 'crm:P7_took_place_at', 'range': 'crm:E53_Place', 'usage': 'Geographic location'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Latitude/longitude coordinates'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'Physical postal address'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative hierarchy (city → province → country)'},
            ]
        },
        'custodian_ontology': {
            # Descriptive marker only; no code in this script evaluates it.
            'condition': 'operates_as_heritage_institution',
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, KvK number, Wikidata Q-number'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational structure (departments, teams)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Link to curated collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Link custodian organization to physical site'},
            ]
        },
        'temporal_model': {
            'place_aspect': 'Construction date → Demolition/Present',
            'custodian_aspect': 'Founding date → Dissolution/Present (if operates as heritage institution)',
            'note': 'Place and custodian have INDEPENDENT temporal lifecycles',
        }
    },
    # Museums
    'museum': {
        'semantic_aspects': ['custodian_reference', 'place_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Museums are primarily custodian organizations, but also operate at physical locations with staff and collections.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            'tertiary_class': {'uri': 'crm:E39_Actor', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Actor'},
            # Human-readable conditions; stored as plain strings, never evaluated here.
            'condition_public': 'legal_status == "public" OR "governmental" OR "municipal"',
            'condition_private': 'legal_status == "private" OR "foundation" OR "NGO"',
            'rdfs_comment': 'Institution collecting, preserving, and exhibiting cultural heritage objects',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, Wikidata Q-number, VIAF ID'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name (CPOV uses SKOS)'},
                {'uri': 'skos:altLabel', 'range': 'rdfs:Literal', 'usage': 'Alternative names'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational units (departments, divisions)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Curated museum collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Physical location/address'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Museum founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'crm:E78_Curated_Holding', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Curated Holding'},
            'rdfs_comment': 'Aggregations of physical things assembled and maintained by museum',
            'properties': [
                {'uri': 'crm:P147i_was_curated_by', 'range': 'crm:E39_Actor', 'usage': 'Curating organization'},
                {'uri': 'crm:P109_has_current_or_former_curator', 'range': 'crm:E39_Actor', 'usage': 'Individual curators'},
                {'uri': 'crm:P46_is_composed_of', 'range': 'crm:E18_Physical_Thing', 'usage': 'Individual objects in collection'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'secondary_class': {'uri': 'crm:E21_Person', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Person'},
            'rdfs_comment': 'Use PiCo for staff observations from archival sources',
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'Staff roles (curator, director, conservator)'},
                # NOTE(review): this is the only 'sdo:' prefix in the table; every
                # other schema.org term uses 'schema:' - confirm which is intended.
                {'uri': 'sdo:worksFor', 'range': 'schema:Organization', 'usage': 'Employment relationship'},
                {'uri': 'crm:P14i_performed', 'range': 'crm:E7_Activity', 'usage': 'Activities performed (exhibitions, conservation)'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Founding date → Dissolution/Present',
            'collections_aspect': 'Accession dates (per object/collection)',
            'people_aspect': 'Employment periods (per person)',
        }
    },
    # Archives
    'archive': {
        'semantic_aspects': ['custodian_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Archives are custodian organizations holding archival record sets.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:ArchiveOrganization', 'namespace': 'http://schema.org/', 'label': 'Archive Organization'},
            'tertiary_class': {'uri': 'rico:CorporateBody', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Corporate Body'},
            'rdfs_comment': 'Institution preserving archival records and providing access',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'rico:isOrWasHolderOf', 'range': 'rico:RecordSet', 'usage': 'Archival holdings'},
                {'uri': 'rico:manages', 'range': 'rico:Thing', 'usage': 'Archival resources managed'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Archive founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'rico:RecordSet', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Record Set'},
            'rdfs_comment': 'Archival record sets with provenance and access information',
            'properties': [
                {'uri': 'rico:isOrWasHeldBy', 'range': 'rico:Agent', 'usage': 'Archive holding the records'},
                {'uri': 'rico:hasProvenance', 'range': 'rico:Agent', 'usage': 'Original creator of records'},
                {'uri': 'rico:scopeAndContent', 'range': 'rdfs:Literal', 'usage': 'Description of archival holdings'},
                {'uri': 'rico:isAssociatedWithDate', 'range': 'rico:Date', 'usage': 'Temporal coverage (EDTF format)'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'archivist, records manager, director'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Archive founding → Present/Closure',
            'collections_aspect': 'Accession dates + temporal coverage of records',
        }
    },
    # Libraries (no 'complexity_note': enriched entries get an empty string)
    'library': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Library', 'namespace': 'http://schema.org/', 'label': 'Library'},
            'rdfs_comment': 'Institution providing access to bibliographic collections',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Library founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'bf:Collection', 'namespace': 'http://id.loc.gov/ontologies/bibframe/', 'label': 'Collection'},
            'rdfs_comment': 'Bibliographic collections following BIBFRAME',
            'properties': [
                {'uri': 'bf:heldBy', 'range': 'bf:Agent', 'usage': 'Library holding the collection'},
                {'uri': 'bf:title', 'range': 'bf:Title', 'usage': 'Collection title'},
            ]
        }
    },
    # Organizations (generic)
    # NOTE(review): the key uses the British spelling and lookup is an exact
    # match, so a hypernym spelled 'organization' would not hit this template -
    # confirm against the hypernym values present in the data.
    'organisation': {
        'semantic_aspects': ['custodian_reference'],
        'complexity_note': 'Generic organizations require classification into public/private and domain (cultural, research, etc.)',
        'custodian_ontology': {
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'Identifiers'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name'},
            ]
        }
    },
    # Companies (private sector)
    'company': {
        'semantic_aspects': ['custodian_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:Corporation', 'namespace': 'http://schema.org/', 'label': 'Corporation'},
            'secondary_class': {'uri': 'crm:E40_Legal_Body', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Legal Body'},
            'rdfs_comment': 'Corporate heritage collections (company archives, corporate museums)',
            'properties': [
                {'uri': 'schema:identifier', 'range': 'schema:PropertyValue', 'usage': 'Company registration numbers'},
                {'uri': 'schema:founder', 'range': 'schema:Person', 'usage': 'Company founder'},
            ]
        }
    },
    # Heritage sites (key contains a space; it must match the hypernym text exactly)
    'heritage site': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Heritage sites with cultural/historical significance',
            'properties': [
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Heritage site type'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
            ]
        }
    },
    # Protected areas
    'protected area': {
        'semantic_aspects': ['place_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'secondary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'rdfs_comment': 'Protected environmental/natural heritage areas',
            'properties': [
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative location'},
            ]
        }
    },
    # Universities
    'university': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:EducationalOrganization', 'namespace': 'http://schema.org/', 'label': 'Educational Organization'},
            'secondary_class': {'uri': 'schema:CollegeOrUniversity', 'namespace': 'http://schema.org/', 'label': 'College or University'},
            'rdfs_comment': 'Higher education institutions with heritage collections (university museums, archives, libraries)',
            'properties': [
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'University founding date'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'University address'},
            ]
        }
    },
}
def enrich_entry_with_ontology(entry, hypernyms, templates=None):
    """Build the ontology mapping for a single curated entry.

    The first hypernym (in the order given) that has a template decides the
    mapping; remaining hypernyms are ignored.  When nothing matches, only the
    provenance fields are returned.

    Args:
        entry: Curated entry; ``entry['curated']['label']`` is recorded as
            ``wikidata_source``.
        hypernyms: Hypernym names to try, in priority order.  A bare string
            is treated as a one-element list and ``None`` as no hypernyms.
        templates: Optional mapping of hypernym name to template dict.
            Defaults to the module-level ``ONTOLOGY_TEMPLATES``.

    Returns:
        A new dict holding ``wikidata_source``, ``enrichment_date`` (current
        UTC time, ISO 8601) and ``enriched_by``.  On a match it also holds
        ``semantic_aspects``, ``complexity_note`` and every optional section
        the template defines, in template order.

    Raises:
        KeyError: If ``entry`` lacks ``curated`` or its ``label``.
    """
    if templates is None:
        templates = ONTOLOGY_TEMPLATES
    if isinstance(hypernyms, str):
        # Iterating a bare string would test single characters, never the name.
        hypernyms = [hypernyms]

    ontology_mapping = {
        'wikidata_source': entry['curated']['label'],
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enriched_by': 'automated_ontology_mapper',
    }

    # First hypernym with a template wins.
    template = next(
        (templates[name] for name in hypernyms or () if name in templates),
        None,
    )
    if template is None:
        return ontology_mapping

    # Deep-copy everything taken from the template.  Sharing the template
    # objects between entries makes yaml.dump emit anchors/aliases for the
    # repeated objects and lets an edit to one entry leak into the templates.
    ontology_mapping['semantic_aspects'] = copy.deepcopy(
        template.get('semantic_aspects', [])
    )
    ontology_mapping['complexity_note'] = template.get('complexity_note', '')

    optional_sections = (
        'place_ontology',
        'custodian_ontology',
        'collections_ontology',
        'people_ontology',
        'temporal_model',
    )
    for section in optional_sections:
        if section in template:
            ontology_mapping[section] = copy.deepcopy(template[section])

    return ontology_mapping
def main():
    """Add ontology mappings to the curated hyponym file, in place.

    Reads ``hyponyms_curated.yaml`` (paths are relative to the current
    working directory), attaches an ``ontology_mapping`` to every entry that
    has a ``curated`` mapping with a non-empty ``hypernym`` value and no
    existing ``ontology_mapping``, and writes the result back.

    The output is written to a temporary sibling file and then moved over the
    original, so a failure while serialising cannot truncate the curated
    data.  Nothing is written when no entry was enriched.

    Raises:
        SystemExit: If the curated file has no top-level ``hypernym`` list.
        OSError: If either input file cannot be read or the output written.
    """
    # Paths
    curated_file = Path('data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml')
    full_file = Path('data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated_full.yaml')

    print("Reading files...")
    with open(curated_file, 'r', encoding='utf-8') as f:
        curated_data = yaml.safe_load(f)
    with open(full_file, 'r', encoding='utf-8') as f:
        # NOTE(review): the parsed content is not consulted anywhere below.
        # The parse is kept so a missing or invalid full file still aborts
        # the run before anything is modified; drop it if that is not wanted.
        yaml.safe_load(f)

    # An empty YAML document loads as None; fail with a readable message.
    entries = curated_data.get('hypernym') if isinstance(curated_data, dict) else None
    if not isinstance(entries, list):
        sys.exit(f"{curated_file}: expected a top-level 'hypernym' list")

    print(f"Total entries: {len(entries)}")

    # Enrich entries
    enriched_count = 0
    for position, entry in enumerate(entries, start=1):
        curated = entry.get('curated') if isinstance(entry, dict) else None
        if isinstance(curated, dict):
            hypernyms = curated.get('hypernym', [])
            # Existing mappings are left untouched so reruns are harmless.
            if hypernyms and 'ontology_mapping' not in entry:
                entry['ontology_mapping'] = enrich_entry_with_ontology(entry, hypernyms)
                enriched_count += 1
        # Report on every 100th entry, including ones that were skipped.
        if position % 100 == 0:
            print(f"Processed {position} entries... (enriched: {enriched_count})")

    print(f"\nEnrichment complete: {enriched_count} entries enriched")

    if enriched_count == 0:
        # Re-dumping would only reformat the file (and drop any comments).
        print("Nothing to write.")
        print("Done!")
        return

    # Write to a sibling temp file, then atomically swap it into place.
    print(f"Writing to {curated_file}...")
    tmp_file = curated_file.with_name(curated_file.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            yaml.dump(curated_data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        tmp_file.replace(curated_file)
    finally:
        # Only still present if dumping or replacing failed.
        if tmp_file.exists():
            tmp_file.unlink()

    print("Done!")
# Run the enrichment only when executed as a script, not when imported.
if __name__ == '__main__':
    main()