#!/usr/bin/env python3
"""
Enrich hyponyms_curated.yaml with ontology class mappings and properties.

For every entry in hyponyms_curated.yaml that has curated hypernyms and no
``ontology_mapping`` yet, this script attaches an ontology mapping selected
from ONTOLOGY_TEMPLATES by hypernym category, then writes the file back.

The mapping depends only on the hypernym categories stored in
hyponyms_curated.yaml; hyponyms_curated_full.yaml (the variant with full
Wikidata metadata) is not read by this script.

Usage:
    python3 scripts/enrich_hyponyms_with_ontology.py
"""

import copy
import sys
from datetime import datetime, timezone
from pathlib import Path

# Default location of the curated file, relative to the working directory.
DEFAULT_CURATED_FILE = Path('data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml')

# Ontology mapping templates by hypernym category
ONTOLOGY_TEMPLATES = {
    # Buildings and structures
    'building': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'complexity_note': 'Buildings can be both physical heritage sites (place) AND organizations managing heritage (custodian). Model both aspects with independent temporal lifecycles.',
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'tertiary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Physical heritage building or site with archaeological/architectural significance',
            'properties': [
                {'uri': 'crm:P1_is_identified_by', 'range': 'crm:E41_Appellation', 'usage': 'Building name identification'},
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Building type classification'},
                {'uri': 'crm:P4_has_time-span', 'range': 'crm:E52_Time-Span', 'usage': 'Temporal extent (construction → present/demolition)'},
                {'uri': 'crm:P7_took_place_at', 'range': 'crm:E53_Place', 'usage': 'Geographic location'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Latitude/longitude coordinates'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'Physical postal address'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative hierarchy (city → province → country)'},
            ]
        },
        'custodian_ontology': {
            'condition': 'operates_as_heritage_institution',
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, KvK number, Wikidata Q-number'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational structure (departments, teams)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Link to curated collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Link custodian organization to physical site'},
            ]
        },
        'temporal_model': {
            'place_aspect': 'Construction date → Demolition/Present',
            'custodian_aspect': 'Founding date → Dissolution/Present (if operates as heritage institution)',
            'note': 'Place and custodian have INDEPENDENT temporal lifecycles',
        }
    },

    # Museums
    'museum': {
        'semantic_aspects': ['custodian_reference', 'place_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Museums are primarily custodian organizations, but also operate at physical locations with staff and collections.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            'tertiary_class': {'uri': 'crm:E39_Actor', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Actor'},
            'condition_public': 'legal_status == "public" OR "governmental" OR "municipal"',
            'condition_private': 'legal_status == "private" OR "foundation" OR "NGO"',
            'rdfs_comment': 'Institution collecting, preserving, and exhibiting cultural heritage objects',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, Wikidata Q-number, VIAF ID'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name (CPOV uses SKOS)'},
                {'uri': 'skos:altLabel', 'range': 'rdfs:Literal', 'usage': 'Alternative names'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational units (departments, divisions)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Curated museum collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Physical location/address'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Museum founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'crm:E78_Curated_Holding', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Curated Holding'},
            'rdfs_comment': 'Aggregations of physical things assembled and maintained by museum',
            'properties': [
                {'uri': 'crm:P147i_was_curated_by', 'range': 'crm:E39_Actor', 'usage': 'Curating organization'},
                {'uri': 'crm:P109_has_current_or_former_curator', 'range': 'crm:E39_Actor', 'usage': 'Individual curators'},
                {'uri': 'crm:P46_is_composed_of', 'range': 'crm:E18_Physical_Thing', 'usage': 'Individual objects in collection'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'secondary_class': {'uri': 'crm:E21_Person', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Person'},
            'rdfs_comment': 'Use PiCo for staff observations from archival sources',
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'Staff roles (curator, director, conservator)'},
                # NOTE(review): every other schema.org term in this table uses
                # the 'schema:' prefix; confirm that consumers resolve 'sdo:'.
                {'uri': 'sdo:worksFor', 'range': 'schema:Organization', 'usage': 'Employment relationship'},
                {'uri': 'crm:P14i_performed', 'range': 'crm:E7_Activity', 'usage': 'Activities performed (exhibitions, conservation)'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Founding date → Dissolution/Present',
            'collections_aspect': 'Accession dates (per object/collection)',
            'people_aspect': 'Employment periods (per person)',
        }
    },

    # Archives
    'archive': {
        'semantic_aspects': ['custodian_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Archives are custodian organizations holding archival record sets.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:ArchiveOrganization', 'namespace': 'http://schema.org/', 'label': 'Archive Organization'},
            'tertiary_class': {'uri': 'rico:CorporateBody', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Corporate Body'},
            'rdfs_comment': 'Institution preserving archival records and providing access',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'rico:isOrWasHolderOf', 'range': 'rico:RecordSet', 'usage': 'Archival holdings'},
                {'uri': 'rico:manages', 'range': 'rico:Thing', 'usage': 'Archival resources managed'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Archive founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'rico:RecordSet', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Record Set'},
            'rdfs_comment': 'Archival record sets with provenance and access information',
            'properties': [
                {'uri': 'rico:isOrWasHeldBy', 'range': 'rico:Agent', 'usage': 'Archive holding the records'},
                {'uri': 'rico:hasProvenance', 'range': 'rico:Agent', 'usage': 'Original creator of records'},
                {'uri': 'rico:scopeAndContent', 'range': 'rdfs:Literal', 'usage': 'Description of archival holdings'},
                {'uri': 'rico:isAssociatedWithDate', 'range': 'rico:Date', 'usage': 'Temporal coverage (EDTF format)'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'archivist, records manager, director'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Archive founding → Present/Closure',
            'collections_aspect': 'Accession dates + temporal coverage of records',
        }
    },

    # Libraries
    'library': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Library', 'namespace': 'http://schema.org/', 'label': 'Library'},
            'rdfs_comment': 'Institution providing access to bibliographic collections',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Library founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'bf:Collection', 'namespace': 'http://id.loc.gov/ontologies/bibframe/', 'label': 'Collection'},
            'rdfs_comment': 'Bibliographic collections following BIBFRAME',
            'properties': [
                {'uri': 'bf:heldBy', 'range': 'bf:Agent', 'usage': 'Library holding the collection'},
                {'uri': 'bf:title', 'range': 'bf:Title', 'usage': 'Collection title'},
            ]
        }
    },

    # Organizations (generic)
    'organisation': {
        'semantic_aspects': ['custodian_reference'],
        'complexity_note': 'Generic organizations require classification into public/private and domain (cultural, research, etc.)',
        'custodian_ontology': {
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'Identifiers'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name'},
            ]
        }
    },

    # Companies (private sector)
    'company': {
        'semantic_aspects': ['custodian_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:Corporation', 'namespace': 'http://schema.org/', 'label': 'Corporation'},
            'secondary_class': {'uri': 'crm:E40_Legal_Body', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Legal Body'},
            'rdfs_comment': 'Corporate heritage collections (company archives, corporate museums)',
            'properties': [
                {'uri': 'schema:identifier', 'range': 'schema:PropertyValue', 'usage': 'Company registration numbers'},
                {'uri': 'schema:founder', 'range': 'schema:Person', 'usage': 'Company founder'},
            ]
        }
    },

    # Heritage sites
    'heritage site': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Heritage sites with cultural/historical significance',
            'properties': [
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Heritage site type'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
            ]
        }
    },

    # Protected areas
    'protected area': {
        'semantic_aspects': ['place_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'secondary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'rdfs_comment': 'Protected environmental/natural heritage areas',
            'properties': [
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative location'},
            ]
        }
    },

    # Universities
    'university': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:EducationalOrganization', 'namespace': 'http://schema.org/', 'label': 'Educational Organization'},
            'secondary_class': {'uri': 'schema:CollegeOrUniversity', 'namespace': 'http://schema.org/', 'label': 'College or University'},
            'rdfs_comment': 'Higher education institutions with heritage collections (university museums, archives, libraries)',
            'properties': [
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'University founding date'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'University address'},
            ]
        }
    },
}

# Template sections copied into a mapping, in the order they are emitted.
_TEMPLATE_SECTIONS = (
    'place_ontology',
    'custodian_ontology',
    'collections_ontology',
    'people_ontology',
    'temporal_model',
)


def enrich_entry_with_ontology(entry, hypernyms):
    """Build the ontology mapping for a single entry from its hypernyms.

    The first hypernym that is a key of ONTOLOGY_TEMPLATES selects the
    template; later hypernyms are ignored.

    Args:
        entry: Entry dict. ``entry['curated']['label']`` is recorded as
            ``wikidata_source`` (``None`` when it is absent).
        hypernyms: Hypernym category names. A single string is treated as a
            one-element list; items that are not strings are skipped.

    Returns:
        A new dict with the keys ``wikidata_source``, ``enrichment_date``
        (current UTC time, ISO 8601) and ``enriched_by``. When a template
        matches it additionally holds ``semantic_aspects``,
        ``complexity_note`` and every section the template defines. When no
        hypernym matches, only the three provenance keys are present.
    """
    curated = entry.get('curated') if isinstance(entry, dict) else None
    label = curated.get('label') if isinstance(curated, dict) else None

    ontology_mapping = {
        'wikidata_source': label,
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enriched_by': 'automated_ontology_mapper',
    }

    # A scalar YAML value would otherwise be iterated character by character.
    if isinstance(hypernyms, str):
        hypernyms = [hypernyms]

    # Find best matching template (first hit wins)
    for hypernym in hypernyms or ():
        if not isinstance(hypernym, str):
            continue  # unhashable/odd values cannot be template keys
        template = ONTOLOGY_TEMPLATES.get(hypernym)
        if template is None:
            continue

        # Deep-copy everything taken from the template: sharing the template
        # objects between entries makes yaml.dump emit anchors/aliases
        # (&id001 / *id001) and lets an edit to one entry leak into the
        # module-level template and every other entry.
        ontology_mapping['semantic_aspects'] = copy.deepcopy(template.get('semantic_aspects', []))
        ontology_mapping['complexity_note'] = template.get('complexity_note', '')
        for section in _TEMPLATE_SECTIONS:
            if section in template:
                ontology_mapping[section] = copy.deepcopy(template[section])
        break

    return ontology_mapping


def _needs_enrichment(entry):
    """Return True if *entry* has curated hypernyms and no ontology mapping yet."""
    if not isinstance(entry, dict) or 'ontology_mapping' in entry:
        return False
    curated = entry.get('curated')
    return isinstance(curated, dict) and bool(curated.get('hypernym'))


def main(curated_file=DEFAULT_CURATED_FILE):
    """Enrich the curated YAML file in place.

    Args:
        curated_file: Path of the YAML file to read and rewrite. Its
            top-level ``hypernym`` key must hold the list of entries.

    Entries that already carry an ``ontology_mapping`` are left untouched, so
    the script can be re-run safely. Exits with an error message if the file
    has no top-level ``hypernym`` list.
    """
    # Imported here so the mapping logic above can be imported (and tested)
    # without PyYAML being installed.
    import yaml

    curated_file = Path(curated_file)

    print(f"Reading {curated_file}...")
    with open(curated_file, 'r', encoding='utf-8') as f:
        curated_data = yaml.safe_load(f)

    entries = curated_data.get('hypernym') if isinstance(curated_data, dict) else None
    if not isinstance(entries, list):
        sys.exit(f"Error: {curated_file} has no top-level 'hypernym' list; nothing to enrich")

    print(f"Total entries: {len(entries)}")

    # Enrich entries
    enriched_count = 0
    for i, entry in enumerate(entries, start=1):
        if _needs_enrichment(entry):
            entry['ontology_mapping'] = enrich_entry_with_ontology(entry, entry['curated']['hypernym'])
            enriched_count += 1

        if i % 100 == 0:
            print(f"Processed {i} entries... (enriched: {enriched_count})")

    print(f"\nEnrichment complete: {enriched_count} entries enriched")

    # Write to a sibling temp file first and swap it in afterwards, so a
    # failure while dumping cannot leave the curated file truncated.
    print(f"Writing to {curated_file}...")
    tmp_file = curated_file.with_name(curated_file.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            yaml.dump(curated_data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        tmp_file.replace(curated_file)
    except Exception:
        if tmp_file.exists():
            tmp_file.unlink()
        raise

    print("Done!")


if __name__ == '__main__':
    main()