- Introduced custodian_hub_v3.mmd, custodian_hub_v4_final.mmd, and custodian_hub_v5_FINAL.mmd for Mermaid representation. - Created custodian_hub_FINAL.puml and custodian_hub_v3.puml for PlantUML representation. - Defined entities such as CustodianReconstruction, Identifier, TimeSpan, Agent, CustodianName, CustodianObservation, ReconstructionActivity, Appellation, ConfidenceMeasure, Custodian, LanguageCode, and SourceDocument. - Established relationships and associations between entities, including temporal extents, observations, and reconstruction activities. - Incorporated enumerations for various types, statuses, and classifications relevant to custodians and their activities.
324 lines
18 KiB
Python
324 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
Enrich hyponyms_curated.yaml with ontology class mappings and properties.

This script reads hyponyms_curated_full.yaml (with full Wikidata metadata)
and adds ontology mappings to hyponyms_curated.yaml based on hypernym categories.

Usage:
    python3 scripts/enrich_hyponyms_with_ontology.py
"""
|
|
|
|
import copy
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml
|
|
|
|
# Ontology mapping templates by hypernym category.
#
# Each key is a hypernym label; the value describes which semantic aspects
# apply and, per aspect, the ontology classes and properties to use. The
# optional sections are 'place_ontology', 'custodian_ontology',
# 'collections_ontology', 'people_ontology' and 'temporal_model'.
ONTOLOGY_TEMPLATES = {
    # Buildings and structures
    'building': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'complexity_note': 'Buildings can be both physical heritage sites (place) AND organizations managing heritage (custodian). Model both aspects with independent temporal lifecycles.',
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'tertiary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Physical heritage building or site with archaeological/architectural significance',
            'properties': [
                {'uri': 'crm:P1_is_identified_by', 'range': 'crm:E41_Appellation', 'usage': 'Building name identification'},
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Building type classification'},
                {'uri': 'crm:P4_has_time-span', 'range': 'crm:E52_Time-Span', 'usage': 'Temporal extent (construction → present/demolition)'},
                # NOTE(review): P7_took_place_at appears to be defined for
                # periods/events in CIDOC-CRM — confirm it is intended on a site.
                {'uri': 'crm:P7_took_place_at', 'range': 'crm:E53_Place', 'usage': 'Geographic location'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Latitude/longitude coordinates'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'Physical postal address'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative hierarchy (city → province → country)'},
            ]
        },
        'custodian_ontology': {
            'condition': 'operates_as_heritage_institution',
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
                'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, KvK number, Wikidata Q-number'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational structure (departments, teams)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Link to curated collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Link custodian organization to physical site'},
            ]
        },
        'temporal_model': {
            'place_aspect': 'Construction date → Demolition/Present',
            'custodian_aspect': 'Founding date → Dissolution/Present (if operates as heritage institution)',
            'note': 'Place and custodian have INDEPENDENT temporal lifecycles',
        }
    },

    # Museums
    'museum': {
        'semantic_aspects': ['custodian_reference', 'place_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Museums are primarily custodian organizations, but also operate at physical locations with staff and collections.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Museum', 'namespace': 'http://schema.org/', 'label': 'Museum'},
            'tertiary_class': {'uri': 'crm:E39_Actor', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Actor'},
            'condition_public': 'legal_status == "public" OR "governmental" OR "municipal"',
            'condition_private': 'legal_status == "private" OR "foundation" OR "NGO"',
            'rdfs_comment': 'Institution collecting, preserving, and exhibiting cultural heritage objects',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code, Wikidata Q-number, VIAF ID'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name (CPOV uses SKOS)'},
                {'uri': 'skos:altLabel', 'range': 'rdfs:Literal', 'usage': 'Alternative names'},
                {'uri': 'cpov:hasUnit', 'range': 'cpov:PublicOrganisation', 'usage': 'Organizational units (departments, divisions)'},
                {'uri': 'crm:P147_curated', 'range': 'crm:E78_Curated_Holding', 'usage': 'Curated museum collection'},
                {'uri': 'schema:location', 'range': 'schema:Place', 'usage': 'Physical location/address'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Museum founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'crm:E78_Curated_Holding', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Curated Holding'},
            'rdfs_comment': 'Aggregations of physical things assembled and maintained by museum',
            'properties': [
                {'uri': 'crm:P147i_was_curated_by', 'range': 'crm:E39_Actor', 'usage': 'Curating organization'},
                {'uri': 'crm:P109_has_current_or_former_curator', 'range': 'crm:E39_Actor', 'usage': 'Individual curators'},
                {'uri': 'crm:P46_is_composed_of', 'range': 'crm:E18_Physical_Thing', 'usage': 'Individual objects in collection'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'secondary_class': {'uri': 'crm:E21_Person', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Person'},
            'rdfs_comment': 'Use PiCo for staff observations from archival sources',
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'Staff roles (curator, director, conservator)'},
                # NOTE(review): uses the 'sdo:' prefix while every other
                # schema.org term in this table uses 'schema:' — confirm intended.
                {'uri': 'sdo:worksFor', 'range': 'schema:Organization', 'usage': 'Employment relationship'},
                {'uri': 'crm:P14i_performed', 'range': 'crm:E7_Activity', 'usage': 'Activities performed (exhibitions, conservation)'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Founding date → Dissolution/Present',
            'collections_aspect': 'Accession dates (per object/collection)',
            'people_aspect': 'Employment periods (per person)',
        }
    },

    # Archives
    'archive': {
        'semantic_aspects': ['custodian_reference', 'collections_reference', 'people_reference'],
        'complexity_note': 'Archives are custodian organizations holding archival record sets.',
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:ArchiveOrganization', 'namespace': 'http://schema.org/', 'label': 'Archive Organization'},
            'tertiary_class': {'uri': 'rico:CorporateBody', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Corporate Body'},
            'rdfs_comment': 'Institution preserving archival records and providing access',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'rico:isOrWasHolderOf', 'range': 'rico:RecordSet', 'usage': 'Archival holdings'},
                {'uri': 'rico:manages', 'range': 'rico:Thing', 'usage': 'Archival resources managed'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Archive founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'rico:RecordSet', 'namespace': 'https://www.ica.org/standards/RiC/ontology#', 'label': 'Record Set'},
            'rdfs_comment': 'Archival record sets with provenance and access information',
            'properties': [
                {'uri': 'rico:isOrWasHeldBy', 'range': 'rico:Agent', 'usage': 'Archive holding the records'},
                {'uri': 'rico:hasProvenance', 'range': 'rico:Agent', 'usage': 'Original creator of records'},
                {'uri': 'rico:scopeAndContent', 'range': 'rdfs:Literal', 'usage': 'Description of archival holdings'},
                {'uri': 'rico:isAssociatedWithDate', 'range': 'rico:Date', 'usage': 'Temporal coverage (EDTF format)'},
            ]
        },
        'people_ontology': {
            'primary_class': {'uri': 'pico:PersonObservation', 'namespace': 'https://personsincontext.org/model#', 'label': 'Person Observation'},
            'properties': [
                {'uri': 'pico:hasRole', 'range': 'picot_roles:', 'usage': 'archivist, records manager, director'},
            ]
        },
        'temporal_model': {
            'custodian_aspect': 'Archive founding → Present/Closure',
            'collections_aspect': 'Accession dates + temporal coverage of records',
        }
    },

    # Libraries
    'library': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            'secondary_class': {'uri': 'schema:Library', 'namespace': 'http://schema.org/', 'label': 'Library'},
            'rdfs_comment': 'Institution providing access to bibliographic collections',
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'ISIL code'},
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'Library founding date'},
            ]
        },
        'collections_ontology': {
            'primary_class': {'uri': 'bf:Collection', 'namespace': 'http://id.loc.gov/ontologies/bibframe/', 'label': 'Collection'},
            'rdfs_comment': 'Bibliographic collections following BIBFRAME',
            'properties': [
                {'uri': 'bf:heldBy', 'range': 'bf:Agent', 'usage': 'Library holding the collection'},
                {'uri': 'bf:title', 'range': 'bf:Title', 'usage': 'Collection title'},
            ]
        }
    },

    # Organizations (generic)
    'organisation': {
        'semantic_aspects': ['custodian_reference'],
        'complexity_note': 'Generic organizations require classification into public/private and domain (cultural, research, etc.)',
        'custodian_ontology': {
            'public_sector': {
                'class': {'uri': 'cpov:PublicOrganisation', 'namespace': 'http://data.europa.eu/m8g/', 'label': 'Public Organisation'},
            },
            'private_sector': {
                'class': {'uri': 'schema:Organization', 'namespace': 'http://schema.org/', 'label': 'Organization'},
            },
            'properties': [
                {'uri': 'dct:identifier', 'range': 'rdfs:Literal', 'usage': 'Identifiers'},
                {'uri': 'skos:prefLabel', 'range': 'rdfs:Literal', 'usage': 'Preferred name'},
            ]
        }
    },

    # Companies (private sector)
    'company': {
        'semantic_aspects': ['custodian_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:Corporation', 'namespace': 'http://schema.org/', 'label': 'Corporation'},
            'secondary_class': {'uri': 'crm:E40_Legal_Body', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Legal Body'},
            'rdfs_comment': 'Corporate heritage collections (company archives, corporate museums)',
            'properties': [
                {'uri': 'schema:identifier', 'range': 'schema:PropertyValue', 'usage': 'Company registration numbers'},
                {'uri': 'schema:founder', 'range': 'schema:Person', 'usage': 'Company founder'},
            ]
        }
    },

    # Heritage sites
    'heritage site': {
        'semantic_aspects': ['place_reference', 'potential_custodian_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'secondary_class': {'uri': 'schema:LandmarksOrHistoricalBuildings', 'namespace': 'http://schema.org/', 'label': 'Landmarks'},
            'rdfs_comment': 'Heritage sites with cultural/historical significance',
            'properties': [
                {'uri': 'crm:P2_has_type', 'range': 'crm:E55_Type', 'usage': 'Heritage site type'},
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
            ]
        }
    },

    # Protected areas
    'protected area': {
        'semantic_aspects': ['place_reference'],
        'place_ontology': {
            'primary_class': {'uri': 'schema:Place', 'namespace': 'http://schema.org/', 'label': 'Place'},
            'secondary_class': {'uri': 'crm:E27_Site', 'namespace': 'http://www.cidoc-crm.org/cidoc-crm/', 'label': 'Site'},
            'rdfs_comment': 'Protected environmental/natural heritage areas',
            'properties': [
                {'uri': 'schema:geo', 'range': 'schema:GeoCoordinates', 'usage': 'Coordinates'},
                {'uri': 'schema:containedInPlace', 'range': 'schema:Place', 'usage': 'Administrative location'},
            ]
        }
    },

    # Universities
    'university': {
        'semantic_aspects': ['custodian_reference', 'collections_reference'],
        'custodian_ontology': {
            'primary_class': {'uri': 'schema:EducationalOrganization', 'namespace': 'http://schema.org/', 'label': 'Educational Organization'},
            'secondary_class': {'uri': 'schema:CollegeOrUniversity', 'namespace': 'http://schema.org/', 'label': 'College or University'},
            'rdfs_comment': 'Higher education institutions with heritage collections (university museums, archives, libraries)',
            'properties': [
                {'uri': 'schema:foundingDate', 'range': 'xsd:date', 'usage': 'University founding date'},
                {'uri': 'schema:address', 'range': 'schema:PostalAddress', 'usage': 'University address'},
            ]
        }
    },
}
|
|
|
|
def enrich_entry_with_ontology(entry, hypernyms, templates=None):
    """Build the ontology mapping for a single entry based on its hypernyms.

    The first hypernym that is a key of the template table wins; its
    sections are deep-copied into the result so that every entry owns an
    independent structure. Sharing the template objects by reference would
    make ``yaml.dump`` emit anchors/aliases for repeated objects and would
    let a later edit of one entry leak into the templates.

    Args:
        entry: Entry dict; ``entry['curated']['label']`` is recorded as
            ``wikidata_source``.
        hypernyms: Iterable of hypernym labels, tried in order. A bare
            string is treated as a single hypernym; ``None`` as no hypernyms.
        templates: Optional mapping of hypernym label to template dict.
            Defaults to the module-level ``ONTOLOGY_TEMPLATES``.

    Returns:
        A dict that always holds ``wikidata_source``, ``enrichment_date``
        (UTC, ISO 8601) and ``enriched_by``. When a template matches it also
        holds ``semantic_aspects``, ``complexity_note`` and whichever
        optional sections the template defines.

    Raises:
        KeyError: If ``entry`` has no ``curated`` mapping or no ``label``.
    """
    if templates is None:
        templates = ONTOLOGY_TEMPLATES

    # Iterating a bare string would test each character as a hypernym.
    if hypernyms is None:
        hypernyms = []
    elif isinstance(hypernyms, str):
        hypernyms = [hypernyms]

    ontology_mapping = {
        'wikidata_source': entry['curated']['label'],
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enriched_by': 'automated_ontology_mapper',
    }

    # Optional template sections, in the order they are written to the result.
    optional_sections = (
        'place_ontology',
        'custodian_ontology',
        'collections_ontology',
        'people_ontology',
        'temporal_model',
    )

    # Find the first matching template.
    for hypernym in hypernyms:
        if hypernym not in templates:
            continue
        template = templates[hypernym]

        ontology_mapping['semantic_aspects'] = copy.deepcopy(template.get('semantic_aspects', []))
        ontology_mapping['complexity_note'] = template.get('complexity_note', '')

        for section in optional_sections:
            if section in template:
                ontology_mapping[section] = copy.deepcopy(template[section])

        # Found a match, stop.
        break

    return ontology_mapping
|
|
|
|
def main():
    """Add an ``ontology_mapping`` to each curated entry and write the file back.

    Reads the curated YAML file, and for every entry under the top-level
    ``hypernym`` list that has a non-empty ``curated.hypernym`` value and no
    existing ``ontology_mapping``, stores the result of
    ``enrich_entry_with_ontology``. The updated document is then written
    back over the curated file.

    Exits with an error message when the curated file does not contain a
    top-level ``hypernym`` list, instead of failing with an opaque
    ``TypeError``/``KeyError``.
    """
    import sys  # local import keeps this block self-contained

    # Paths
    curated_file = Path('data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated.yaml')
    full_file = Path('data/wikidata/GLAMORCUBEPSXHFN/hyponyms_curated_full.yaml')

    print("Reading files...")
    with open(curated_file, 'r', encoding='utf-8') as f:
        curated_data = yaml.safe_load(f)

    # NOTE(review): the full file is parsed but its content is not used
    # anywhere below — confirm whether it is still needed.
    with open(full_file, 'r', encoding='utf-8') as f:
        full_data = yaml.safe_load(f)

    # safe_load returns None for an empty document; guard before indexing.
    entries = curated_data.get('hypernym') if isinstance(curated_data, dict) else None
    if not isinstance(entries, list):
        sys.exit(f"Error: {curated_file} has no top-level 'hypernym' list")

    print(f"Total entries: {len(entries)}")

    # Enrich entries
    enriched_count = 0
    for i, entry in enumerate(entries):
        if not isinstance(entry, dict):
            continue

        curated = entry.get('curated')
        if not isinstance(curated, dict):
            continue

        hypernyms = curated.get('hypernym', [])
        if not hypernyms:
            continue

        # Add ontology mapping (existing mappings are left untouched).
        if 'ontology_mapping' not in entry:
            entry['ontology_mapping'] = enrich_entry_with_ontology(entry, hypernyms)
            enriched_count += 1

        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1} entries... (enriched: {enriched_count})")

    print(f"\nEnrichment complete: {enriched_count} entries enriched")

    # Write back to curated file. Dump to a sibling temp file first and
    # rename it over the target, so a failure during serialization cannot
    # leave the curated file truncated.
    print(f"Writing to {curated_file}...")
    tmp_file = curated_file.with_name(curated_file.name + '.tmp')
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            yaml.dump(curated_data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    except Exception:
        if tmp_file.exists():
            tmp_file.unlink()
        raise
    tmp_file.replace(curated_file)

    print("Done!")


if __name__ == '__main__':
    main()
|