#!/usr/bin/env python3
"""
Batch Apply CH-Annotator Convention to GLAM Datasets

This script applies formal CH-Annotator (ch_annotator-v1_7_0) provenance
metadata to heritage institution datasets across multiple countries.

CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings

Usage:
    # Process all discovered datasets
    python scripts/apply_ch_annotator_batch.py

    # Process specific files
    python scripts/apply_ch_annotator_batch.py --files data/instances/egypt_institutions_ghcid.yaml

    # Dry run (show what would be processed)
    python scripts/apply_ch_annotator_batch.py --dry-run

Author: OpenCode (Claude Sonnet 4)
Date: 2025-12-06
Convention: ch_annotator-v1_7_0
"""

import argparse
import shutil
from datetime import datetime, timezone
from pathlib import Path

import yaml

# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================

# Map GLAMORCUBESFIXPHDNT institution_type to CH-Annotator hypernym codes
INSTITUTION_TYPE_TO_HYPERNYM = {
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    "FEATURES": "TOP.MON",  # Monuments are toponym-based
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}

# Ontology class mappings for each hypernym
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}

# Hypernym labels
HYPERNYM_LABELS = {
    "GRP": "GROUP",
    "TOP": "TOPONYM",
    "AGT": "AGENT",
}

# =============================================================================
# DATASET DISCOVERY
# =============================================================================

# Primary datasets to process (main YAML files with GHCID-enriched data)
# Excludes: backups, archives, cache files, test files
PRIMARY_DATASETS = [
    # North Africa
    "data/instances/algeria/algerian_institutions_ghcid.yaml",
    "data/instances/egypt_institutions_ghcid.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/morocco/moroccan_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    # Europe
    "data/instances/austria_complete.yaml",
    "data/instances/belarus_complete.yaml",
    "data/instances/belgium_complete.yaml",
    "data/instances/bulgaria_complete.yaml",
    "data/instances/czech_unified.yaml",
    "data/instances/denmark_libraries.yaml",
    "data/instances/netherlands_complete.yaml",
    "data/instances/norway/city_archives.yaml",
    "data/instances/norway/county_archives.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/switzerland_isil.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/great_britain/gb_institutions_enriched_manual.yaml",
    "data/instances/italy/it_institutions_enriched_manual.yaml",
    # Asia
    "data/instances/japan_complete.yaml",
    "data/instances/vietnamese_glam_institutions.yaml",
    "data/instances/palestinian_heritage_custodians.yaml",
    # Americas
    "data/instances/argentina_complete.yaml",
    "data/instances/latin_american_institutions_AUTHORITATIVE.yaml",
    "data/instances/mexico/mexican_institutions_curated.yaml",
    "data/instances/united_states/us_institutions_enriched_manual.yaml",
]


def discover_datasets(base_path: Path) -> list[Path]:
    """Discover all primary datasets that need CH-Annotator application.

    Resolves each entry of ``PRIMARY_DATASETS`` against *base_path*; entries
    that do not exist on disk are reported and skipped.
    """
    datasets = []
    for rel_path in PRIMARY_DATASETS:
        full_path = base_path / rel_path
        if full_path.exists():
            datasets.append(full_path)
        else:
            print(f"  [SKIP] Not found: {rel_path}")
    return datasets


def already_has_ch_annotator(data: list | dict) -> bool:
    """Check if dataset already has CH-Annotator annotations.

    Heuristic: only the first list item is inspected, on the assumption that
    annotation is applied to a whole file at once.
    """
    if isinstance(data, list) and len(data) > 0:
        first_item = data[0]
        return isinstance(first_item, dict) and "ch_annotator" in first_item
    return False


# =============================================================================
# CH-ANNOTATOR APPLICATION
# =============================================================================

def get_hypernym_code(institution_type: str) -> str:
    """Get CH-Annotator hypernym code from institution type.

    Unknown types fall back to the generic heritage code "GRP.HER".
    """
    return INSTITUTION_TYPE_TO_HYPERNYM.get(institution_type, "GRP.HER")


def get_ontology_class(hypernym_code: str) -> str:
    """Get ontology class URI from hypernym code (default: glam:HeritageCustodian)."""
    return HYPERNYM_ONTOLOGY_CLASS.get(hypernym_code, "glam:HeritageCustodian")


def get_hypernym_label(hypernym_code: str) -> str:
    """Get hypernym label from code (first dot-separated component).

    E.g. "GRP.HER.MUS" -> "GROUP". Unrecognized prefixes default to "GROUP".
    """
    prefix = hypernym_code.split(".")[0] if "." in hypernym_code else hypernym_code
    return HYPERNYM_LABELS.get(prefix, "GROUP")


def extract_conversation_id(institution: dict) -> str:
    """Extract conversation ID from provenance if available.

    NOTE: not called elsewhere in this module (the lookup is inlined in
    create_ch_annotator_block); kept for external callers.
    """
    provenance = institution.get("provenance", {})
    return provenance.get("conversation_id", "unknown")


def create_ch_annotator_block(institution: dict, annotation_date: str, source_file: str) -> dict:
    """Create CH-Annotator entity annotation block for an institution.

    Args:
        institution: Raw institution record (dict) from the source YAML.
        annotation_date: ISO-8601 timestamp of this batch run.
        source_file: Name of the file the record was read from (used as a
            provenance fallback when no conversation ID is recorded).

    Returns:
        The ``ch_annotator`` annotation dict (classification, extraction
        provenance, annotation provenance, and annotation metadata).
    """
    # Handle alternative field names for institution type
    institution_type = (
        institution.get("institution_type")
        or institution.get("custodian_type")
        or "UNKNOWN"
    )

    hypernym_code = get_hypernym_code(institution_type)
    ontology_class = get_ontology_class(hypernym_code)
    hypernym_label = get_hypernym_label(hypernym_code)

    # Extract provenance info
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_date = provenance.get("extraction_date", "unknown")
    confidence_score = provenance.get("confidence_score", 0.85)

    # Determine original extraction agent.
    # For conversation-based extractions, we use a generic identifier
    # since Claude conversation exports don't specify model version.
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")

    # Build entity annotation block
    entity_annotation = {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        # Entity classification
        "entity_classification": {
            "hypernym": hypernym_code.split(".")[0] if "." in hypernym_code else hypernym_code,
            "hypernym_label": hypernym_label,
            "subtype": hypernym_code,
            "subtype_label": institution_type,
            "ontology_class": ontology_class,
            # rov:RegisteredOrganization only applies when the record carries
            # at least one external identifier.
            "alternative_classes": [
                c for c in [
                    "org:FormalOrganization",
                    "rov:RegisteredOrganization" if institution.get("identifiers") else None,
                    "glam:HeritageCustodian",
                ] if c
            ],
        },
        # Extraction provenance (5-component model) - reflects ORIGINAL extraction
        "extraction_provenance": {
            "namespace": "glam",
            "path": f"/conversations/{conversation_id}" if conversation_id != "unknown" else f"/files/{source_file}",
            "timestamp": extraction_date,
            "agent": extraction_agent,
            "context_convention": "ch_annotator-v1_7_0",
        },
        # CH-Annotator application provenance (separate from original extraction)
        "annotation_provenance": {
            "annotation_agent": "opencode-claude-sonnet-4",
            "annotation_date": annotation_date,
            "annotation_method": "retroactive CH-Annotator application via batch script",
            "source_file": source_file,
        },
        # Confidence and verification
        "annotation_metadata": {
            "confidence_score": confidence_score,
            "verified": False,
            "verification_date": None,
            "verified_by": None,
        },
    }
    return entity_annotation


def create_entity_claims(institution: dict, source_file: str) -> list:
    """Create CH-Annotator claims for key entity attributes.

    Emits up to five claims per institution: name, type, city, Wikidata ID,
    and GHCID — each with a copy of the 5-component provenance block.
    """
    claims = []

    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_date = provenance.get("extraction_date", institution.get("created", "unknown"))
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")

    # Base provenance for all claims
    base_provenance = {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}" if conversation_id != "unknown" else f"/files/{source_file}",
        "timestamp": extraction_date,
        "agent": extraction_agent,
        "context_convention": "ch_annotator-v1_7_0",
    }

    # Claim 1: Institution name (handle alternative field names)
    name = institution.get("name") or institution.get("preferred_label")
    if name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": name,
            "property_uri": "skos:prefLabel",
            "provenance": base_provenance.copy(),
            "confidence": provenance.get("confidence_score", 0.9),
        })

    # Claim 2: Institution type (handle alternative field names)
    inst_type = institution.get("institution_type") or institution.get("custodian_type")
    if inst_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": inst_type,
            "property_uri": "rdf:type",
            "provenance": base_provenance.copy(),
            "confidence": 0.95,
        })

    # Claim 3: Location (city) - handle different location structures
    locations = institution.get("locations", [])
    place_designation = institution.get("place_designation", {})
    city = None
    if locations and isinstance(locations, list) and len(locations) > 0:
        first_loc = locations[0]
        if isinstance(first_loc, dict):
            city = first_loc.get("city")
    elif place_designation:
        # Handle Palestinian schema structure
        city = place_designation.get("place_name")
        if not city:
            settlement = place_designation.get("settlement", {})
            city = settlement.get("settlement_name")
    if city:
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": city,
            "property_uri": "schema:addressLocality",
            "provenance": base_provenance.copy(),
            "confidence": 0.9,
        })

    # Claim 4: Wikidata ID (if present) — only the first Wikidata identifier
    identifiers = institution.get("identifiers", [])
    if identifiers and isinstance(identifiers, list):
        for ident in identifiers:
            if isinstance(ident, dict) and ident.get("identifier_scheme") == "Wikidata":
                claims.append({
                    "claim_type": "wikidata_id",
                    "claim_value": ident["identifier_value"],
                    "property_uri": "owl:sameAs",
                    "provenance": {
                        **base_provenance,
                        "namespace": "wikidata",
                        "path": f"/entity/{ident['identifier_value']}",
                    },
                    "confidence": 0.98,
                })
                break

    # Claim 5: GHCID (if present)
    ghcid = institution.get("ghcid")
    if ghcid:
        claims.append({
            "claim_type": "ghcid",
            "claim_value": ghcid,
            "property_uri": "glam:ghcid",
            "provenance": base_provenance.copy(),
            "confidence": 1.0,  # GHCID is deterministically generated
        })

    return claims


def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply CH-Annotator convention to a dataset.

    Reads *input_path* (YAML), annotates every institution record in place,
    and writes the result (with a provenance header comment) to *output_path*.

    Returns a stats dict; on failure it carries an "error" key, and on a
    no-op it carries "skipped": True.
    """
    # Load existing data
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Handle YAML files that may have header comments
    data = yaml.safe_load(content)
    if data is None:
        return {"error": "Empty or invalid YAML file", "total_institutions": 0}

    # Handle different data structures
    if isinstance(data, dict):
        # Some files have a root key containing the list.
        # Try to find the list of institutions; annotations mutate the list
        # items in place, so dumping `data` below picks them up.
        for key in ["institutions", "entries", "records", "data", "custodians"]:
            if key in data and isinstance(data[key], list):
                institutions = data[key]
                break
        else:
            # Single institution or unknown structure
            return {"error": "Unknown data structure (dict without institution list)", "total_institutions": 0}
    elif isinstance(data, list):
        institutions = data
    else:
        return {"error": f"Unexpected data type: {type(data)}", "total_institutions": 0}

    if not institutions:
        return {"error": "No institutions found", "total_institutions": 0}

    # Check if already annotated
    if already_has_ch_annotator(institutions):
        return {"skipped": True, "reason": "Already has CH-Annotator annotations", "total_institutions": len(institutions)}

    annotation_date = datetime.now(timezone.utc).isoformat()
    source_file = input_path.name

    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
        "skipped": False,
    }

    # Process each institution
    for institution in institutions:
        if not isinstance(institution, dict):
            continue
        stats["total_institutions"] += 1

        # Add CH-Annotator entity annotation block
        ch_annotation = create_ch_annotator_block(institution, annotation_date, source_file)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1

        # Track hypernym distribution
        hypernym = ch_annotation["entity_classification"]["subtype"]
        stats["by_hypernym"][hypernym] = stats["by_hypernym"].get(hypernym, 0) + 1

        # Add entity claims
        claims = create_entity_claims(institution, source_file)
        institution["ch_annotator"]["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Write updated data
    with open(output_path, 'w', encoding='utf-8') as f:
        # Add header comment
        header = f"""# Heritage Institutions - CH-Annotator Enhanced
# Source file: {input_path.name}
# Last updated: {annotation_date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers, GHCID)
#
# Statistics:
# - Total institutions: {stats['total_institutions']}
# - Annotations added: {stats['annotations_added']}
# - Entity claims created: {stats['claims_created']}
# - Hypernym distribution: {', '.join(f'{k}: {v}' for k, v in sorted(stats['by_hypernym'].items()))}
#
# Annotation Agent: opencode-claude-sonnet-4
# Annotation Date: {annotation_date}
"""
        f.write(header)
        yaml.dump(institutions if isinstance(data, list) else data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats


# =============================================================================
# MAIN ENTRY POINT
# =============================================================================

def _display_path(path: Path, base: Path) -> Path:
    """Return *path* relative to *base* for display, or *path* itself when it
    lies outside *base* (avoids ValueError from Path.relative_to)."""
    try:
        return path.relative_to(base)
    except ValueError:
        return path


def main():
    parser = argparse.ArgumentParser(
        description="Batch apply CH-Annotator convention to GLAM datasets"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (default: discover all primary datasets)"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making changes"
    )
    parser.add_argument(
        "--output-suffix",
        default="_ch_annotator",
        help="Suffix for output files (default: _ch_annotator)"
    )
    parser.add_argument(
        "--in-place",
        action="store_true",
        help="Modify files in place (creates backup with .pre_ch_annotator suffix)"
    )
    # Previously hard-coded; exposed as an option (same default) so the
    # script is usable outside the original author's machine.
    parser.add_argument(
        "--base-path",
        default="/Users/kempersc/apps/glam",
        help="Repository root used to resolve relative dataset paths"
    )
    args = parser.parse_args()

    base_path = Path(args.base_path)

    print("=" * 70)
    print("CH-Annotator Batch Application")
    print("=" * 70)
    print("Convention: ch_annotator-v1_7_0")
    print("Annotation Agent: opencode-claude-sonnet-4")
    print()

    # Discover or use specified files
    if args.files:
        datasets = []
        for f in args.files:
            p = Path(f)
            if not p.is_absolute():
                p = base_path / p
            if p.exists():
                datasets.append(p)
            else:
                print(f"  [SKIP] File not found: {f}")
    else:
        print("Discovering datasets...")
        datasets = discover_datasets(base_path)

    print(f"Found {len(datasets)} datasets to process")
    print()

    if args.dry_run:
        print("[DRY RUN] Would process:")
        for ds in datasets:
            print(f"  - {_display_path(ds, base_path)}")
        return

    # Process each dataset
    total_stats = {
        "files_processed": 0,
        "files_skipped": 0,
        "files_failed": 0,
        "total_institutions": 0,
        "total_claims": 0,
        "by_hypernym": {},
    }

    for dataset in datasets:
        rel_path = _display_path(dataset, base_path)
        print(f"Processing: {rel_path}")

        # Determine output path
        if args.in_place:
            output_path = dataset
            # Create backup
            backup_path = dataset.with_suffix(f".pre_ch_annotator{dataset.suffix}")
            shutil.copy2(dataset, backup_path)
        else:
            # removesuffix (not replace) so only a trailing "_ghcid" is dropped
            stem = dataset.stem.removesuffix("_ghcid")
            output_path = dataset.parent / f"{stem}{args.output_suffix}.yaml"

        try:
            stats = apply_ch_annotator(dataset, output_path)
            if stats.get("skipped"):
                print(f"  [SKIP] {stats.get('reason', 'Unknown reason')}")
                total_stats["files_skipped"] += 1
            elif stats.get("error"):
                print(f"  [ERROR] {stats['error']}")
                total_stats["files_failed"] += 1
            else:
                print(f"  [OK] {stats['total_institutions']} institutions, {stats['claims_created']} claims")
                print(f"       Output: {_display_path(output_path, base_path)}")
                total_stats["files_processed"] += 1
                total_stats["total_institutions"] += stats["total_institutions"]
                total_stats["total_claims"] += stats["claims_created"]
                # Merge hypernym stats
                for h, c in stats.get("by_hypernym", {}).items():
                    total_stats["by_hypernym"][h] = total_stats["by_hypernym"].get(h, 0) + c
        except Exception as e:
            print(f"  [ERROR] {e}")
            total_stats["files_failed"] += 1

    # Print summary
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"Files skipped: {total_stats['files_skipped']}")
    print(f"Files failed: {total_stats['files_failed']}")
    print(f"Total institutions: {total_stats['total_institutions']}")
    print(f"Total claims: {total_stats['total_claims']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(total_stats["by_hypernym"].items()):
        print(f"  {hypernym}: {count}")
    print("=" * 70)


if __name__ == "__main__":
    main()