glam/scripts/apply_ch_annotator_batch.py
2025-12-07 00:26:01 +01:00

595 lines
21 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Batch Apply CH-Annotator Convention to GLAM Datasets
This script applies formal CH-Annotator (ch_annotator-v1_7_0) provenance metadata
to heritage institution datasets across multiple countries.
CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings
Usage:
# Process all discovered datasets
python scripts/apply_ch_annotator_batch.py
# Process specific files
python scripts/apply_ch_annotator_batch.py --files data/instances/egypt_institutions_ghcid.yaml
# Dry run (show what would be processed)
python scripts/apply_ch_annotator_batch.py --dry-run
Author: OpenCode (Claude Sonnet 4)
Date: 2025-12-06
Convention: ch_annotator-v1_7_0
"""
import argparse
import json
import shutil
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

import yaml
# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================
# Map GLAMORCUBESFIXPHDNT institution_type to CH-Annotator hypernym codes.
# Unmapped types fall back to the generic heritage group "GRP.HER"
# (see get_hypernym_code below).
INSTITUTION_TYPE_TO_HYPERNYM = {
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    "FEATURES": "TOP.MON",  # Monuments are toponym-based
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}
# Ontology class mappings for each hypernym.
# Mixes Schema.org, W3C Org, CIDOC-CRM and project-local "glam:" URIs;
# unmapped codes fall back to glam:HeritageCustodian (see get_ontology_class).
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}
# Hypernym labels, keyed by the leading code segment (before the first ".").
# NOTE(review): "AGT" is never produced by the mappings above — presumably
# reserved for person/agent entities; confirm against the convention spec.
HYPERNYM_LABELS = {
    "GRP": "GROUP",
    "TOP": "TOPONYM",
    "AGT": "AGENT",
}
# =============================================================================
# DATASET DISCOVERY
# =============================================================================
# Primary datasets to process (main YAML files with GHCID-enriched data).
# Paths are relative to the repository base path resolved in main().
# Excludes: backups, archives, cache files, test files.
PRIMARY_DATASETS = [
    # North Africa
    "data/instances/algeria/algerian_institutions_ghcid.yaml",
    "data/instances/egypt_institutions_ghcid.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/morocco/moroccan_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    # Europe
    "data/instances/austria_complete.yaml",
    "data/instances/belarus_complete.yaml",
    "data/instances/belgium_complete.yaml",
    "data/instances/bulgaria_complete.yaml",
    "data/instances/czech_unified.yaml",
    "data/instances/denmark_libraries.yaml",
    "data/instances/netherlands_complete.yaml",
    "data/instances/norway/city_archives.yaml",
    "data/instances/norway/county_archives.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/switzerland_isil.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/great_britain/gb_institutions_enriched_manual.yaml",
    "data/instances/italy/it_institutions_enriched_manual.yaml",
    # Asia
    "data/instances/japan_complete.yaml",
    "data/instances/vietnamese_glam_institutions.yaml",
    "data/instances/palestinian_heritage_custodians.yaml",
    # Americas
    "data/instances/argentina_complete.yaml",
    "data/instances/latin_american_institutions_AUTHORITATIVE.yaml",
    "data/instances/mexico/mexican_institutions_curated.yaml",
    "data/instances/united_states/us_institutions_enriched_manual.yaml",
]
def discover_datasets(base_path: Path) -> list[Path]:
    """Return the PRIMARY_DATASETS files under *base_path* that exist.

    Entries that cannot be found on disk are reported to stdout and
    omitted from the result.
    """
    found: list[Path] = []
    for relative in PRIMARY_DATASETS:
        candidate = base_path / relative
        if candidate.exists():
            found.append(candidate)
        else:
            print(f" [SKIP] Not found: {relative}")
    return found
def already_has_ch_annotator(data: list | dict) -> bool:
"""Check if dataset already has CH-Annotator annotations."""
if isinstance(data, list) and len(data) > 0:
first_item = data[0]
return isinstance(first_item, dict) and "ch_annotator" in first_item
return False
# =============================================================================
# CH-ANNOTATOR APPLICATION
# =============================================================================
def get_hypernym_code(institution_type: str) -> str:
    """Translate an institution type into its CH-Annotator hypernym code.

    Types without an entry in INSTITUTION_TYPE_TO_HYPERNYM fall back to
    the generic heritage group code "GRP.HER".
    """
    try:
        return INSTITUTION_TYPE_TO_HYPERNYM[institution_type]
    except KeyError:
        return "GRP.HER"
def get_ontology_class(hypernym_code: str) -> str:
    """Resolve the ontology class URI mapped to *hypernym_code*.

    Codes absent from HYPERNYM_ONTOLOGY_CLASS fall back to the
    project-local glam:HeritageCustodian class.
    """
    fallback = "glam:HeritageCustodian"
    return HYPERNYM_ONTOLOGY_CLASS.get(hypernym_code, fallback)
def get_hypernym_label(hypernym_code: str) -> str:
    """Return the label for the code's leading segment.

    E.g. "GRP.HER.MUS" -> "GROUP". Codes without a dot are used whole;
    unknown prefixes default to "GROUP".
    """
    # str.partition yields the full string as the head when no "." exists,
    # so dotted and undotted codes are handled uniformly.
    prefix, _, _ = hypernym_code.partition(".")
    return HYPERNYM_LABELS.get(prefix, "GROUP")
def extract_conversation_id(institution: dict) -> str:
    """Pull the conversation ID out of an institution's provenance block.

    Returns "unknown" when the record carries no provenance or no
    conversation_id field.
    """
    return institution.get("provenance", {}).get("conversation_id", "unknown")
def create_ch_annotator_block(institution: dict, annotation_date: str, source_file: str) -> dict:
    """Build the CH-Annotator entity annotation block for one institution.

    The block bundles four sections: entity classification (hypernym codes
    and ontology classes), the ORIGINAL extraction provenance taken from the
    record, the provenance of this retroactive annotation run, and
    confidence/verification metadata.

    Args:
        institution: Raw institution record; its type may live under either
            "institution_type" or "custodian_type".
        annotation_date: ISO timestamp of this annotation run.
        source_file: Name of the YAML file the record came from; used as the
            provenance path when no conversation ID is recorded.
    """
    # Institution type may live under either field name depending on schema.
    institution_type = (
        institution.get("institution_type")
        or institution.get("custodian_type")
        or "UNKNOWN"
    )
    hypernym_code = get_hypernym_code(institution_type)

    # Original extraction provenance, as recorded on the record (if any).
    # Claude conversation exports don't specify a model version, so the
    # agent defaults to a generic "claude-conversation" identifier.
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")

    if conversation_id != "unknown":
        provenance_path = f"/conversations/{conversation_id}"
    else:
        provenance_path = f"/files/{source_file}"

    # Alternative ontology classes; rov:RegisteredOrganization only applies
    # when the record carries external identifiers.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    classification = {
        "hypernym": hypernym_code.partition(".")[0],
        "hypernym_label": get_hypernym_label(hypernym_code),
        "subtype": hypernym_code,
        "subtype_label": institution_type,
        "ontology_class": get_ontology_class(hypernym_code),
        "alternative_classes": alternative_classes,
    }

    # Extraction provenance (5-component model) - reflects ORIGINAL extraction.
    extraction_provenance = {
        "namespace": "glam",
        "path": provenance_path,
        "timestamp": provenance.get("extraction_date", "unknown"),
        "agent": extraction_agent,
        "context_convention": "ch_annotator-v1_7_0",
    }

    # CH-Annotator application provenance (separate from original extraction).
    annotation_provenance = {
        "annotation_agent": "opencode-claude-sonnet-4",
        "annotation_date": annotation_date,
        "annotation_method": "retroactive CH-Annotator application via batch script",
        "source_file": source_file,
    }

    annotation_metadata = {
        "confidence_score": provenance.get("confidence_score", 0.85),
        "verified": False,
        "verification_date": None,
        "verified_by": None,
    }

    return {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        "entity_classification": classification,
        "extraction_provenance": extraction_provenance,
        "annotation_provenance": annotation_provenance,
        "annotation_metadata": annotation_metadata,
    }
def create_entity_claims(institution: dict, source_file: str) -> list:
    """Create CH-Annotator claims for key entity attributes.

    Emits up to five claims per institution -- name, institution type,
    city, Wikidata ID and GHCID -- each carrying a copy of the
    5-component provenance model (namespace, path, timestamp, agent,
    context_convention). Attributes absent from the record are skipped.

    Args:
        institution: Raw institution record (field names vary by country
            schema; alternatives are handled below).
        source_file: Name of the YAML file the record came from; used as
            the provenance path when no conversation ID is recorded.

    Returns:
        List of claim dicts (possibly empty).
    """
    claims = []
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "unknown")
    extraction_date = provenance.get("extraction_date", institution.get("created", "unknown"))
    extraction_agent = provenance.get("extraction_agent", "claude-conversation")
    # Base provenance for all claims (copied per claim so later edits to one
    # claim cannot leak into the others).
    base_provenance = {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}" if conversation_id != "unknown" else f"/files/{source_file}",
        "timestamp": extraction_date,
        "agent": extraction_agent,
        "context_convention": "ch_annotator-v1_7_0",
    }
    # Claim 1: Institution name (handle alternative field names)
    name = institution.get("name") or institution.get("preferred_label")
    if name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": name,
            "property_uri": "skos:prefLabel",
            "provenance": base_provenance.copy(),
            "confidence": provenance.get("confidence_score", 0.9),
        })
    # Claim 2: Institution type (handle alternative field names)
    inst_type = institution.get("institution_type") or institution.get("custodian_type")
    if inst_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": inst_type,
            "property_uri": "rdf:type",
            "provenance": base_provenance.copy(),
            "confidence": 0.95,
        })
    # Claim 3: Location (city) - handle different location structures
    locations = institution.get("locations", [])
    place_designation = institution.get("place_designation", {})
    city = None
    if locations and isinstance(locations, list):
        first_loc = locations[0]
        if isinstance(first_loc, dict):
            city = first_loc.get("city")
    elif place_designation:
        # Handle Palestinian schema structure
        city = place_designation.get("place_name")
        if not city:
            settlement = place_designation.get("settlement", {})
            city = settlement.get("settlement_name")
    if city:
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": city,
            "property_uri": "schema:addressLocality",
            "provenance": base_provenance.copy(),
            "confidence": 0.9,
        })
    # Claim 4: Wikidata ID (if present). Use .get() for the value so a
    # malformed entry (scheme recorded without a value) is skipped instead
    # of raising KeyError; only the first usable Wikidata entry is claimed.
    identifiers = institution.get("identifiers", [])
    if isinstance(identifiers, list):
        for ident in identifiers:
            if not isinstance(ident, dict):
                continue
            if ident.get("identifier_scheme") != "Wikidata":
                continue
            wikidata_id = ident.get("identifier_value")
            if not wikidata_id:
                continue  # malformed entry: keep scanning for a usable one
            claims.append({
                "claim_type": "wikidata_id",
                "claim_value": wikidata_id,
                "property_uri": "owl:sameAs",
                "provenance": {
                    **base_provenance,
                    "namespace": "wikidata",
                    "path": f"/entity/{wikidata_id}",
                },
                "confidence": 0.98,
            })
            break
    # Claim 5: GHCID (if present)
    ghcid = institution.get("ghcid")
    if ghcid:
        claims.append({
            "claim_type": "ghcid",
            "claim_value": ghcid,
            "property_uri": "glam:ghcid",
            "provenance": base_provenance.copy(),
            "confidence": 1.0,  # GHCID is deterministically generated
        })
    return claims
def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply the CH-Annotator convention to one dataset file.

    Loads the YAML at *input_path*, attaches a "ch_annotator" block plus
    entity claims to every institution record (mutating the records in
    place), and writes the annotated data to *output_path* behind a
    documentation header.

    Returns:
        Stats dict with counts per file. Special keys:
        "error" -- set when the file could not be processed;
        "skipped" -- True when the file already carries annotations.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # safe_load tolerates the leading "#" header comments some files carry.
    data = yaml.safe_load(content)
    if data is None:
        return {"error": "Empty or invalid YAML file", "total_institutions": 0}

    # Locate the institution list: either the document root is the list
    # itself, or it hangs off one of a few known root keys.
    if isinstance(data, dict):
        for key in ("institutions", "entries", "records", "data", "custodians"):
            if isinstance(data.get(key), list):
                institutions = data[key]
                break
        else:
            return {"error": "Unknown data structure (dict without institution list)", "total_institutions": 0}
    elif isinstance(data, list):
        institutions = data
    else:
        return {"error": f"Unexpected data type: {type(data)}", "total_institutions": 0}

    if not institutions:
        return {"error": "No institutions found", "total_institutions": 0}
    # Idempotency guard: never re-annotate an already-annotated file.
    if already_has_ch_annotator(institutions):
        return {"skipped": True, "reason": "Already has CH-Annotator annotations", "total_institutions": len(institutions)}

    annotation_date = datetime.now(timezone.utc).isoformat()
    source_file = input_path.name
    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
        "skipped": False,
    }

    # Annotate each institution record in place.
    for institution in institutions:
        if not isinstance(institution, dict):
            continue
        stats["total_institutions"] += 1
        ch_annotation = create_ch_annotator_block(institution, annotation_date, source_file)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1
        # Track hypernym distribution for the header and the run summary.
        hypernym = ch_annotation["entity_classification"]["subtype"]
        stats["by_hypernym"][hypernym] = stats["by_hypernym"].get(hypernym, 0) + 1
        claims = create_entity_claims(institution, source_file)
        institution["ch_annotator"]["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Because the records were annotated in place, dumping the original
    # top-level object preserves any root key the source file had.
    header = f"""# Heritage Institutions - CH-Annotator Enhanced
# Source file: {input_path.name}
# Last updated: {annotation_date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers, GHCID)
#
# Statistics:
# - Total institutions: {stats['total_institutions']}
# - Annotations added: {stats['annotations_added']}
# - Entity claims created: {stats['claims_created']}
# - Hypernym distribution: {', '.join(f'{k}: {v}' for k, v in sorted(stats['by_hypernym'].items()))}
#
# Annotation Agent: opencode-claude-sonnet-4
# Annotation Date: {annotation_date}
"""
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(data, f,
                  default_flow_style=False, allow_unicode=True, sort_keys=False)
    return stats
# =============================================================================
# MAIN ENTRY POINT
# =============================================================================
def main():
    """CLI entry point: resolve the dataset list, annotate each file, and
    print a per-file log plus an aggregate summary."""
    parser = argparse.ArgumentParser(
        description="Batch apply CH-Annotator convention to GLAM datasets"
    )
    parser.add_argument(
        "--files",
        nargs="+",
        help="Specific files to process (default: discover all primary datasets)"
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be processed without making changes"
    )
    parser.add_argument(
        "--output-suffix",
        default="_ch_annotator",
        help="Suffix for output files (default: _ch_annotator)"
    )
    parser.add_argument(
        "--in-place",
        action="store_true",
        help="Modify files in place (creates backup with .pre_ch_annotator suffix)"
    )
    # Previously hard-coded; kept as the default for backward compatibility
    # but now overridable so the script runs on other machines.
    parser.add_argument(
        "--base-path",
        default="/Users/kempersc/apps/glam",
        help="Repository root used to resolve relative dataset paths"
    )
    args = parser.parse_args()
    base_path = Path(args.base_path)

    def rel(p: Path) -> Path:
        """Display helper: show paths relative to base_path when possible.

        Paths outside base_path (e.g. absolute --files arguments) previously
        crashed relative_to(); show them unchanged instead.
        """
        try:
            return p.relative_to(base_path)
        except ValueError:
            return p

    print("=" * 70)
    print("CH-Annotator Batch Application")
    print("=" * 70)
    print("Convention: ch_annotator-v1_7_0")
    print("Annotation Agent: opencode-claude-sonnet-4")
    print()
    # Resolve the work list: explicit --files, or the curated primary set.
    if args.files:
        datasets = []
        for f in args.files:
            p = Path(f)
            if not p.is_absolute():
                p = base_path / p
            if p.exists():
                datasets.append(p)
            else:
                print(f" [SKIP] File not found: {f}")
    else:
        print("Discovering datasets...")
        datasets = discover_datasets(base_path)
    print(f"Found {len(datasets)} datasets to process")
    print()
    if args.dry_run:
        print("[DRY RUN] Would process:")
        for ds in datasets:
            print(f" - {rel(ds)}")
        return
    # Aggregate counters across all files.
    total_stats = {
        "files_processed": 0,
        "files_skipped": 0,
        "files_failed": 0,
        "total_institutions": 0,
        "total_claims": 0,
        "by_hypernym": {},
    }
    for dataset in datasets:
        print(f"Processing: {rel(dataset)}")
        # Determine output path.
        if args.in_place:
            output_path = dataset
            # Keep a pristine copy next to the original before overwriting.
            backup_path = dataset.with_suffix(f".pre_ch_annotator{dataset.suffix}")
            shutil.copy2(dataset, backup_path)
        else:
            # removesuffix (not replace) so only a TRAILING "_ghcid" is
            # dropped; replace() would also strip interior occurrences.
            stem = dataset.stem.removesuffix("_ghcid")
            output_path = dataset.parent / f"{stem}{args.output_suffix}.yaml"
        try:
            stats = apply_ch_annotator(dataset, output_path)
            if stats.get("skipped"):
                print(f" [SKIP] {stats.get('reason', 'Unknown reason')}")
                total_stats["files_skipped"] += 1
            elif stats.get("error"):
                print(f" [ERROR] {stats['error']}")
                total_stats["files_failed"] += 1
            else:
                print(f" [OK] {stats['total_institutions']} institutions, {stats['claims_created']} claims")
                print(f" Output: {rel(output_path)}")
                total_stats["files_processed"] += 1
                total_stats["total_institutions"] += stats["total_institutions"]
                total_stats["total_claims"] += stats["claims_created"]
                # Merge per-file hypernym stats into the run totals.
                for h, c in stats.get("by_hypernym", {}).items():
                    total_stats["by_hypernym"][h] = total_stats["by_hypernym"].get(h, 0) + c
        except Exception as e:
            # One bad file must not abort the whole batch; count and continue.
            print(f" [ERROR] {e}")
            total_stats["files_failed"] += 1
    # Print summary.
    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Files processed: {total_stats['files_processed']}")
    print(f"Files skipped: {total_stats['files_skipped']}")
    print(f"Files failed: {total_stats['files_failed']}")
    print(f"Total institutions: {total_stats['total_institutions']}")
    print(f"Total claims: {total_stats['total_claims']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(total_stats["by_hypernym"].items()):
        print(f" {hypernym}: {count}")
    print("=" * 70)
# Run the batch annotator only when executed as a script (allows import
# of the mappings and helpers without side effects).
if __name__ == "__main__":
    main()