#!/usr/bin/env python3
"""
Apply CH-Annotator Convention to Algeria GLAM Dataset

This script retroactively adds formal CH-Annotator (ch_annotator-v1_7_0)
provenance metadata to the Algerian heritage institutions dataset.

CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings

Usage:
    python scripts/apply_ch_annotator_algeria.py
    python scripts/apply_ch_annotator_algeria.py --input IN.yaml --output OUT.yaml
"""

import argparse
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

try:
    import yaml
except ImportError:
    # PyYAML is only needed for the file I/O in apply_ch_annotator(); keeping
    # the module importable without it lets the pure annotation helpers be
    # reused and tested on their own. apply_ch_annotator() re-raises clearly.
    yaml = None

# =============================================================================
# CONVENTION CONSTANTS AND DEFAULTS
# =============================================================================

CONVENTION_ID = "ch_annotator-v1_7_0"
CONVENTION_VERSION = "1.7.0"

# Fallbacks used when an institution record carries no provenance of its own.
DEFAULT_CONVERSATION_ID = "039a271a-f8e3-4bf3-9e89-b289ec80701d"
DEFAULT_EXTRACTION_DATE = "2025-11-09T00:00:00Z"
DEFAULT_EXTRACTION_AGENT = "claude-conversation"

# Default dataset locations (overridable from the command line, see main()).
DEFAULT_INPUT_PATH = Path(
    "/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ghcid.yaml"
)
DEFAULT_OUTPUT_PATH = Path(
    "/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ch_annotator.yaml"
)

# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================

# Map GLAMORCUBESFIXPHDNT institution_type to CH-Annotator hypernym codes
INSTITUTION_TYPE_TO_HYPERNYM = {
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    "FEATURES": "TOP.MON",  # Monuments are toponym-based
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}

# Ontology class mappings for each hypernym
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}

# Human-readable label for the top-level hypernym (the prefix before the
# first "." of a hypernym code).
# NOTE(review): "TOPONYM" is inferred from the "toponym-based" remark on the
# FEATURES mapping above - confirm the exact label against the CH-Annotator spec.
HYPERNYM_LABELS = {
    "GRP": "GROUP",
    "TOP": "TOPONYM",
}

# Comment header written at the top of the output YAML file.
_HEADER_TEMPLATE = """# Algerian GLAM Institutions - CH-Annotator Enhanced
# Last updated: {date}
# CH-Annotator Convention: {convention}
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the {convention} convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers)
#
# Statistics:
# - Total institutions: {total}
# - Annotations added: {annotations}
# - Entity claims created: {claims}
# - Hypernym distribution: {hypernyms}
#
# Original Data Source: Algerian GLAM conversation ({conversation_id})
# Original Extraction Date: {extraction_day}
# Convention Applied: {date}

"""


def get_hypernym_code(institution_type: str) -> str:
    """Get CH-Annotator hypernym code from institution type.

    Unknown institution types fall back to the generic ``"GRP.HER"`` code.
    """
    return INSTITUTION_TYPE_TO_HYPERNYM.get(institution_type, "GRP.HER")


def get_ontology_class(hypernym_code: str) -> str:
    """Get ontology class URI from hypernym code.

    Unknown hypernym codes fall back to ``"glam:HeritageCustodian"``.
    """
    return HYPERNYM_ONTOLOGY_CLASS.get(hypernym_code, "glam:HeritageCustodian")


def _get_provenance(institution: dict) -> dict:
    """Return the institution's provenance mapping, or ``{}`` if absent/null."""
    # A bare `provenance:` key in YAML loads as None, which .get(key, {})
    # would pass straight through; `or {}` covers both missing and null.
    return institution.get("provenance") or {}


def _build_extraction_provenance(provenance: dict) -> dict:
    """Build the 5-component provenance record for the ORIGINAL extraction.

    The agent reflects the model that extracted the data from the
    conversation, not the tool applying CH-Annotator now. A new dict is
    returned on every call, so callers may mutate the result freely.
    """
    conversation_id = provenance.get("conversation_id", DEFAULT_CONVERSATION_ID)
    return {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}",
        "timestamp": provenance.get("extraction_date", DEFAULT_EXTRACTION_DATE),
        "agent": provenance.get("extraction_agent", DEFAULT_EXTRACTION_AGENT),
        "context_convention": CONVENTION_ID,
    }


def create_ch_annotator_block(institution: dict, annotation_date: str) -> dict:
    """Create CH-Annotator entity annotation block for an institution.

    Args:
        institution: One institution record from the dataset. Reads the
            ``institution_type``, ``identifiers`` and ``provenance`` keys;
            all are optional and may be null.
        annotation_date: Timestamp string recorded as the moment the
            convention was applied.

    Returns:
        A new dict with the convention reference, entity classification,
        original extraction provenance, annotation provenance and
        annotation metadata. The input record is not modified.
    """
    institution_type = institution.get("institution_type") or "UNKNOWN"
    hypernym_code = get_hypernym_code(institution_type)
    # Derive the top-level hypernym from the subtype so that e.g. "TOP.MON"
    # is not mislabelled as a GRP/GROUP entity.
    hypernym = hypernym_code.split(".", 1)[0]
    provenance = _get_provenance(institution)

    # rov:RegisteredOrganization only applies when the record has identifiers.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    return {
        "convention_id": CONVENTION_ID,
        "convention_version": CONVENTION_VERSION,
        "annotation_date": annotation_date,
        # Entity classification
        "entity_classification": {
            "hypernym": hypernym,
            "hypernym_label": HYPERNYM_LABELS.get(hypernym, hypernym),
            "subtype": hypernym_code,
            "subtype_label": institution_type,
            "ontology_class": get_ontology_class(hypernym_code),
            "alternative_classes": alternative_classes,
        },
        # Extraction provenance (5-component model)
        # Note: agent reflects the ORIGINAL extraction model from conversation.
        # For Claude conversation exports, the specific model version is often unknown.
        "extraction_provenance": _build_extraction_provenance(provenance),
        # CH-Annotator application provenance (separate from original extraction)
        "annotation_provenance": {
            "annotation_agent": "opencode-claude-sonnet-4",  # Model applying CH-Annotator NOW
            "annotation_date": annotation_date,
            "annotation_method": "retroactive CH-Annotator application via batch script",
        },
        # Confidence and verification
        "annotation_metadata": {
            "confidence_score": provenance.get("confidence_score", 0.85),
            "verified": False,
            "verification_date": None,
            "verified_by": None,
            "annotation_notes": f"Retroactive CH-Annotator annotation applied {annotation_date}. "
                                f"Original extraction from Algerian GLAM conversation.",
        },
    }


def create_entity_claims(institution: dict) -> list:
    """Create CH-Annotator claims for key entity attributes.

    Up to four claims are produced, in this order: full name, institution
    type, city of the first location, and the first usable Wikidata ID.
    Attributes that are missing, null or empty simply yield no claim.

    Args:
        institution: One institution record from the dataset.

    Returns:
        A list of claim dicts, each carrying its own provenance dict.
    """
    claims = []
    provenance = _get_provenance(institution)

    # Base provenance for all claims
    # Note: agent reflects original extraction, not CH-Annotator application
    base_provenance = _build_extraction_provenance(provenance)

    # Claim 1: Institution name
    if institution.get("name"):
        claims.append({
            "claim_type": "full_name",
            "claim_value": institution["name"],
            "property_uri": "skos:prefLabel",
            "provenance": base_provenance.copy(),
            "confidence": provenance.get("confidence_score", 0.9),
        })

    # Claim 2: Institution type
    if institution.get("institution_type"):
        claims.append({
            "claim_type": "institution_type",
            "claim_value": institution["institution_type"],
            "property_uri": "rdf:type",
            "provenance": base_provenance.copy(),
            "confidence": 0.95,  # Type classification is usually reliable
        })

    # Claim 3: Location (city) - only the first location is considered
    locations = institution.get("locations") or []
    first_location = locations[0] if locations else None
    if isinstance(first_location, dict) and first_location.get("city"):
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": first_location["city"],
            "property_uri": "schema:addressLocality",
            "provenance": base_provenance.copy(),
            "confidence": 0.9,
        })

    # Claim 4: Wikidata ID (if present) - first entry with a usable value wins
    for ident in institution.get("identifiers") or []:
        if not isinstance(ident, dict) or ident.get("identifier_scheme") != "Wikidata":
            continue
        wikidata_id = ident.get("identifier_value")
        if not wikidata_id:
            # Malformed entry without a value: keep looking instead of crashing.
            continue
        claims.append({
            "claim_type": "wikidata_id",
            "claim_value": wikidata_id,
            "property_uri": "owl:sameAs",
            "provenance": {
                **base_provenance,
                "namespace": "wikidata",
                "path": f"/entity/{wikidata_id}",
            },
            "confidence": 0.98,  # Wikidata IDs verified via SPARQL
        })
        break

    return claims


def _as_institution_list(data: Any, input_path: Path) -> list:
    """Validate the parsed YAML document and return it as a list of records."""
    if data is None:
        # yaml.safe_load() returns None for an empty document.
        return []
    if not isinstance(data, list):
        raise TypeError(
            f"Expected a top-level YAML list of institutions in {input_path}, "
            f"got {type(data).__name__}"
        )
    return data


def _annotate_institutions(institutions: list, annotation_date: str) -> dict:
    """Attach a ``ch_annotator`` block to every record in place; return stats."""
    hypernym_counts = Counter()
    claims_created = 0

    for institution in institutions:
        annotation = create_ch_annotator_block(institution, annotation_date)
        claims = create_entity_claims(institution)
        annotation["entity_claims"] = claims
        institution["ch_annotator"] = annotation

        hypernym_counts[annotation["entity_classification"]["subtype"]] += 1
        claims_created += len(claims)

    return {
        "total_institutions": len(institutions),
        "annotations_added": len(institutions),
        "claims_created": claims_created,
        "by_hypernym": dict(hypernym_counts),
    }


def _render_header(stats: dict, annotation_date: str) -> str:
    """Render the comment header placed at the top of the output file."""
    return _HEADER_TEMPLATE.format(
        date=annotation_date,
        convention=CONVENTION_ID,
        total=stats["total_institutions"],
        annotations=stats["annotations_added"],
        claims=stats["claims_created"],
        hypernyms=", ".join(f"{k}: {v}" for k, v in sorted(stats["by_hypernym"].items())),
        conversation_id=DEFAULT_CONVERSATION_ID,
        extraction_day=DEFAULT_EXTRACTION_DATE[:10],
    )


def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply CH-Annotator convention to Algeria dataset.

    Loads the institution list from ``input_path``, adds a ``ch_annotator``
    block (including entity claims) to every record, and writes the result,
    preceded by a comment header, to ``output_path``.

    Args:
        input_path: YAML file containing a top-level list of institutions.
        output_path: Destination YAML file; overwritten if it exists.

    Returns:
        Statistics dict with ``total_institutions``, ``annotations_added``,
        ``claims_created`` and ``by_hypernym`` (hypernym code -> count).

    Raises:
        ImportError: If PyYAML is not installed.
        TypeError: If the input document is not a YAML list.
    """
    if yaml is None:
        raise ImportError(
            "PyYAML is required to read and write the dataset (pip install pyyaml)"
        )

    # Load existing data
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = _as_institution_list(yaml.safe_load(f), input_path)

    annotation_date = datetime.now(timezone.utc).isoformat()
    stats = _annotate_institutions(institutions, annotation_date)

    # Write updated data
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(_render_header(stats, annotation_date))
        yaml.dump(institutions, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats


def main(argv: Optional[list] = None) -> None:
    """Main entry point.

    Args:
        argv: Command-line arguments (without the program name). Defaults to
            ``sys.argv[1:]``. With no arguments the built-in default input
            and output paths are used.
    """
    parser = argparse.ArgumentParser(
        description="Apply the CH-Annotator convention to the Algeria GLAM dataset."
    )
    parser.add_argument(
        "--input", type=Path, default=DEFAULT_INPUT_PATH,
        help="source YAML file (default: %(default)s)",
    )
    parser.add_argument(
        "--output", type=Path, default=DEFAULT_OUTPUT_PATH,
        help="destination YAML file (default: %(default)s)",
    )
    args = parser.parse_args(argv)
    input_path = args.input
    output_path = args.output

    print("=" * 70)
    print("CH-Annotator Convention Application")
    print("=" * 70)
    print(f"Input: {input_path}")
    print(f"Output: {output_path}")
    print()

    stats = apply_ch_annotator(input_path, output_path)

    print("Results:")
    print(f" Total institutions: {stats['total_institutions']}")
    print(f" Annotations added: {stats['annotations_added']}")
    print(f" Entity claims created: {stats['claims_created']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(stats["by_hypernym"].items()):
        print(f" {hypernym}: {count}")
    print()
    print(f"Output written to: {output_path}")
    print("=" * 70)


if __name__ == "__main__":
    main()