glam/scripts/apply_ch_annotator_algeria.py
2025-12-07 00:26:01 +01:00

327 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Apply CH-Annotator Convention to Algeria GLAM Dataset
This script retroactively adds formal CH-Annotator (ch_annotator-v1_7_0)
provenance metadata to the Algerian heritage institutions dataset.
CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings
Usage:
python scripts/apply_ch_annotator_algeria.py
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================
# Map GLAMORCUBESFIXPHDNT institution_type to CH-Annotator hypernym codes
INSTITUTION_TYPE_TO_HYPERNYM = {
"GALLERY": "GRP.HER.GAL",
"LIBRARY": "GRP.HER.LIB",
"ARCHIVE": "GRP.HER.ARC",
"MUSEUM": "GRP.HER.MUS",
"OFFICIAL_INSTITUTION": "GRP.HER.OFF",
"RESEARCH_CENTER": "GRP.HER.RES",
"CORPORATION": "GRP.COR",
"UNKNOWN": "GRP.HER",
"BOTANICAL_ZOO": "GRP.HER.BOT",
"EDUCATION_PROVIDER": "GRP.EDU",
"COLLECTING_SOCIETY": "GRP.HER.SOC",
"FEATURES": "TOP.MON", # Monuments are toponym-based
"INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
"MIXED": "GRP.HER.MIX",
"PERSONAL_COLLECTION": "GRP.HER.PER",
"HOLY_SITES": "GRP.HER.HOL",
"DIGITAL_PLATFORM": "GRP.HER.DIG",
"NGO": "GRP.ASS",
"TASTE_SMELL": "GRP.HER.TAS",
}
# Ontology class mappings for each hypernym
# Maps each hypernym code to a primary ontology class URI. Vocabulary
# prefixes used: schema (Schema.org), crm (CIDOC-CRM), org (W3C Org),
# glam (presumably a project-local namespace — confirm its definition).
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",  # NOTE(review): Schema.org has no Gallery type; Museum looks intentional — confirm
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}
def get_hypernym_code(institution_type: str) -> str:
    """Map a dataset institution type to its CH-Annotator hypernym code.

    Unrecognized types fall back to the generic heritage code "GRP.HER".
    """
    try:
        return INSTITUTION_TYPE_TO_HYPERNYM[institution_type]
    except KeyError:
        return "GRP.HER"
def get_ontology_class(hypernym_code: str) -> str:
    """Resolve a hypernym code to its primary ontology class URI.

    Codes without a dedicated mapping resolve to the generic
    "glam:HeritageCustodian" class.
    """
    if hypernym_code in HYPERNYM_ONTOLOGY_CLASS:
        return HYPERNYM_ONTOLOGY_CLASS[hypernym_code]
    return "glam:HeritageCustodian"
def create_ch_annotator_block(institution: dict, annotation_date: str) -> dict:
    """Create the CH-Annotator entity annotation block for one institution.

    Args:
        institution: Institution record; reads ``institution_type``,
            ``provenance`` and ``identifiers``.
        annotation_date: ISO-8601 timestamp of this CH-Annotator run.

    Returns:
        A dict following the ch_annotator-v1_7_0 convention: entity
        classification, original-extraction provenance (5-component model),
        annotation provenance, and confidence/verification metadata.
    """
    institution_type = institution.get("institution_type", "UNKNOWN")
    hypernym_code = get_hypernym_code(institution_type)
    ontology_class = get_ontology_class(hypernym_code)

    # Reuse the original extraction provenance where present; defaults
    # identify the source Algerian GLAM conversation.
    provenance = institution.get("provenance", {})
    conversation_id = provenance.get("conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d")
    extraction_date = provenance.get("extraction_date", "2025-11-09T00:00:00Z")
    confidence_score = provenance.get("confidence_score", 0.85)

    # FIX: derive the hypernym prefix from the subtype code instead of
    # hard-coding "GRP"/"GROUP". The original emitted hypernym "GRP" even for
    # toponym-based codes (FEATURES -> "TOP.MON"), contradicting the subtype.
    hypernym = hypernym_code.split(".", 1)[0]
    hypernym_label = {"GRP": "GROUP", "TOP": "TOPONYM"}.get(hypernym, hypernym)

    # Only records carrying identifiers plausibly map to a registered
    # organization; append conditionally instead of filtering out None later.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    return {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        "annotation_date": annotation_date,
        # Entity classification
        "entity_classification": {
            "hypernym": hypernym,
            "hypernym_label": hypernym_label,
            "subtype": hypernym_code,
            "subtype_label": institution_type,
            "ontology_class": ontology_class,
            "alternative_classes": alternative_classes,
        },
        # Extraction provenance (5-component model).
        # Note: agent reflects the ORIGINAL extraction model from the
        # conversation; for Claude conversation exports the specific model
        # version is often unknown.
        "extraction_provenance": {
            "namespace": "glam",
            "path": f"/conversations/{conversation_id}",
            "timestamp": extraction_date,
            "agent": provenance.get("extraction_agent", "claude-conversation"),
            "context_convention": "ch_annotator-v1_7_0",
        },
        # CH-Annotator application provenance (separate from original extraction)
        "annotation_provenance": {
            "annotation_agent": "opencode-claude-sonnet-4",  # Model applying CH-Annotator NOW
            "annotation_date": annotation_date,
            "annotation_method": "retroactive CH-Annotator application via batch script",
        },
        # Confidence and verification
        "annotation_metadata": {
            "confidence_score": confidence_score,
            "verified": False,
            "verification_date": None,
            "verified_by": None,
            "annotation_notes": f"Retroactive CH-Annotator annotation applied {annotation_date}. "
            f"Original extraction from Algerian GLAM conversation.",
        },
    }
def create_entity_claims(institution: dict) -> list:
    """Build the CH-Annotator claim list for an institution's key attributes.

    Emits up to four claims — preferred name, institution type, the first
    location's city, and a Wikidata identifier — each carrying a
    5-component provenance record pointing back to the source conversation.
    """
    prov = institution.get("provenance", {})
    convo_id = prov.get("conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d")
    extracted_at = prov.get("extraction_date", "2025-11-09T00:00:00Z")

    def fresh_provenance() -> dict:
        # One dict per claim so an edit to one claim's provenance cannot
        # bleed into its siblings (mirrors base_provenance.copy()).
        return {
            "namespace": "glam",
            "path": f"/conversations/{convo_id}",
            "timestamp": extracted_at,
            "agent": prov.get("extraction_agent", "claude-conversation"),
            "context_convention": "ch_annotator-v1_7_0",
        }

    claims: list = []

    # Claim 1: preferred name
    display_name = institution.get("name")
    if display_name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": display_name,
            "property_uri": "skos:prefLabel",
            "provenance": fresh_provenance(),
            "confidence": prov.get("confidence_score", 0.9),
        })

    # Claim 2: institution type (classification is usually reliable)
    inst_type = institution.get("institution_type")
    if inst_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": inst_type,
            "property_uri": "rdf:type",
            "provenance": fresh_provenance(),
            "confidence": 0.95,
        })

    # Claim 3: city of the first listed location
    locations = institution.get("locations", [])
    if locations and locations[0].get("city"):
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": locations[0]["city"],
            "property_uri": "schema:addressLocality",
            "provenance": fresh_provenance(),
            "confidence": 0.9,
        })

    # Claim 4: first Wikidata identifier, if any (IDs verified via SPARQL)
    wikidata = next(
        (ident for ident in institution.get("identifiers", [])
         if ident.get("identifier_scheme") == "Wikidata"),
        None,
    )
    if wikidata is not None:
        wd_provenance = fresh_provenance()
        wd_provenance["namespace"] = "wikidata"
        wd_provenance["path"] = f"/entity/{wikidata['identifier_value']}"
        claims.append({
            "claim_type": "wikidata_id",
            "claim_value": wikidata["identifier_value"],
            "property_uri": "owl:sameAs",
            "provenance": wd_provenance,
            "confidence": 0.98,
        })

    return claims
def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply the CH-Annotator convention to the Algeria dataset.

    Reads the YAML list of institutions at *input_path*, attaches a
    ``ch_annotator`` annotation block plus entity claims to each record
    in place, and writes the enhanced list to *output_path* behind a
    commented statistics header.

    Args:
        input_path: Existing GLAM YAML file (a top-level list of dicts).
        output_path: Destination for the CH-Annotator-enhanced YAML.

    Returns:
        Run statistics: total_institutions, annotations_added,
        claims_created, and a by_hypernym subtype counter.
    """
    # Load existing data. FIX: yaml.safe_load returns None for an empty
    # document, which previously crashed the loop below — coalesce to [].
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []

    annotation_date = datetime.now(timezone.utc).isoformat()
    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
    }

    # Process each institution, mutating the record in place.
    for institution in data:
        stats["total_institutions"] += 1

        # Add CH-Annotator entity annotation block
        ch_annotation = create_ch_annotator_block(institution, annotation_date)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1

        # Track hypernym distribution (keyed by full subtype code)
        hypernym = ch_annotation["entity_classification"]["subtype"]
        stats["by_hypernym"][hypernym] = stats["by_hypernym"].get(hypernym, 0) + 1

        # Add entity claims (name, type, city, identifiers)
        claims = create_entity_claims(institution)
        institution["ch_annotator"]["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Write updated data behind a commented header summarizing the run.
    with open(output_path, 'w', encoding='utf-8') as f:
        header = """# Algerian GLAM Institutions - CH-Annotator Enhanced
# Last updated: {date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers)
#
# Statistics:
# - Total institutions: {total}
# - Annotations added: {annotations}
# - Entity claims created: {claims}
# - Hypernym distribution: {hypernyms}
#
# Original Data Source: Algerian GLAM conversation (039a271a-f8e3-4bf3-9e89-b289ec80701d)
# Original Extraction Date: 2025-11-09
# Convention Applied: {date}
""".format(
            date=annotation_date,
            total=stats["total_institutions"],
            annotations=stats["annotations_added"],
            claims=stats["claims_created"],
            hypernyms=", ".join(f"{k}: {v}" for k, v in sorted(stats["by_hypernym"].items())),
        )
        f.write(header)
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    return stats
def main():
    """Entry point: apply the CH-Annotator convention and print a summary."""
    input_path = Path("/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ghcid.yaml")
    output_path = Path("/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ch_annotator.yaml")

    banner = "=" * 70
    print(banner)
    print("CH-Annotator Convention Application")
    print(banner)
    print(f"Input: {input_path}")
    print(f"Output: {output_path}")
    print()

    stats = apply_ch_annotator(input_path, output_path)

    print("Results:")
    print(f" Total institutions: {stats['total_institutions']}")
    print(f" Annotations added: {stats['annotations_added']}")
    print(f" Entity claims created: {stats['claims_created']}")
    print()
    print("Hypernym Distribution:")
    for code, tally in sorted(stats["by_hypernym"].items()):
        print(f" {code}: {tally}")
    print()
    print(f"Output written to: {output_path}")
    print(banner)
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()