#!/usr/bin/env python3
"""
Apply CH-Annotator Convention to Algeria GLAM Dataset

This script retroactively adds formal CH-Annotator (ch_annotator-v1_7_0)
provenance metadata to the Algerian heritage institutions dataset.

CH-Annotator v1.7.0 requirements:
1. Entity hypernym codes (GRP.HER for heritage institutions)
2. 5-component claim provenance model
3. Convention version reference
4. Ontology class mappings

Usage:
    python scripts/apply_ch_annotator_algeria.py
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
|
|
# =============================================================================
# CH-ANNOTATOR MAPPINGS
# =============================================================================

# Lookup table: GLAMORCUBESFIXPHDNT institution_type -> CH-Annotator hypernym
# code. Keys are listed in the order of the letters of the acronym
# (G, L, A, M, O, R, C, U, B, E, S, F, I, X, P, H, D, N, T).
INSTITUTION_TYPE_TO_HYPERNYM = {
    # G L A M
    "GALLERY": "GRP.HER.GAL",
    "LIBRARY": "GRP.HER.LIB",
    "ARCHIVE": "GRP.HER.ARC",
    "MUSEUM": "GRP.HER.MUS",
    # O R C U B E S
    "OFFICIAL_INSTITUTION": "GRP.HER.OFF",
    "RESEARCH_CENTER": "GRP.HER.RES",
    "CORPORATION": "GRP.COR",
    "UNKNOWN": "GRP.HER",
    "BOTANICAL_ZOO": "GRP.HER.BOT",
    "EDUCATION_PROVIDER": "GRP.EDU",
    "COLLECTING_SOCIETY": "GRP.HER.SOC",
    # F I X
    # FEATURES is the only type outside the GRP family: monuments are
    # toponym-based.
    "FEATURES": "TOP.MON",
    "INTANGIBLE_HERITAGE_GROUP": "GRP.HER.INT",
    "MIXED": "GRP.HER.MIX",
    # P H D N T
    "PERSONAL_COLLECTION": "GRP.HER.PER",
    "HOLY_SITES": "GRP.HER.HOL",
    "DIGITAL_PLATFORM": "GRP.HER.DIG",
    "NGO": "GRP.ASS",
    "TASTE_SMELL": "GRP.HER.TAS",
}
|
|
|
|
# Lookup table: CH-Annotator hypernym code -> primary ontology class (CURIE).
# Entries use the schema:, glam:, org: and crm: prefixes.
HYPERNYM_ONTOLOGY_CLASS = {
    "GRP.HER.GAL": "schema:Museum",
    "GRP.HER.LIB": "schema:Library",
    "GRP.HER.ARC": "schema:ArchiveOrganization",
    "GRP.HER.MUS": "schema:Museum",
    "GRP.HER.OFF": "schema:GovernmentOrganization",
    "GRP.HER.RES": "schema:ResearchOrganization",
    "GRP.COR": "schema:Corporation",
    # Bare GRP.HER is the generic heritage-custodian bucket.
    "GRP.HER": "glam:HeritageCustodian",
    "GRP.HER.BOT": "schema:Zoo",
    "GRP.EDU": "schema:EducationalOrganization",
    "GRP.HER.SOC": "org:FormalOrganization",
    "TOP.MON": "crm:E22_Human-Made_Object",
    "GRP.HER.INT": "crm:E74_Group",
    "GRP.HER.MIX": "glam:HeritageCustodian",
    "GRP.HER.PER": "glam:PersonalCollection",
    "GRP.HER.HOL": "schema:PlaceOfWorship",
    "GRP.HER.DIG": "schema:WebSite",
    "GRP.ASS": "org:FormalOrganization",
    "GRP.HER.TAS": "glam:HeritageCustodian",
}
|
|
|
|
|
|
def get_hypernym_code(institution_type: str) -> str:
    """Translate an institution type into its CH-Annotator hypernym code.

    Types missing from INSTITUTION_TYPE_TO_HYPERNYM resolve to the generic
    "GRP.HER" code.
    """
    try:
        return INSTITUTION_TYPE_TO_HYPERNYM[institution_type]
    except KeyError:
        return "GRP.HER"
|
|
|
|
|
|
def get_ontology_class(hypernym_code: str) -> str:
    """Resolve a hypernym code to its ontology class identifier.

    Codes missing from HYPERNYM_ONTOLOGY_CLASS resolve to
    "glam:HeritageCustodian".
    """
    mapped_class = HYPERNYM_ONTOLOGY_CLASS.get(hypernym_code)
    if mapped_class is None:
        return "glam:HeritageCustodian"
    return mapped_class
|
|
|
|
|
|
def create_ch_annotator_block(institution: dict, annotation_date: str) -> dict:
    """Create the CH-Annotator entity annotation block for one institution.

    Args:
        institution: Institution record. The keys read here are
            "institution_type", "identifiers" and "provenance"; missing or
            null values fall back to defaults.
        annotation_date: Timestamp string recorded as the moment the
            convention was applied.

    Returns:
        A dict with the keys "convention_id", "convention_version",
        "annotation_date", "entity_classification", "extraction_provenance",
        "annotation_provenance" and "annotation_metadata", in that order.
    """
    # `or` also covers keys that are present but hold an explicit null.
    institution_type = institution.get("institution_type") or "UNKNOWN"
    hypernym_code = get_hypernym_code(institution_type)
    ontology_class = get_ontology_class(hypernym_code)

    # The top-level hypernym is the first dotted segment of the code, so a
    # "TOP.MON" subtype is reported under "TOP" rather than a hard-coded "GRP".
    hypernym = hypernym_code.split(".", 1)[0]
    # NOTE(review): the "TOPONYM" label is inferred from the mapping comment
    # "Monuments are toponym-based"; confirm against the convention spec.
    hypernym_labels = {"GRP": "GROUP", "TOP": "TOPONYM"}
    hypernym_label = hypernym_labels.get(hypernym, hypernym)

    # Reuse whatever the original extraction recorded; a null provenance is
    # treated the same as a missing one.
    provenance = institution.get("provenance") or {}
    conversation_id = provenance.get(
        "conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d"
    )
    extraction_date = provenance.get("extraction_date", "2025-11-09T00:00:00Z")
    confidence_score = provenance.get("confidence_score", 0.85)

    # rov:RegisteredOrganization is only offered when the record carries
    # identifiers.
    alternative_classes = ["org:FormalOrganization"]
    if institution.get("identifiers"):
        alternative_classes.append("rov:RegisteredOrganization")
    alternative_classes.append("glam:HeritageCustodian")

    return {
        "convention_id": "ch_annotator-v1_7_0",
        "convention_version": "1.7.0",
        "annotation_date": annotation_date,
        # Entity classification
        "entity_classification": {
            "hypernym": hypernym,
            "hypernym_label": hypernym_label,
            "subtype": hypernym_code,
            "subtype_label": institution_type,
            "ontology_class": ontology_class,
            "alternative_classes": alternative_classes,
        },
        # Extraction provenance (5-component model). The agent reflects the
        # ORIGINAL extraction, not this script; the generic
        # "claude-conversation" value is used when no agent was recorded.
        "extraction_provenance": {
            "namespace": "glam",
            "path": f"/conversations/{conversation_id}",
            "timestamp": extraction_date,
            "agent": provenance.get("extraction_agent", "claude-conversation"),
            "context_convention": "ch_annotator-v1_7_0",
        },
        # Provenance of the CH-Annotator application itself, kept separate
        # from the original extraction.
        "annotation_provenance": {
            "annotation_agent": "opencode-claude-sonnet-4",
            "annotation_date": annotation_date,
            "annotation_method": "retroactive CH-Annotator application via batch script",
        },
        # Confidence and verification
        "annotation_metadata": {
            "confidence_score": confidence_score,
            "verified": False,
            "verification_date": None,
            "verified_by": None,
            "annotation_notes": f"Retroactive CH-Annotator annotation applied {annotation_date}. "
            f"Original extraction from Algerian GLAM conversation.",
        },
    }
|
|
|
|
|
|
def create_entity_claims(institution: dict) -> list:
    """Create CH-Annotator claims for the key attributes of an institution.

    Up to four claims are produced, in this order: full name, institution
    type, city of the first location, and the first usable Wikidata
    identifier. A claim is skipped when its source value is missing or empty.

    Args:
        institution: Institution record. The keys read here are "name",
            "institution_type", "locations", "identifiers" and "provenance";
            missing or null values are tolerated.

    Returns:
        A list of claim dicts, each with "claim_type", "claim_value",
        "property_uri", "provenance" and "confidence".
    """
    # `or` guards against keys that are present but hold an explicit null.
    provenance = institution.get("provenance") or {}
    conversation_id = provenance.get(
        "conversation_id", "039a271a-f8e3-4bf3-9e89-b289ec80701d"
    )
    extraction_date = provenance.get("extraction_date", "2025-11-09T00:00:00Z")

    # Shared provenance; the agent reflects the original extraction, not the
    # CH-Annotator application. Each claim receives its own copy so later
    # edits to one claim cannot leak into the others.
    base_provenance = {
        "namespace": "glam",
        "path": f"/conversations/{conversation_id}",
        "timestamp": extraction_date,
        "agent": provenance.get("extraction_agent", "claude-conversation"),
        "context_convention": "ch_annotator-v1_7_0",
    }

    claims = []

    # Claim 1: institution name
    name = institution.get("name")
    if name:
        claims.append({
            "claim_type": "full_name",
            "claim_value": name,
            "property_uri": "skos:prefLabel",
            "provenance": dict(base_provenance),
            "confidence": provenance.get("confidence_score", 0.9),
        })

    # Claim 2: institution type
    institution_type = institution.get("institution_type")
    if institution_type:
        claims.append({
            "claim_type": "institution_type",
            "claim_value": institution_type,
            "property_uri": "rdf:type",
            "provenance": dict(base_provenance),
            "confidence": 0.95,  # Type classification is usually reliable
        })

    # Claim 3: city, taken from the first location entry only
    locations = institution.get("locations") or []
    first_location = (locations[0] or {}) if locations else {}
    city = first_location.get("city")
    if city:
        claims.append({
            "claim_type": "located_in_city",
            "claim_value": city,
            "property_uri": "schema:addressLocality",
            "provenance": dict(base_provenance),
            "confidence": 0.9,
        })

    # Claim 4: first Wikidata identifier that actually carries a value
    for ident in institution.get("identifiers") or []:
        if not ident or ident.get("identifier_scheme") != "Wikidata":
            continue
        wikidata_id = ident.get("identifier_value")
        if not wikidata_id:
            # A malformed entry must not abort the whole record.
            continue
        claims.append({
            "claim_type": "wikidata_id",
            "claim_value": wikidata_id,
            "property_uri": "owl:sameAs",
            # Same base provenance, re-pointed at the Wikidata entity.
            "provenance": {
                **base_provenance,
                "namespace": "wikidata",
                "path": f"/entity/{wikidata_id}",
            },
            "confidence": 0.98,  # Wikidata IDs verified via SPARQL
        })
        break

    return claims
|
|
|
|
|
|
def _render_output_header(stats: dict, annotation_date: str) -> str:
    """Render the comment header written at the top of the output YAML file."""
    template = """# Algerian GLAM Institutions - CH-Annotator Enhanced
# Last updated: {date}
# CH-Annotator Convention: ch_annotator-v1_7_0
#
# This file has been enhanced with formal CH-Annotator (Cultural Heritage Annotator)
# provenance metadata following the ch_annotator-v1_7_0 convention.
#
# CH-Annotator Features Applied:
# - Entity hypernym codes (GRP.HER.* for heritage institutions)
# - 5-component claim provenance model (namespace, path, timestamp, agent, convention)
# - Ontology class mappings (CIDOC-CRM, Schema.org, W3C Org)
# - Entity claims for key attributes (name, type, location, identifiers)
#
# Statistics:
# - Total institutions: {total}
# - Annotations added: {annotations}
# - Entity claims created: {claims}
# - Hypernym distribution: {hypernyms}
#
# Original Data Source: Algerian GLAM conversation (039a271a-f8e3-4bf3-9e89-b289ec80701d)
# Original Extraction Date: 2025-11-09
# Convention Applied: {date}

"""
    return template.format(
        date=annotation_date,
        total=stats["total_institutions"],
        annotations=stats["annotations_added"],
        claims=stats["claims_created"],
        hypernyms=", ".join(
            f"{code}: {count}" for code, count in sorted(stats["by_hypernym"].items())
        ),
    )


def apply_ch_annotator(input_path: Path, output_path: Path) -> dict:
    """Apply the CH-Annotator convention to the Algeria dataset.

    Loads the institution list from ``input_path``, attaches a
    ``ch_annotator`` block (including ``entity_claims``) to every institution
    and writes the result, preceded by a comment header, to ``output_path``.

    Args:
        input_path: YAML file whose top level is a list of institution records.
        output_path: Destination YAML file; overwritten if it exists.

    Returns:
        Statistics dict with "total_institutions", "annotations_added",
        "claims_created" and "by_hypernym" (hypernym code -> count).

    Raises:
        ValueError: If the input document is not a list at the top level.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # An empty YAML document loads as None; treat it as "no institutions"
    # instead of crashing in the loop below.
    if data is None:
        data = []
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a list of institutions in {input_path}, "
            f"got {type(data).__name__}"
        )

    annotation_date = datetime.now(timezone.utc).isoformat()

    stats = {
        "total_institutions": 0,
        "annotations_added": 0,
        "claims_created": 0,
        "by_hypernym": {},
    }
    by_hypernym = stats["by_hypernym"]

    for institution in data:
        stats["total_institutions"] += 1

        # Attach the entity annotation block.
        ch_annotation = create_ch_annotator_block(institution, annotation_date)
        institution["ch_annotator"] = ch_annotation
        stats["annotations_added"] += 1

        # Track the distribution by full hypernym code (the "subtype" field).
        code = ch_annotation["entity_classification"]["subtype"]
        by_hypernym[code] = by_hypernym.get(code, 0) + 1

        # Attach the per-attribute claims to the same block.
        claims = create_entity_claims(institution)
        ch_annotation["entity_claims"] = claims
        stats["claims_created"] += len(claims)

    # Render the header before opening the output so a formatting error
    # cannot leave a truncated file behind.
    header = _render_output_header(stats, annotation_date)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(header)
        # sort_keys=False keeps the key order produced by the annotators.
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    return stats
|
|
|
|
|
|
def main(input_path=None, output_path=None):
    """Main entry point: annotate the dataset and print a summary.

    Args:
        input_path: Optional path (str or Path) of the source YAML file.
            Defaults to the original hard-coded Algeria dataset location.
        output_path: Optional path (str or Path) of the annotated YAML file
            to write. Defaults to the original hard-coded output location.
    """
    # The defaults are the original machine-specific paths, so calling
    # main() without arguments behaves exactly as before.
    if input_path is None:
        input_path = "/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ghcid.yaml"
    if output_path is None:
        output_path = "/Users/kempersc/apps/glam/data/instances/algeria/algerian_institutions_ch_annotator.yaml"
    input_path = Path(input_path)
    output_path = Path(output_path)

    rule = "=" * 70

    print(rule)
    print("CH-Annotator Convention Application")
    print(rule)
    print(f"Input: {input_path}")
    print(f"Output: {output_path}")
    print()

    stats = apply_ch_annotator(input_path, output_path)

    print("Results:")
    print(f" Total institutions: {stats['total_institutions']}")
    print(f" Annotations added: {stats['annotations_added']}")
    print(f" Entity claims created: {stats['claims_created']}")
    print()
    print("Hypernym Distribution:")
    for hypernym, count in sorted(stats["by_hypernym"].items()):
        print(f" {hypernym}: {count}")
    print()
    print(f"Output written to: {output_path}")
    print(rule)
|
|
|
|
|
|
# Run the annotation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|