glam/scripts/enrich_chilean_batch17.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

111 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Batch 17 Enrichment - Library Parent Organization Linkage
Strategy: Apply municipality/parent organization Q-numbers to municipal libraries
where we have confirmed parent organization Wikidata entries.
Batch 17 Target:
- Biblioteca Pública Federico Varela → Q3763 (Chañaral municipality)
Rationale:
- Federico Varela is a municipal public library in Chañaral
- Chañaral municipality has Wikidata entry Q3763
- Parent organization enrichment is appropriate for municipal libraries
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Repository root: two `.parent` hops up from this script file. The input and
# output YAML paths used by enrich_batch_17() are built relative to this.
REPO_ROOT = Path(__file__).parent.parent
def _apply_parent_org_enrichment(inst, enrichment):
    """Attach a parent-organization Wikidata ID and provenance to one record.

    Args:
        inst: Institution mapping loaded from YAML; mutated in place.
        enrichment: Mapping with 'q_number', 'wikidata_name', 'match_reason'
            and 'enrichment_type' keys.

    Returns:
        True if the record was modified, False if it already carries a
        Wikidata identifier and was therefore left untouched.
    """
    identifiers = inst.get('identifiers')
    if identifiers is None:
        # Covers both a missing key and an explicit YAML null.
        identifiers = inst['identifiers'] = []

    # Never overwrite or duplicate an existing Wikidata identifier.
    if any(entry.get('identifier_scheme') == 'Wikidata' for entry in identifiers):
        return False

    q_number = enrichment['q_number']
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    })

    # Create the provenance mapping on demand instead of raising KeyError,
    # mirroring how 'identifiers' is handled above.
    provenance = inst.get('provenance')
    if provenance is None:
        provenance = inst['provenance'] = {}

    # Insertion order is kept on purpose: the file is dumped with
    # sort_keys=False, so new keys appear in this order.
    provenance.update({
        'data_tier': 'TIER_3_CROWD_SOURCED',
        'last_updated': datetime.now(timezone.utc).isoformat(),
        'enrichment_batch': 17,
        'wikidata_match_confidence': 'MEDIUM',
        'wikidata_match_reason': enrichment['match_reason'],
        'wikidata_name': enrichment['wikidata_name'],
    })

    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    notes.append(
        f"Batch 17: {enrichment['enrichment_type']} enrichment - "
        f"{enrichment['wikidata_name']} (parent organization for municipal library)"
    )
    return True


def enrich_batch_17():
    """Apply parent organization Q-numbers to municipal libraries.

    Loads the batch 16 enriched YAML from under REPO_ROOT, adds the Wikidata
    identifier of the parent municipality to each targeted institution that
    does not yet have a Wikidata identifier, updates its provenance, and
    writes the full list to the batch 17 enriched YAML file.

    Returns:
        int: Number of institutions modified by this run.
    """
    chile_dir = REPO_ROOT / 'data' / 'instances' / 'chile'

    # Load current data
    input_file = chile_dir / 'chilean_institutions_batch16_enriched.yaml'
    with open(input_file, 'r', encoding='utf-8') as f:
        # safe_load yields None for an empty document; treat that as "no records"
        # rather than failing on iteration below.
        institutions = yaml.safe_load(f) or []

    # Batch 17 enrichments: Municipal library parent organizations
    enrichments = {
        'https://w3id.org/heritage/custodian/cl/l-biblioteca-p-blica-federico-va-0015': {
            'q_number': 'Q3763',
            'wikidata_name': 'Chañaral, commune in Atacama Region, Chile',
            'match_reason': 'Parent organization - Municipal public library in Chañaral',
            'enrichment_type': 'parent_organization',
        },
    }

    # Apply enrichments
    enrichment_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        enrichment = enrichments.get(inst_id)
        if enrichment is None:
            continue
        if not _apply_parent_org_enrichment(inst, enrichment):
            continue

        enrichment_count += 1
        # Fall back to the ID so a record without a name cannot abort the run
        # after it has already been modified.
        print(f"✓ Enriched: {inst.get('name', inst_id)}")
        print(f" Q-number: {enrichment['q_number']} ({enrichment['enrichment_type']})")
        print(f" Reason: {enrichment['match_reason']}\n")

    # Save updated data
    output_file = chile_dir / 'chilean_institutions_batch17_enriched.yaml'
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(f"\n{'='*60}")
    print("Batch 17 Enrichment Complete")
    print(f"{'='*60}")
    print(f"Institutions enriched: {enrichment_count}")
    print(f"Output: {output_file}")
    print("\nNext: Calculate coverage statistics to assess impact")
    return enrichment_count
if __name__ == '__main__':
    # Run the batch; a zero count means nothing was changed, so tell the
    # operator the two likely reasons.
    total = enrich_batch_17()
    if not total:
        warning_lines = (
            "\n⚠️ WARNING: No institutions were enriched!",
            "This may indicate the target institutions already have Wikidata identifiers",
            "or the institution IDs have changed.",
        )
        print("\n".join(warning_lines))