glam/scripts/enrich_chilean_batch13.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

140 lines
4.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Chilean Batch 13: Apply Validated Wikidata Enrichment
Adds 1 confirmed Q-number from manual search results.
Validated match:
- Archivo General de Asuntos Indígenas (CONADI) → Q21002896
Expected outcome: 61/90 (67.8%) coverage
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
def load_yaml(filepath: str):
    """Parse a YAML document from *filepath* and return the resulting data.

    Uses ``yaml.safe_load`` so only plain Python data structures
    (dicts, lists, scalars) are ever constructed.
    """
    source = Path(filepath)
    with source.open(encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data, filepath: str):
    """Serialize *data* as YAML into *filepath* (UTF-8).

    Keys keep their insertion order (``sort_keys=False``) and non-ASCII
    characters are written verbatim (``allow_unicode=True``).
    """
    destination = Path(filepath)
    with destination.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, allow_unicode=True, sort_keys=False, width=120)
def main():
    """Apply the validated Wikidata enrichment for Batch 13.

    Loads the Batch 11 enriched dataset, attaches the single manually
    validated Q-number (Archivo General de Asuntos Indígenas (CONADI)
    → Q21002896), saves the result as the Batch 13 file, and prints a
    coverage summary against the 70% (63/90) target.
    """
    # Load the current enriched dataset (from Batch 11).
    input_file = 'data/instances/chile/chilean_institutions_batch11_enriched.yaml'
    output_file = 'data/instances/chile/chilean_institutions_batch13_enriched.yaml'

    print("=" * 80)
    print("Chilean Batch 13: Wikidata Enrichment")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    institutions = load_yaml(input_file)

    # Single match confirmed by a manual SPARQL search against Wikidata.
    validated_match = {
        'name': 'Archivo General de Asuntos Indígenas (CONADI)',
        'q_number': 'Q21002896',
        'wikidata_url': 'https://www.wikidata.org/wiki/Q21002896',
        'rationale': 'Exact name match from Wikidata SPARQL query'
    }

    print("Applying validated match:")
    # Fix: the original f-string ran name and Q-number together with no
    # separator ("...(CONADI)Q21002896"); insert an arrow between them.
    print(f"  {validated_match['name']} → {validated_match['q_number']}")
    print()

    # Locate the target institution by exact name and enrich it.
    enriched_count = 0
    for institution in institutions:
        if institution.get('name') != validated_match['name']:
            continue

        # Never overwrite an identifier that is already present.
        existing = next(
            (ident for ident in institution.get('identifiers') or []
             if ident.get('identifier_scheme') == 'Wikidata'),
            None,
        )
        if existing is not None:
            print(f"  ⚠ Already has Wikidata: {existing.get('identifier_value')}")
        else:
            # Create the identifiers list if missing (or YAML null).
            if not institution.get('identifiers'):
                institution['identifiers'] = []
            institution['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': validated_match['q_number'],
                'identifier_url': validated_match['wikidata_url'],
            })

            # Record how and when this identifier was added.
            if not institution.get('provenance'):
                institution['provenance'] = {}
            institution['provenance']['wikidata_enrichment'] = {
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Manual Wikidata SPARQL search with exact name matching',
                'enrichment_batch': 'batch13',
                'match_confidence': 'high',
                'match_rationale': validated_match['rationale'],
            }
            enriched_count += 1
            print(f"  ✓ Added Wikidata Q-number: {validated_match['q_number']}")
        break

    # Save enriched dataset.
    save_yaml(institutions, output_file)

    print()
    print("=" * 80)
    print("Enrichment Summary")
    print("=" * 80)

    # Count institutions that now carry a Wikidata identifier.
    # ('ident' instead of 'id' — the original shadowed the builtin.)
    total_institutions = len(institutions)
    institutions_with_wikidata = sum(
        1 for inst in institutions
        if any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in inst.get('identifiers') or []
        )
    )
    # Guard against an empty dataset to avoid ZeroDivisionError.
    coverage_pct = (
        institutions_with_wikidata / total_institutions * 100
        if total_institutions else 0.0
    )

    print(f"Total institutions: {total_institutions}")
    print(f"Institutions with Wikidata: {institutions_with_wikidata}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    print(f"Output saved to: {output_file}")
    print()

    # Report progress toward the 70% coverage target.
    print("=" * 80)
    print("Target Progress")
    print("=" * 80)
    print("Target coverage: 70% (63/90)")
    print(f"Current coverage: {coverage_pct:.1f}% ({institutions_with_wikidata}/{total_institutions})")
    if coverage_pct >= 70.0:
        print("✓ TARGET REACHED!")
    else:
        remaining = 63 - institutions_with_wikidata
        print(f"⚠ Need {remaining} more matches to reach 70% target")
    print()
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()