- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
140 lines
4.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Chilean Batch 13: Apply Validated Wikidata Enrichment
|
|
Adds 1 confirmed Q-number from manual search results.
|
|
|
|
Validated match:
|
|
- Archivo General de Asuntos Indígenas (CONADI) → Q21002896
|
|
|
|
Expected outcome: 61/90 (67.8%) coverage
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
def load_yaml(filepath: str):
    """Parse a YAML document and return the deserialized object.

    Args:
        filepath: Path to the YAML file to read (UTF-8 encoded).

    Returns:
        The parsed Python object — typically a list or dict.
    """
    with open(filepath, 'r', encoding='utf-8') as stream:
        return yaml.safe_load(stream)
|
|
|
|
def save_yaml(data, filepath: str):
    """Serialize *data* as YAML to *filepath*.

    Unicode characters are emitted literally (not escaped), the original
    key order is preserved, and lines wrap at 120 columns.

    Args:
        data: The Python object to serialize.
        filepath: Destination path for the YAML output (UTF-8 encoded).
    """
    with open(filepath, 'w', encoding='utf-8') as stream:
        yaml.dump(data, stream, allow_unicode=True, sort_keys=False, width=120)
|
|
|
|
def _existing_wikidata(institution):
    """Return the institution's existing Wikidata identifier dict, or None."""
    for identifier in institution.get('identifiers') or []:
        if identifier.get('identifier_scheme') == 'Wikidata':
            return identifier
    return None


def _apply_match(institutions, match):
    """Attach the validated Q-number to the institution named in *match*.

    Matches by exact name, skips (with a warning) institutions that already
    carry a Wikidata identifier, and records PROV-style enrichment metadata.

    Args:
        institutions: List of institution dicts (mutated in place).
        match: Dict with 'name', 'q_number', 'wikidata_url', 'rationale'.

    Returns:
        The number of institutions enriched (0 or 1).
    """
    for institution in institutions:
        # Match by name — only the first matching institution is processed.
        if institution.get('name') != match['name']:
            continue

        existing = _existing_wikidata(institution)
        if existing is not None:
            print(f"  ⚠ Already has Wikidata: {existing.get('identifier_value')}")
            return 0

        # 'identifiers' may be absent OR explicitly None in the YAML,
        # so a plain setdefault() is not sufficient here.
        if not institution.get('identifiers'):
            institution['identifiers'] = []
        institution['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': match['q_number'],
            'identifier_url': match['wikidata_url']
        })

        if not institution.get('provenance'):
            institution['provenance'] = {}
        institution['provenance']['wikidata_enrichment'] = {
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Manual Wikidata SPARQL search with exact name matching',
            'enrichment_batch': 'batch13',
            'match_confidence': 'high',
            'match_rationale': match['rationale']
        }

        print(f"  ✓ Added Wikidata Q-number: {match['q_number']}")
        return 1
    return 0


def _count_wikidata(institutions):
    """Count institutions that have at least one Wikidata identifier."""
    return sum(
        1 for inst in institutions
        if any(
            # Avoid shadowing the builtin `id` (original used `id` here).
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in inst.get('identifiers') or []
        )
    )


def main():
    """Apply validated Wikidata enrichment for Batch 13.

    Loads the Batch 11 enriched dataset, adds the single manually validated
    Q-number, writes the Batch 13 output file, and prints coverage progress
    toward the 70% (63/90) target.
    """
    # Load the current enriched dataset (from Batch 11)
    input_file = 'data/instances/chile/chilean_institutions_batch11_enriched.yaml'
    output_file = 'data/instances/chile/chilean_institutions_batch13_enriched.yaml'

    print("=" * 80)
    print("Chilean Batch 13: Wikidata Enrichment")
    print("=" * 80)
    print(f"Input: {input_file}")
    print(f"Output: {output_file}")
    print()

    institutions = load_yaml(input_file)

    # Validated match from manual search
    validated_match = {
        'name': 'Archivo General de Asuntos Indígenas (CONADI)',
        'q_number': 'Q21002896',
        'wikidata_url': 'https://www.wikidata.org/wiki/Q21002896',
        'rationale': 'Exact name match from Wikidata SPARQL query'
    }

    print("Applying validated match:")
    print(f"  {validated_match['name']} → {validated_match['q_number']}")
    print()

    # Find and enrich the institution
    enriched_count = _apply_match(institutions, validated_match)

    # Save enriched dataset
    save_yaml(institutions, output_file)

    print()
    print("=" * 80)
    print("Enrichment Summary")
    print("=" * 80)

    # Count institutions with Wikidata
    total_institutions = len(institutions)
    institutions_with_wikidata = _count_wikidata(institutions)

    # Guard against an empty dataset (original raised ZeroDivisionError).
    coverage_pct = (
        institutions_with_wikidata / total_institutions * 100
        if total_institutions else 0.0
    )

    print(f"Total institutions: {total_institutions}")
    print(f"Institutions with Wikidata: {institutions_with_wikidata}")
    print(f"Coverage: {coverage_pct:.1f}%")
    print(f"Institutions enriched in this batch: {enriched_count}")
    print()
    print(f"Output saved to: {output_file}")
    print()

    # Report on target
    print("=" * 80)
    print("Target Progress")
    print("=" * 80)
    print("Target coverage: 70% (63/90)")
    print(f"Current coverage: {coverage_pct:.1f}% ({institutions_with_wikidata}/{total_institutions})")

    if coverage_pct >= 70.0:
        print("✓ TARGET REACHED!")
    else:
        remaining = 63 - institutions_with_wikidata
        print(f"⚠ Need {remaining} more matches to reach 70% target")
    print()
|
|
# Entry point: run the enrichment only when executed as a script,
# not when imported as a module.
if __name__ == '__main__':
    main()
|