- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
111 lines
4.3 KiB
Python
111 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
Chilean GLAM Batch 17 Enrichment - Library Parent Organization Linkage

Strategy: Apply municipality/parent organization Q-numbers to municipal libraries
where we have confirmed parent organization Wikidata entries.

Batch 17 Target:
- Biblioteca Pública Federico Varela → Q3763 (Chañaral municipality)

Rationale:
- Federico Varela is a municipal public library in Chañaral
- Chañaral municipality has Wikidata entry Q3763
- Parent organization enrichment is appropriate for municipal libraries
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Repository root
|
|
# Grandparent of this file's path, i.e. the directory above the one that holds
# this script. enrich_batch_17() builds its input and output paths from it.
REPO_ROOT = Path(__file__).parent.parent
|
|
|
|
def _apply_batch_17_enrichment(inst, enrichment):
    """Attach the Wikidata identifier and provenance stamp to one record.

    Args:
        inst: Institution record (a dict loaded from the YAML file); mutated
            in place.
        enrichment: Entry from the batch 17 table with the keys ``q_number``,
            ``wikidata_name``, ``match_reason`` and ``enrichment_type``.

    Returns:
        bool: True when a Wikidata identifier was added and the provenance
        was stamped; False when the record already carried a Wikidata
        identifier, in which case no identifier or provenance stamp is added.
    """
    # Treat a missing or null ``identifiers`` entry as an empty list.
    identifiers = inst.get('identifiers')
    if identifiers is None:
        identifiers = inst['identifiers'] = []

    # Records that already have a Wikidata identifier are skipped entirely so
    # that re-running the batch does not re-stamp provenance or pile up
    # duplicate notes.
    if any(entry.get('identifier_scheme') == 'Wikidata' for entry in identifiers):
        return False

    q_number = enrichment['q_number']
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}',
    })

    # Create the provenance mapping on demand instead of failing with a
    # KeyError after the identifier has already been appended.
    provenance = inst.get('provenance')
    if provenance is None:
        provenance = inst['provenance'] = {}

    provenance['data_tier'] = 'TIER_3_CROWD_SOURCED'
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_batch'] = 17
    provenance['wikidata_match_confidence'] = 'MEDIUM'
    provenance['wikidata_match_reason'] = enrichment['match_reason']
    provenance['wikidata_name'] = enrichment['wikidata_name']

    notes = provenance.get('notes')
    if notes is None:
        notes = provenance['notes'] = []
    notes.append(
        f"Batch 17: {enrichment['enrichment_type']} enrichment - "
        f"{enrichment['wikidata_name']} (parent organization for municipal library)"
    )
    return True


def enrich_batch_17():
    """Apply parent organization Q-numbers to municipal libraries.

    Reads the batch 16 institution list from
    ``REPO_ROOT/data/instances/chile``, enriches every record whose ``id`` is
    listed in the batch 17 table and that has no Wikidata identifier yet, and
    writes the complete list to the batch 17 YAML file in the same directory.
    A summary is printed to stdout.

    Returns:
        int: Number of institution records enriched in this run.

    Raises:
        FileNotFoundError: If the batch 16 input file does not exist.
    """
    data_dir = REPO_ROOT / 'data' / 'instances' / 'chile'
    input_file = data_dir / 'chilean_institutions_batch16_enriched.yaml'
    output_file = data_dir / 'chilean_institutions_batch17_enriched.yaml'

    # Load current data
    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document loads as None; treat it as "no institutions".
        institutions = yaml.safe_load(f) or []

    # Batch 17 enrichments: Municipal library parent organizations
    enrichments = {
        'https://w3id.org/heritage/custodian/cl/l-biblioteca-p-blica-federico-va-0015': {
            'q_number': 'Q3763',
            'wikidata_name': 'Chañaral, commune in Atacama Region, Chile',
            'match_reason': 'Parent organization - Municipal public library in Chañaral',
            'enrichment_type': 'parent_organization',
        },
    }

    # Apply enrichments
    enrichment_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        enrichment = enrichments.get(inst_id)
        if enrichment is None or not _apply_batch_17_enrichment(inst, enrichment):
            continue

        enrichment_count += 1
        # Fall back to the record id so a nameless record cannot abort the run.
        print(f"✓ Enriched: {inst.get('name', inst_id)}")
        print(f" Q-number: {enrichment['q_number']} ({enrichment['enrichment_type']})")
        print(f" Reason: {enrichment['match_reason']}\n")

    # Save updated data
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(f"\n{'='*60}")
    print("Batch 17 Enrichment Complete")
    print(f"{'='*60}")
    print(f"Institutions enriched: {enrichment_count}")
    print(f"Output: {output_file}")
    print("\nNext: Calculate coverage statistics to assess impact")

    return enrichment_count
|
|
|
|
if __name__ == '__main__':
    # Run the batch; a zero count is reported as a warning with the two
    # likely causes spelled out for the operator.
    if enrich_batch_17() == 0:
        for message in (
            "\n⚠️ WARNING: No institutions were enriched!",
            "This may indicate the target institutions already have Wikidata identifiers",
            "or the institution IDs have changed.",
        ):
            print(message)
|