- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
113 lines
4.2 KiB
Python
#!/usr/bin/env python3
"""
Merge US Wikidata enrichment into unified global dataset.
"""

import yaml
from datetime import datetime, timezone
from pathlib import Path
def merge_us_enrichment(us_enriched_path=None, unified_path=None):
    """Merge manually enriched US institutions into the unified global dataset.

    Loads the enriched US institutions file, matches records in the unified
    dataset by institution name (US-located records only), and copies over
    Wikidata/VIAF identifiers that are not already present. When anything is
    merged, provenance metadata is updated, a ``.yaml.backup`` copy of the
    unified file is written, and the unified dataset is rewritten in place.

    Args:
        us_enriched_path: Path to the enriched US institutions YAML file.
            Defaults to the project-standard location.
        unified_path: Path to the unified global dataset YAML file; rewritten
            in place when identifiers are merged. Defaults to the
            project-standard location.

    Returns:
        int: Number of institutions that received new identifiers.
    """
    import shutil  # only needed when a backup is written

    us_enriched_path = Path(
        us_enriched_path
        or 'data/instances/united_states/us_institutions_enriched_manual.yaml'
    )
    unified_path = Path(unified_path or 'data/instances/all/globalglam-20251111.yaml')

    # Load enriched US data; safe_load returns None for an empty file.
    with open(us_enriched_path, 'r', encoding='utf-8') as f:
        us_enriched = yaml.safe_load(f) or []

    # Load unified dataset
    with open(unified_path, 'r', encoding='utf-8') as f:
        unified = yaml.safe_load(f) or []

    print(f"Loaded {len(us_enriched)} enriched US institutions")
    print(f"Loaded {len(unified)} institutions from unified dataset")

    # Create lookup by name for US-located enriched institutions only
    us_enriched_lookup = {}
    for inst in us_enriched:
        locations = inst.get('locations', [])
        if locations and locations[0].get('country') == 'US':
            us_enriched_lookup[inst.get('name')] = inst

    print(f"\nCreated lookup for {len(us_enriched_lookup)} US institutions")

    # Merge enrichment data into unified dataset
    merged_count = 0
    updated_records = []

    for inst in unified:
        locations = inst.get('locations', [])
        if not locations or locations[0].get('country') != 'US':
            continue

        name = inst.get('name')
        if name not in us_enriched_lookup:
            continue

        # Found matching institution
        enriched = us_enriched_lookup[name]

        # setdefault (not .get with a default) so that a record lacking an
        # 'identifiers' key actually keeps the identifiers appended below.
        existing_identifiers = inst.setdefault('identifiers', [])
        existing_schemes = {
            ident.get('identifier_scheme') for ident in existing_identifiers
        }

        # Add Wikidata and VIAF if not present
        added_identifiers = []
        for ident in enriched.get('identifiers', []):
            scheme = ident.get('identifier_scheme')
            if scheme in ('Wikidata', 'VIAF') and scheme not in existing_schemes:
                existing_identifiers.append(ident)
                added_identifiers.append(scheme)

        if not added_identifiers:
            continue

        # Update provenance — setdefault again so the update lands on the
        # record itself even when it had no 'provenance' key.
        provenance = inst.setdefault('provenance', {})
        old_method = provenance.get('extraction_method', '')
        if 'Manual Wikidata enrichment' not in old_method:
            # Extract the Wikidata description from the enriched version
            enriched_method = enriched.get('provenance', {}).get('extraction_method', '')
            if 'Manual Wikidata enrichment:' in enriched_method:
                wikidata_part = enriched_method.split('Manual Wikidata enrichment:')[1].strip()
                provenance['extraction_method'] = (
                    f"{old_method} + Manual Wikidata enrichment: {wikidata_part}"
                )

        provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
        provenance['wikidata_verified'] = True

        merged_count += 1
        updated_records.append({
            'name': name,
            'added': added_identifiers,
        })

        print(f"✓ Merged {name}: added {', '.join(added_identifiers)}")

    # Save updated unified dataset
    if merged_count > 0:
        backup_path = unified_path.with_suffix('.yaml.backup')
        shutil.copy(unified_path, backup_path)
        print(f"\n✓ Created backup: {backup_path}")

        with open(unified_path, 'w', encoding='utf-8') as f:
            yaml.dump(unified, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

        print(f"\n✓ Updated unified dataset: {unified_path}")
        print(f"✓ Merged {merged_count} US institutions")

        # Print summary
        print("\n" + "=" * 60)
        print("MERGE SUMMARY")
        print("=" * 60)
        for record in updated_records:
            print(f" {record['name']}")
            print(f" Added: {', '.join(record['added'])}")
    else:
        print("\n⚠ No institutions merged (already up to date)")

    return merged_count
if __name__ == '__main__':
    merge_us_enrichment()