glam/scripts/enrich_it_manual.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

150 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Manual enrichment for Italian institutions with Wikidata identifiers.
Research findings:
1. Giovio Musaeum - Already has Wikidata (Q3868171)
2. European University Institute - Archives → Q1378099 (parent institution)
3. European University Institute - Library → Q1378099 (parent institution)
Note: EUI sub-units (Archives/Library) do not have separate Wikidata entities,
so both use the parent institution Q1378099.
"""
import os
from datetime import datetime, timezone

import yaml
def enrich_italian_institutions():
    """Manually enrich Italian institutions with Wikidata/VIAF identifiers.

    Loads the unified GLAM dataset, applies hand-researched identifiers,
    coordinates and websites to the two EUI sub-units (which share the
    parent institution's Wikidata entity Q1378099 because no separate
    entities exist for the Archives/Library), reports Wikidata coverage
    over all Italian institutions, and saves the enriched Italian subset
    to its own YAML file for a later merge step.
    """
    print("=" * 80)
    print("🇮🇹 Italian Institutions Manual Enrichment")
    print("=" * 80)

    # Load unified dataset.
    print("\n📂 Loading unified dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; normalize to a list so
        # len() and iteration below cannot crash.
        all_institutions = yaml.safe_load(f) or []
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Extract Italian institutions.
    italian = _select_italian(all_institutions)
    print(f"\n🔍 Found {len(italian)} Italian institutions")

    # Hand-researched enrichment data, keyed by institution id.
    enrichments = {
        'EUR-EUI0001': {  # European University Institute - Archives
            'wikidata': 'Q1378099',
            'viaf': '133087619',
            'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
            'website': 'https://www.eui.eu/Research/HistoricalArchivesOfEU',
            'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Archives'
        },
        'EUR-EUI0002': {  # European University Institute - Library
            'wikidata': 'Q1378099',
            'viaf': '133087619',
            'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
            'website': 'https://www.eui.eu/Research/Library',
            'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Library'
        }
    }

    print("\n🔄 Applying enrichments...")
    enriched_count = 0
    for inst in italian:
        data = enrichments.get(inst.get('id'))
        if data is None:
            continue
        _apply_enrichment(inst, data)
        enriched_count += 1
    print(f"\n 📊 Total enriched: {enriched_count}")

    _report_coverage(italian)

    # Save enriched Italian institutions (the mutated subset only; the
    # merge back into the unified dataset is a separate script).
    output_path = 'data/instances/italy/it_institutions_enriched_manual.yaml'
    print(f"\n💾 Saving enriched Italian institutions to {output_path}...")
    os.makedirs('data/instances/italy', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(italian, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(" ✅ Saved")
    print("\n🔄 Next step: Run scripts/merge_it_enriched.py to merge back into unified dataset")
    print()


def _select_italian(institutions):
    """Return the institutions having at least one location with country 'IT'."""
    return [
        inst for inst in institutions
        if any(loc.get('country') == 'IT' for loc in inst.get('locations', []))
    ]


def _apply_enrichment(inst, data):
    """Apply identifiers, coordinates, provenance and a note to *inst* in place.

    Existing identifiers of the same scheme and existing coordinates are
    never overwritten; enrichment only fills gaps.
    """
    identifiers = inst.setdefault('identifiers', [])

    def has_scheme(scheme):
        # True if an identifier with this scheme already exists.
        return any(i.get('identifier_scheme') == scheme for i in identifiers)

    # Add Wikidata identifier if missing.
    if not has_scheme('Wikidata'):
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': data['wikidata'],
            'identifier_url': f"https://www.wikidata.org/wiki/{data['wikidata']}"
        })
        print(f" ✅ Added Wikidata {data['wikidata']}: {inst['name']}")

    # Add VIAF identifier if missing.
    if data.get('viaf') and not has_scheme('VIAF'):
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': data['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{data['viaf']}"
        })
        print(f" + VIAF {data['viaf']}")

    # Add Website identifier if missing.
    if data.get('website') and not has_scheme('Website'):
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': data['website'],
            'identifier_url': data['website']
        })
        print(f" + Website")

    # Add coordinates to the primary location only if it has none yet.
    if inst.get('locations') and data.get('coordinates'):
        loc = inst['locations'][0]
        if 'latitude' not in loc:
            loc.update(data['coordinates'])
            print(f" + Coordinates")

    # Record enrichment provenance (timezone-aware UTC timestamp).
    provenance = inst.setdefault('provenance', {})
    provenance['last_enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_method'] = 'Manual Wikidata research'

    # Append the research note to the description; create a description
    # when one is missing so the note is never silently dropped (the
    # original code lost the note in that case).
    note = data.get('note')
    if note:
        if 'description' in inst:
            inst['description'] = f"{inst['description']} [Wikidata note: {note}]"
        else:
            inst['description'] = f"[Wikidata note: {note}]"


def _report_coverage(italian):
    """Print Wikidata-identifier coverage statistics for the Italian subset."""
    with_wd = sum(
        1 for inst in italian
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )
    print("\n" + "=" * 80)
    print("📊 Italian Institutions - Wikidata Coverage")
    print("=" * 80)
    print(f"Total Italian institutions: {len(italian)}")
    print(f"With Wikidata identifiers: {with_wd}")
    # Guard against ZeroDivisionError when no Italian institutions exist.
    coverage = (with_wd / len(italian) * 100) if italian else 0.0
    print(f"Coverage: {coverage:.1f}%")
    if italian and with_wd == len(italian):
        print("\n✅ SUCCESS: 100% Wikidata coverage achieved!")
# Script entry point: run the enrichment only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    enrich_italian_institutions()