- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
150 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Manual enrichment for Italian institutions with Wikidata identifiers.
|
|
|
|
Research findings:
|
|
1. Giovio Musaeum - Already has Wikidata (Q3868171)
|
|
2. European University Institute - Archives → Q1378099 (parent institution)
|
|
3. European University Institute - Library → Q1378099 (parent institution)
|
|
|
|
Note: EUI sub-units (Archives/Library) do not have separate Wikidata entities,
|
|
so both use the parent institution Q1378099.
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
|
|
def enrich_italian_institutions(
        input_path='data/instances/all/globalglam-20251111.yaml',
        output_path='data/instances/italy/it_institutions_enriched_manual.yaml'):
    """Apply hand-researched identifiers to Italian GLAM institutions.

    Loads the unified dataset from ``input_path``, selects institutions with
    an Italian location, merges in manually researched Wikidata/VIAF
    identifiers, coordinates and websites for the known gaps (the EUI
    Archives and Library, which reuse the parent institution Q1378099),
    reports Wikidata coverage, and writes the enriched Italian subset to
    ``output_path``.

    Args:
        input_path: YAML file containing the full unified dataset. Defaults
            to the previously hard-coded path, so existing callers see no
            change in behavior.
        output_path: Destination YAML file for the enriched Italian subset.
    """
    import os  # hoisted from mid-function; kept function-scoped as in the original

    print("=" * 80)
    print("🇮🇹 Italian Institutions Manual Enrichment")
    print("=" * 80)

    # Load unified dataset
    print("\n📂 Loading unified dataset...")
    with open(input_path, 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(all_institutions)} institutions")

    # Extract Italian institutions (any location with country code 'IT')
    italian = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'IT' for loc in inst.get('locations', []))
    ]
    print(f"\n🔍 Found {len(italian)} Italian institutions")

    # Manually researched enrichment data, keyed by institution id.
    enrichments = {
        'EUR-EUI0001': {  # European University Institute - Archives
            'wikidata': 'Q1378099',
            'viaf': '133087619',
            'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
            'website': 'https://www.eui.eu/Research/HistoricalArchivesOfEU',
            'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Archives'
        },
        'EUR-EUI0002': {  # European University Institute - Library
            'wikidata': 'Q1378099',
            'viaf': '133087619',
            'coordinates': {'latitude': 43.7524, 'longitude': 11.2947},
            'website': 'https://www.eui.eu/Research/Library',
            'note': 'Using parent institution Q1378099 (EUI) as no separate Wikidata entity exists for Library'
        }
    }

    print("\n🔄 Applying enrichments...")
    enriched_count = 0

    for inst in italian:
        inst_id = inst.get('id')

        if inst_id in enrichments:
            enrich_data = enrichments[inst_id]

            # Initialize identifiers list if missing
            if 'identifiers' not in inst:
                inst['identifiers'] = []

            # Add Wikidata identifier only if one is not already present
            existing_wd = [i for i in inst['identifiers'] if i.get('identifier_scheme') == 'Wikidata']
            if not existing_wd:
                inst['identifiers'].append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': enrich_data['wikidata'],
                    'identifier_url': f"https://www.wikidata.org/wiki/{enrich_data['wikidata']}"
                })
                print(f" ✅ Added Wikidata {enrich_data['wikidata']}: {inst['name']}")

            # Add VIAF identifier only if one is not already present
            existing_viaf = [i for i in inst['identifiers'] if i.get('identifier_scheme') == 'VIAF']
            if not existing_viaf and enrich_data.get('viaf'):
                inst['identifiers'].append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': enrich_data['viaf'],
                    'identifier_url': f"https://viaf.org/viaf/{enrich_data['viaf']}"
                })
                print(f" + VIAF {enrich_data['viaf']}")

            # Add Website only if one is not already present
            existing_website = [i for i in inst['identifiers'] if i.get('identifier_scheme') == 'Website']
            if not existing_website and enrich_data.get('website'):
                inst['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': enrich_data['website'],
                    'identifier_url': enrich_data['website']
                })
                print(f" + Website")

            # Add coordinates to the first location, without overwriting
            if inst.get('locations') and enrich_data.get('coordinates'):
                loc = inst['locations'][0]
                if 'latitude' not in loc:
                    loc.update(enrich_data['coordinates'])
                    print(f" + Coordinates")

            # Record provenance of this manual enrichment pass
            if 'provenance' not in inst:
                inst['provenance'] = {}

            inst['provenance']['last_enrichment_date'] = datetime.now(timezone.utc).isoformat()
            inst['provenance']['enrichment_method'] = 'Manual Wikidata research'

            # Append the research note to an existing description, if any
            if enrich_data.get('note'):
                if 'description' in inst:
                    inst['description'] = f"{inst['description']} [Wikidata note: {enrich_data['note']}]"

            enriched_count += 1

    print(f"\n 📊 Total enriched: {enriched_count}")

    # Calculate coverage
    italian_with_wd = sum(
        1 for inst in italian
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    print("\n" + "=" * 80)
    print("📊 Italian Institutions - Wikidata Coverage")
    print("=" * 80)
    print(f"Total Italian institutions: {len(italian)}")
    print(f"With Wikidata identifiers: {italian_with_wd}")
    # Guard against ZeroDivisionError when no Italian institutions were found
    coverage = (italian_with_wd / len(italian) * 100) if italian else 0.0
    print(f"Coverage: {coverage:.1f}%")

    # Require a non-empty set: 0 == 0 must not report spurious success
    if italian and italian_with_wd == len(italian):
        print("\n✅ SUCCESS: 100% Wikidata coverage achieved!")

    # Save enriched Italian institutions
    print(f"\n💾 Saving enriched Italian institutions to {output_path}...")

    # Derive the directory from output_path instead of hard-coding it
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(italian, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved")
    print("\n🔄 Next step: Run scripts/merge_it_enriched.py to merge back into unified dataset")
    print()
|
|
|
|
def main():
    """Script entry point: run the manual Italian enrichment pass."""
    enrich_italian_institutions()


if __name__ == '__main__':
    main()
|