glam/scripts/extract_viaf_from_wikidata.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

203 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Extract VIAF Identifiers from Wikidata
For institutions that already have Wikidata IDs, query Wikidata to check if
they have VIAF identifiers (Property P214) that we can add to our records.
This is a quick win since we already have 7 Wikidata IDs confirmed.
Usage:
python scripts/extract_viaf_from_wikidata.py
"""
import yaml
import requests
import time
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
def query_wikidata_for_viaf(wikidata_id: str) -> Optional[str]:
    """
    Query the Wikidata SPARQL endpoint to get the VIAF ID for an entity.

    Args:
        wikidata_id: Wikidata Q-number (e.g., "Q501851")

    Returns:
        VIAF ID (numeric string) or None if not found.
        NOTE(review): network/parse errors also return None, so callers
        cannot distinguish "no VIAF" from "query failed".
    """
    endpoint = "https://query.wikidata.org/sparql"
    # wdt:P214 is the Wikidata property for VIAF identifiers.
    query = f"""
    SELECT ?viaf WHERE {{
      wd:{wikidata_id} wdt:P214 ?viaf .
    }}
    """
    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0 (heritage data research)'},
            timeout=10
        )
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f" Error querying Wikidata: {e}")
        return None
    except ValueError as e:
        # response.json() raises a ValueError subclass on malformed JSON;
        # narrowed from the original blanket `except Exception`.
        print(f" Error parsing Wikidata response: {e}")
        return None
    # SPARQL JSON results: {"results": {"bindings": [{"viaf": {"value": ...}}]}}
    bindings = data.get('results', {}).get('bindings', [])
    if not bindings:
        return None
    # Take the first binding; P214 is effectively single-valued here.
    return bindings[0].get('viaf', {}).get('value')
def extract_viaf_from_wikidata(institutions: List[Dict]) -> Tuple[List[Dict], Dict]:
    """
    Enrich institution records with VIAF identifiers pulled from Wikidata.

    Records that carry a Wikidata Q-number but no VIAF identifier are
    looked up via SPARQL; successful matches gain a VIAF identifier entry
    plus provenance metadata. Records are mutated in place.

    Args:
        institutions: List of institution record dicts ('name', 'identifiers', ...).

    Returns:
        Tuple of (enriched_institutions, statistics).
    """
    stats = {
        'total': len(institutions),
        'has_wikidata': 0,
        'already_has_viaf': 0,
        'viaf_found_in_wikidata': 0,
        'viaf_not_in_wikidata': 0,
        'no_wikidata': 0,
    }
    enriched: List[Dict] = []

    for record in institutions:
        label = record.get('name', '')
        ids = record.get('identifiers', [])

        viaf_present = any(entry.get('identifier_scheme') == 'VIAF' for entry in ids)

        # Locate the Wikidata Q-number, if any.
        qid = next(
            (entry.get('identifier_value') for entry in ids
             if entry.get('identifier_scheme') == 'Wikidata'),
            None,
        )

        if not qid:
            stats['no_wikidata'] += 1
            enriched.append(record)
            continue

        stats['has_wikidata'] += 1

        if viaf_present:
            print(f"{label}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            enriched.append(record)
            continue

        print(f"\n🔍 Querying Wikidata {qid} for VIAF: {label}")
        viaf_id = query_wikidata_for_viaf(qid)

        if viaf_id:
            ids.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}',
            })
            record['identifiers'] = ids
            # Record provenance for the enrichment.
            provenance = record.setdefault('provenance', {})
            provenance.setdefault('viaf_enrichment', {}).update({
                'method': 'Extracted from Wikidata P214',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'wikidata_source': qid,
                'verified': True,  # High confidence since it's from Wikidata
            })
            print(f"✅ Added VIAF identifier from Wikidata: {viaf_id}")
            stats['viaf_found_in_wikidata'] += 1
        else:
            print(f"❌ No VIAF identifier in Wikidata {qid}")
            stats['viaf_not_in_wikidata'] += 1

        enriched.append(record)
        # Rate limiting - be respectful to Wikidata
        time.sleep(0.5)

    return enriched, stats
def main():
    """Load institutions from YAML, enrich with VIAF IDs from Wikidata,
    save the result, and print coverage statistics."""
    input_file = 'data/instances/egypt_institutions_wikidata_corrected.yaml'
    output_file = 'data/instances/egypt_institutions_wikidata_viaf.yaml'

    print("="*70)
    print("Extract VIAF Identifiers from Wikidata for Egyptian Institutions")
    print("="*70)

    # Load institutions (expects a YAML list of record dicts)
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"Loaded {len(institutions)} institutions")

    # Extract VIAF from Wikidata
    enriched, stats = extract_viaf_from_wikidata(institutions)

    # Save enriched data
    print(f"\n{'='*70}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    print(f"\n{'='*70}")
    print("VIAF Extraction Statistics (from Wikidata)")
    print(f"{'='*70}")
    print(f"Total institutions: {stats['total']}")
    print(f"Institutions with Wikidata IDs: {stats['has_wikidata']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found in Wikidata: {stats['viaf_found_in_wikidata']}")
    print(f"No VIAF in Wikidata: {stats['viaf_not_in_wikidata']}")
    print(f"No Wikidata ID: {stats['no_wikidata']}")

    total_viaf = stats['viaf_found_in_wikidata'] + stats['already_has_viaf']
    # Guard against an empty input file (ZeroDivisionError in the original)
    coverage = 100 * total_viaf / stats['total'] if stats['total'] else 0.0
    print(f"\n📊 Total VIAF coverage: {total_viaf}/{stats['total']} "
          f"({coverage:.1f}%)")

    print(f"\n{'='*70}")
    print("✅ VIAF extraction from Wikidata complete!")
    print(f"{'='*70}")
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()