- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
203 lines
6.3 KiB
Python
203 lines
6.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Extract VIAF Identifiers from Wikidata
|
|
|
|
For institutions that already have Wikidata IDs, query Wikidata to check if
|
|
they have VIAF identifiers (Property P214) that we can add to our records.
|
|
|
|
This is a quick win since we already have 7 Wikidata IDs confirmed.
|
|
|
|
Usage:
|
|
python scripts/extract_viaf_from_wikidata.py
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import time
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
|
|
def query_wikidata_for_viaf(wikidata_id: str) -> Optional[str]:
    """
    Query the Wikidata SPARQL endpoint for an entity's VIAF identifier.

    Looks up property P214 (VIAF ID) on the given entity and returns the
    first binding, if any.

    Args:
        wikidata_id: Wikidata Q-number (e.g., "Q501851")

    Returns:
        VIAF ID (numeric string) or None if not found
    """
    endpoint = "https://query.wikidata.org/sparql"

    query = f"""
    SELECT ?viaf WHERE {{
      wd:{wikidata_id} wdt:P214 ?viaf .
    }}
    """

    try:
        resp = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0 (heritage data research)'},
            timeout=10,
        )
        resp.raise_for_status()

        # WDQS JSON results live under results.bindings; an empty list
        # means the entity has no P214 statement.
        rows = resp.json().get('results', {}).get('bindings', [])
        return rows[0].get('viaf', {}).get('value') if rows else None

    except requests.exceptions.RequestException as e:
        print(f" Error querying Wikidata: {e}")
        return None
    except Exception as e:
        # Best-effort: any malformed/unexpected response shape is reported
        # and treated as "no VIAF found" rather than crashing the batch run.
        print(f" Error parsing Wikidata response: {e}")
        return None
|
|
|
|
|
|
def extract_viaf_from_wikidata(institutions: List[Dict]) -> Tuple[List[Dict], Dict]:
    """
    Extract VIAF identifiers from Wikidata for institutions with Wikidata IDs.

    Records without a Wikidata ID, or that already carry a VIAF identifier,
    are passed through untouched. For the rest, Wikidata is queried (P214)
    and, on success, a VIAF identifier plus provenance metadata is added to
    the record in place.

    Args:
        institutions: List of institution records

    Returns:
        Tuple of (enriched_institutions, statistics)
    """
    stats = {
        'total': len(institutions),
        'has_wikidata': 0,
        'already_has_viaf': 0,
        'viaf_found_in_wikidata': 0,
        'viaf_not_in_wikidata': 0,
        'no_wikidata': 0,
    }

    enriched = []

    for record in institutions:
        name = record.get('name', '')
        ids = record.get('identifiers', [])

        has_viaf = any(entry.get('identifier_scheme') == 'VIAF' for entry in ids)

        # First Wikidata identifier on the record, if any.
        wikidata_id = next(
            (entry.get('identifier_value')
             for entry in ids
             if entry.get('identifier_scheme') == 'Wikidata'),
            None,
        )

        if not wikidata_id:
            stats['no_wikidata'] += 1
            enriched.append(record)
            continue

        stats['has_wikidata'] += 1

        if has_viaf:
            print(f"✓ {name}: Already has VIAF identifier")
            stats['already_has_viaf'] += 1
            enriched.append(record)
            continue

        # Query Wikidata for VIAF
        print(f"\n🔍 Querying Wikidata {wikidata_id} for VIAF: {name}")
        viaf_id = query_wikidata_for_viaf(wikidata_id)

        if viaf_id:
            ids.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f'https://viaf.org/viaf/{viaf_id}',
            })
            record['identifiers'] = ids

            # Record where/when this VIAF ID came from.
            provenance = record.setdefault('provenance', {})
            provenance.setdefault('viaf_enrichment', {}).update({
                'method': 'Extracted from Wikidata P214',
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'wikidata_source': wikidata_id,
                'verified': True  # High confidence since it's from Wikidata
            })

            print(f"✅ Added VIAF identifier from Wikidata: {viaf_id}")
            stats['viaf_found_in_wikidata'] += 1
        else:
            print(f"❌ No VIAF identifier in Wikidata {wikidata_id}")
            stats['viaf_not_in_wikidata'] += 1

        enriched.append(record)

        # Rate limiting - be respectful to Wikidata
        time.sleep(0.5)

    return enriched, stats
|
|
|
|
|
|
def main():
    """Main execution function.

    Loads institution records from YAML, enriches them with VIAF
    identifiers pulled from Wikidata (P214), writes the enriched records
    to a new YAML file, and prints summary statistics.

    Fixes over the previous version:
      - yaml.safe_load returns None for an empty file, which made
        len(institutions) raise TypeError; normalize to [] first.
      - The coverage percentage divided by stats['total'], which raised
        ZeroDivisionError when no institutions were loaded; guard it.
    """
    input_file = 'data/instances/egypt_institutions_wikidata_corrected.yaml'
    output_file = 'data/instances/egypt_institutions_wikidata_viaf.yaml'

    print("="*70)
    print("Extract VIAF Identifiers from Wikidata for Egyptian Institutions")
    print("="*70)

    # Load institutions
    print(f"\nLoading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # safe_load yields None for an empty document; treat that as "no records".
    if not institutions:
        institutions = []

    print(f"Loaded {len(institutions)} institutions")

    # Extract VIAF from Wikidata
    enriched, stats = extract_viaf_from_wikidata(institutions)

    # Save enriched data
    print(f"\n{'='*70}")
    print(f"Saving enriched data to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(enriched, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    # Print statistics
    print(f"\n{'='*70}")
    print("VIAF Extraction Statistics (from Wikidata)")
    print(f"{'='*70}")
    print(f"Total institutions: {stats['total']}")
    print(f"Institutions with Wikidata IDs: {stats['has_wikidata']}")
    print(f"Already had VIAF: {stats['already_has_viaf']}")
    print(f"VIAF found in Wikidata: {stats['viaf_found_in_wikidata']}")
    print(f"No VIAF in Wikidata: {stats['viaf_not_in_wikidata']}")
    print(f"No Wikidata ID: {stats['no_wikidata']}")

    total_viaf = stats['viaf_found_in_wikidata'] + stats['already_has_viaf']
    if stats['total']:
        print(f"\n📊 Total VIAF coverage: {total_viaf}/{stats['total']} "
              f"({100 * total_viaf / stats['total']:.1f}%)")
    else:
        # Avoid ZeroDivisionError when the input file was empty.
        print("\n📊 Total VIAF coverage: 0/0 (no institutions loaded)")

    print(f"\n{'='*70}")
    print("✅ VIAF extraction from Wikidata complete!")
    print(f"{'='*70}")
|