glam/scripts/enrich_bulgarian_wikidata.py
2025-11-19 23:25:22 +01:00

238 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
Enrich Bulgarian heritage institutions with Wikidata Q-numbers and metadata.
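Designed to query the Wikidata SPARQL endpoint for Bulgarian libraries by:
1. ISIL code (P791 property)
2. Fuzzy name matching when no ISIL match is found (planned; not yet implemented)
NOTE: The SPARQL lookup is currently a stub that always reports "no match".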
Enriches LinkML records with:
- Wikidata Q-numbers
- Canonical names (if missing)
- VIAF IDs
- Founding dates
- Parent organizations
"""
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import yaml
# Add project root to path.
# resolve() makes the path absolute, so the project root (and the data file
# locations derived from it) do not depend on the current working directory
# when __file__ happens to be a relative path.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))
# =============================================================================
# Configuration
# =============================================================================
# Registry read by main().
INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
# Enriched copy written by main(); the input file is never modified.
OUTPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries_enriched.yaml"
# =============================================================================
# Wikidata Enrichment
# =============================================================================
def _build_isil_query(isil_code: str) -> str:
    """Return the SPARQL query that looks up one item by ISIL code (P791)."""
    # Escape the characters that would terminate or corrupt a double-quoted
    # SPARQL string literal, so an unusual code in the input data cannot
    # break (or inject into) the query.
    literal = (
        str(isil_code)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )
    return f"""
    SELECT ?item ?itemLabel ?viaf ?website ?inception WHERE {{
      ?item wdt:P791 "{literal}" .
      OPTIONAL {{ ?item wdt:P214 ?viaf }}
      OPTIONAL {{ ?item wdt:P856 ?website }}
      OPTIONAL {{ ?item wdt:P571 ?inception }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "bg,en" }}
    }}
    LIMIT 1
    """


def query_wikidata_by_isil(isil_code: str) -> Optional[Dict[str, Any]]:
    """
    Query Wikidata for institution by ISIL code.

    NOTE: This is a mock implementation. Real implementation would use:
    wikidata-authenticated_execute_sparql MCP tool.

    Args:
        isil_code: ISIL identifier to look up (matched against P791).

    Returns:
        Currently always None (no Wikidata match). A real implementation
        would return a dict with: q_number, label, viaf, website,
        founding_date.
    """
    # The query is built (and its literal escaped) even though it is not
    # executed yet, so the template stays valid until the call is wired in.
    sparql_query = _build_isil_query(isil_code)  # noqa: F841
    # In real implementation, call:
    #   result = wikidata_authenticated_execute_sparql(sparql_query)
    # Parse JSON result and extract Q-number, label, etc.
    # For now, return None (no Wikidata match)
    return None
def _identifiers_for(inst: Dict[str, Any], scheme: str) -> List[Dict[str, Any]]:
    """Return the well-formed identifier entries of *inst* that use *scheme*."""
    # `identifiers:` may be absent or an explicit YAML null; entries may be
    # incomplete. Anything that is not a dict with a matching scheme is ignored.
    return [
        ident
        for ident in (inst.get('identifiers') or [])
        if isinstance(ident, dict) and ident.get('identifier_scheme') == scheme
    ]


def enrich_with_wikidata(
    institutions: List[Dict[str, Any]],
    *,
    lookup: Optional[Callable[[str], Optional[Dict[str, Any]]]] = None,
) -> List[Dict[str, Any]]:
    """
    Enrich institutions with Wikidata Q-numbers and metadata.

    Strategy:
    1. Query Wikidata by ISIL code (P791)
    2. If found, add Q-number to identifiers
    3. If name is placeholder ("Library BG-XXXXXXX"), replace with Wikidata label
    4. Add VIAF ID if available (and none is recorded yet)
    5. Add founding date if available (and none is recorded yet)

    Records without an ISIL identifier, or that already carry a Wikidata
    identifier, are skipped without being queried. Incomplete records or
    lookup results (missing keys, null identifier lists) are skipped rather
    than raising.

    Args:
        institutions: Institution records; they are modified in place.
        lookup: Callable taking an ISIL code and returning a result dict
            (keys read here: q_number, label, viaf, founding_date) or None
            when there is no match. Defaults to query_wikidata_by_isil.

    Returns:
        The same list object that was passed in.

    NOTE: With the default lookup no results are returned at the moment.
    Future improvement: Fuzzy name matching for institutions without ISIL
    in Wikidata.
    """
    if lookup is None:
        lookup = query_wikidata_by_isil

    print("=" * 70)
    print("WIKIDATA ENRICHMENT")
    print("=" * 70)
    print()

    enriched_count = 0
    name_improved_count = 0
    queried_count = 0

    for inst in institutions:
        if not isinstance(inst, dict):
            continue

        # Extract ISIL code (first entry that actually carries a value)
        isil_codes = [
            ident['identifier_value']
            for ident in _identifiers_for(inst, 'ISIL')
            if ident.get('identifier_value')
        ]
        if not isil_codes:
            continue
        isil_code = isil_codes[0]

        # Check if Wikidata already present
        if _identifiers_for(inst, 'Wikidata'):
            continue

        # Query Wikidata by ISIL code
        queried_count += 1
        if queried_count % 10 == 0:
            print(f"Queried {queried_count} institutions so far...")
        wikidata_result = lookup(isil_code)
        if not wikidata_result:
            # No Wikidata match found
            continue

        q_number = wikidata_result.get('q_number')
        if not q_number:
            # A result without a Q-number cannot be recorded as an identifier.
            continue

        # Enrich with Wikidata Q-number
        inst['identifiers'].append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': q_number,
            'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
        })
        enriched_count += 1

        # Update name if placeholder - but only when a label is available,
        # so a placeholder is never replaced by nothing.
        label = wikidata_result.get('label')
        current_name = inst.get('name')
        if label and isinstance(current_name, str) and current_name.startswith('Library BG-'):
            inst['name'] = label
            name_improved_count += 1

        # Add VIAF if available; keep an existing VIAF entry instead of
        # appending a duplicate.
        viaf = wikidata_result.get('viaf')
        if viaf and not _identifiers_for(inst, 'VIAF'):
            inst['identifiers'].append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf,
                'identifier_url': f'https://viaf.org/viaf/{viaf}'
            })

        # Add founding date if available
        if wikidata_result.get('founding_date') and not inst.get('founded'):
            inst['founded'] = wikidata_result['founding_date']

        print(f" ✓ Enriched {inst.get('name')} with {q_number}")

    print()
    print("✓ Enrichment complete:")
    print(f" Institutions queried: {queried_count}")
    print(f" Institutions enriched with Wikidata: {enriched_count}")
    print(f" Placeholder names improved: {name_improved_count}")
    print()
    if enriched_count == 0:
        print("⚠ No Wikidata matches found.")
        print(" This is expected - most Bulgarian libraries lack ISIL codes in Wikidata.")
        print(" Future improvement: Implement fuzzy name matching.")
    return institutions
# =============================================================================
# Main Workflow
# =============================================================================
def _load_institutions(path: Path) -> List[Dict[str, Any]]:
    """Read the institution list from *path*, with or without a comment header."""
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()
    # Skip header comments when there are any. str.find (not str.index) is
    # used so that a file starting directly with the first record is parsed
    # whole instead of raising ValueError.
    yaml_start = content.find('\n- id:')
    document = content[yaml_start:] if yaml_start != -1 else content
    institutions = yaml.safe_load(document)
    if institutions is None:
        # Empty document: treat as "no institutions" rather than failing later.
        return []
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a YAML list of institutions in {path}, "
            f"got {type(institutions).__name__}"
        )
    return institutions


def _percent(part: int, total: int) -> float:
    """Return *part* as a percentage of *total*; 0.0 when *total* is zero."""
    return part / total * 100 if total else 0.0


def main():
    """
    Main enrichment workflow.

    Loads the institutions from INPUT_FILE, prints how many have placeholder
    names and Wikidata identifiers, runs enrich_with_wikidata on them, and
    writes the result (preceded by a generated comment header) to OUTPUT_FILE.

    Raises:
        ValueError: If INPUT_FILE does not contain a YAML list.
    """
    print("=" * 70)
    print("Bulgarian ISIL Registry - Wikidata Enrichment")
    print("=" * 70)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    institutions = _load_institutions(INPUT_FILE)
    total = len(institutions)
    print(f"Loaded {total} institutions")
    print()

    # Analyze current state; records without a name or identifiers simply
    # do not count towards either figure.
    placeholder_names = sum(
        1 for inst in institutions
        if str(inst.get('name') or '').startswith('Library BG-')
    )
    with_wikidata = sum(
        1 for inst in institutions
        if any(
            isinstance(ident, dict) and ident.get('identifier_scheme') == 'Wikidata'
            for ident in (inst.get('identifiers') or [])
        )
    )
    print("Current State:")
    print(f" Institutions with placeholder names: {placeholder_names} ({_percent(placeholder_names, total):.1f}%)")
    print(f" Institutions with Wikidata Q-numbers: {with_wikidata} ({_percent(with_wikidata, total):.1f}%)")
    print()

    # Enrich with Wikidata
    enriched_institutions = enrich_with_wikidata(institutions)

    # Export enriched data
    print()
    print(f"Exporting enriched data to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Enriched with Wikidata metadata\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write(f'# Total institutions: {len(enriched_institutions)}\n')
        f.write('\n')
        yaml.dump(enriched_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Exported {len(enriched_institutions)} enriched institutions")
    print()
    print("=" * 70)
    print("✓ Enrichment Complete!")
    print("=" * 70)
    print()
    print("NOTE: This is a placeholder implementation.")
    print("To fully implement Wikidata enrichment:")
    print("1. Use wikidata-authenticated MCP tool for SPARQL queries")
    print("2. Query by ISIL code (wdt:P791)")
    print("3. Fuzzy match names for institutions without ISIL in Wikidata")
    print("4. Extract Q-numbers, VIAF IDs, founding dates, parent orgs")
    print("5. Update placeholder names with canonical Wikidata labels")
# Run the enrichment workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()