#!/usr/bin/env python3
"""
Enrich Bulgarian heritage institutions with Wikidata Q-numbers and metadata.

Designed to use the Wikidata SPARQL endpoint to look up Bulgarian libraries by:
1. ISIL code (P791 property)
2. Fuzzy name matching when the ISIL code is not found (planned, not yet implemented)

Enriches LinkML records with:
- Wikidata Q-numbers
- Canonical names (if missing)
- VIAF IDs
- Founding dates
- Parent organizations (planned, not yet implemented)

NOTE: The Wikidata lookup in this script is currently a placeholder that
returns no matches, so running it copies the input records to the output
file unchanged.
"""
|
|
|
|
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional

import yaml
|
|
|
|
# Add project root to path.
# resolve() makes the script path absolute first: with a bare relative
# __file__ such as "script.py", both .parent and .parent.parent evaluate to
# ".", which would point at the script's own directory instead of the
# project root one level above it.
project_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(project_root))

# =============================================================================
# Configuration
# =============================================================================

# Source records and the destination for the enriched copy, both relative to
# the project root.
INPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries.yaml"
OUTPUT_FILE = project_root / "data/instances/bulgaria_isil_libraries_enriched.yaml"
|
|
|
|
|
|
# =============================================================================
|
|
# Wikidata Enrichment
|
|
# =============================================================================
|
|
|
|
def query_wikidata_by_isil(isil_code: str) -> Optional[Dict[str, Any]]:
    """
    Query Wikidata for an institution by ISIL code.

    NOTE: This is a mock implementation. It only builds the SPARQL query and
    never sends it, so it always returns None. A real implementation would use
    the wikidata-authenticated_execute_sparql MCP tool.

    Args:
        isil_code: ISIL identifier to match against Wikidata property P791.

    Returns:
        Always None for now (no Wikidata match). The real implementation is
        expected to return a dict with: q_number, label, viaf, website,
        founding_date.
    """
    # Escape characters that would terminate or break the SPARQL string
    # literal, so an unusual code cannot malform the query or inject extra
    # query text. Backslashes go first so later escapes are not doubled.
    safe_isil = (
        str(isil_code)
        .replace('\\', '\\\\')
        .replace('"', '\\"')
        .replace('\n', '\\n')
        .replace('\r', '\\r')
    )

    # SPARQL query template (kept ready for the real implementation)
    sparql_query = f"""
        SELECT ?item ?itemLabel ?viaf ?website ?inception WHERE {{
          ?item wdt:P791 "{safe_isil}" .
          OPTIONAL {{ ?item wdt:P214 ?viaf }}
          OPTIONAL {{ ?item wdt:P856 ?website }}
          OPTIONAL {{ ?item wdt:P571 ?inception }}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "bg,en" }}
        }}
        LIMIT 1
    """

    # In real implementation, call:
    # result = wikidata_authenticated_execute_sparql(sparql_query)
    # Parse JSON result and extract Q-number, label, etc.

    # For now, return None (no Wikidata match)
    return None
|
|
|
|
|
|
def _identifier_values(inst: Dict[str, Any], scheme: str) -> List[Any]:
    """Return the non-empty identifier values of *inst* recorded under *scheme*.

    Tolerates a missing or null ``identifiers`` entry and skips malformed
    (non-dict) items instead of raising.
    """
    return [
        ident['identifier_value']
        for ident in (inst.get('identifiers') or [])
        if isinstance(ident, dict)
        and ident.get('identifier_scheme') == scheme
        and ident.get('identifier_value')
    ]


def enrich_with_wikidata(
    institutions: List[Dict[str, Any]],
    *,
    lookup: Optional[Callable[[str], Optional[Dict[str, Any]]]] = None,
) -> List[Dict[str, Any]]:
    """
    Enrich institutions with Wikidata Q-numbers and metadata.

    Strategy:
    1. Query Wikidata by ISIL code (P791)
    2. If found, add Q-number to identifiers
    3. If name is missing or a placeholder ("Library BG-XXXXXXX"), replace it
       with the Wikidata label
    4. Add VIAF ID if available and not already recorded
    5. Add founding date if available and none is recorded yet

    Institutions without an ISIL identifier, or that already carry a Wikidata
    identifier, are left untouched and are not queried.

    NOTE: The default lookup, query_wikidata_by_isil, is a placeholder that
    always returns None, so no institution is enriched unless a working
    lookup is passed in.

    Future improvement: Fuzzy name matching for institutions without ISIL in Wikidata.

    Args:
        institutions: Institution records; each is modified in place.
        lookup: Optional callable taking an ISIL code and returning either
            None (no match) or a dict with ``q_number`` and optionally
            ``label``, ``viaf`` and ``founding_date``. Defaults to
            query_wikidata_by_isil.

    Returns:
        The same list object that was passed in.
    """
    # Resolve the default at call time so a supplied lookup fully replaces it.
    lookup_fn = lookup if lookup is not None else query_wikidata_by_isil

    print("=" * 70)
    print("WIKIDATA ENRICHMENT")
    print("=" * 70)
    print()

    enriched_count = 0
    name_improved_count = 0
    queried_count = 0

    for inst in institutions:
        # Extract ISIL code (the first non-empty one wins)
        isil_codes = _identifier_values(inst, 'ISIL')
        if not isil_codes:
            continue
        isil_code = isil_codes[0]

        # Check if Wikidata already present
        if _identifier_values(inst, 'Wikidata'):
            continue

        # Query Wikidata by ISIL code
        queried_count += 1
        if queried_count % 10 == 0:
            print(f"Queried {queried_count} institutions so far...")

        wikidata_result = lookup_fn(isil_code)

        # A result without a Q-number is treated the same as no match.
        q_number = wikidata_result.get('q_number') if wikidata_result else None
        if not q_number:
            continue

        # An ISIL value was found above, so 'identifiers' is a real list here.
        identifiers = inst['identifiers']

        # Enrich with Wikidata Q-number
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': q_number,
            'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
        })
        enriched_count += 1

        # Update name if missing or placeholder. The Wikidata label service
        # echoes the Q-id when no label exists in the requested languages;
        # that is no improvement over a placeholder, so it is ignored.
        label = wikidata_result.get('label')
        current_name = inst.get('name') or ''
        needs_name = not current_name or str(current_name).startswith('Library BG-')
        if needs_name and label and label != q_number:
            inst['name'] = label
            name_improved_count += 1

        # Add VIAF if available, avoiding a duplicate of an existing entry
        viaf = wikidata_result.get('viaf')
        known_viaf = {str(value) for value in _identifier_values(inst, 'VIAF')}
        if viaf and str(viaf) not in known_viaf:
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf,
                'identifier_url': f'https://viaf.org/viaf/{viaf}'
            })

        # Add founding date if available; an existing value is never overwritten
        founding_date = wikidata_result.get('founding_date')
        if founding_date and not inst.get('founded'):
            inst['founded'] = founding_date

        print(f" ✓ Enriched {inst.get('name') or isil_code} with {q_number}")

    print()
    print("✓ Enrichment complete:")
    print(f" Institutions queried: {queried_count}")
    print(f" Institutions enriched with Wikidata: {enriched_count}")
    print(f" Placeholder names improved: {name_improved_count}")
    print()

    if enriched_count == 0:
        print("⚠ No Wikidata matches found.")
        print(" This is expected - most Bulgarian libraries lack ISIL codes in Wikidata.")
        print(" Future improvement: Implement fuzzy name matching.")

    return institutions
|
|
|
|
|
|
# =============================================================================
|
|
# Main Workflow
|
|
# =============================================================================
|
|
|
|
def _load_institutions(path) -> List[Dict[str, Any]]:
    """Read the list of institution records from the YAML file at *path*.

    Anything before the first top-level ``- id:`` record is skipped as header
    text. An empty document yields an empty list.

    Raises:
        ValueError: If the parsed document is not a list of records.
    """
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Skip header comments. A file that starts directly with a record must be
    # parsed from offset 0: searching only for '\n- id:' would land on the
    # SECOND record and silently drop the first one.
    if content.startswith('- id:'):
        yaml_start = 0
    else:
        marker = content.find('\n- id:')
        # No marker at all: let the YAML parser handle the whole document.
        yaml_start = marker + 1 if marker != -1 else 0

    institutions = yaml.safe_load(content[yaml_start:])
    if institutions is None:
        return []
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a list of institutions in {path}, "
            f"got {type(institutions).__name__}"
        )
    return institutions


def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*, or 0.0 when *total* is 0."""
    return count / total * 100 if total else 0.0


def main():
    """Main enrichment workflow.

    Loads the institutions from INPUT_FILE, prints how many have placeholder
    names and Wikidata Q-numbers, runs enrich_with_wikidata over them, and
    writes the result to OUTPUT_FILE with a generated comment header.
    """
    print("=" * 70)
    print("Bulgarian ISIL Registry - Wikidata Enrichment")
    print("=" * 70)
    print()

    # Load institutions
    print(f"Loading institutions from {INPUT_FILE}...")
    institutions = _load_institutions(INPUT_FILE)
    total = len(institutions)

    print(f"Loaded {total} institutions")
    print()

    # Analyze current state (records lacking a name or identifiers count as
    # "no" rather than raising)
    placeholder_names = sum(
        1 for inst in institutions
        if str(inst.get('name') or '').startswith('Library BG-')
    )
    with_wikidata = sum(
        1 for inst in institutions
        if any(
            isinstance(ident, dict) and ident.get('identifier_scheme') == 'Wikidata'
            for ident in (inst.get('identifiers') or [])
        )
    )

    print("Current State:")
    print(f" Institutions with placeholder names: {placeholder_names} ({_percent(placeholder_names, total):.1f}%)")
    print(f" Institutions with Wikidata Q-numbers: {with_wikidata} ({_percent(with_wikidata, total):.1f}%)")
    print()

    # Enrich with Wikidata
    enriched_institutions = enrich_with_wikidata(institutions)

    # Export enriched data
    print()
    print(f"Exporting enriched data to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Enriched with Wikidata metadata\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write(f'# Total institutions: {len(enriched_institutions)}\n')
        f.write('\n')
        yaml.dump(enriched_institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Exported {len(enriched_institutions)} enriched institutions")
    print()
    print("=" * 70)
    print("✓ Enrichment Complete!")
    print("=" * 70)
    print()
    print("NOTE: This is a placeholder implementation.")
    print("To fully implement Wikidata enrichment:")
    print("1. Use wikidata-authenticated MCP tool for SPARQL queries")
    print("2. Query by ISIL code (wdt:P791)")
    print("3. Fuzzy match names for institutions without ISIL in Wikidata")
    print("4. Extract Q-numbers, VIAF IDs, founding dates, parent orgs")
    print("5. Update placeholder names with canonical Wikidata labels")
|
|
|
|
|
|
# Script entry point: run the workflow only when executed directly, so that
# importing this module has no side effects beyond its definitions.
if __name__ == "__main__":
    main()
|