glam/scripts/enrich_czech_wikidata.py
2025-11-21 22:12:33 +01:00

329 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Enrich Czech institutions with Wikidata Q-numbers.
Uses Wikidata SPARQL endpoint to find matching institutions by name,
location, and type. Adds Wikidata identifiers to czech_unified.yaml.
Process:
1. Load czech_unified.yaml (8,694 institutions)
2. Filter institutions WITHOUT Wikidata Q-numbers (estimate: ~95%)
3. Query Wikidata for Czech heritage institutions
4. Fuzzy match by name + location + type
5. Add Wikidata identifiers to records
6. Save to czech_unified_wikidata.yaml
Estimated time: 5-10 minutes (SPARQL queries + fuzzy matching)
"""
import yaml
import requests
from typing import List, Dict, Optional, Tuple
from rapidfuzz import fuzz
from datetime import datetime, timezone
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
# Wikidata institution type mapping (GLAM → Wikidata Q-numbers).
# NOTE(review): not referenced elsewhere in this file as visible here — the
# SPARQL query hard-codes the same Q-numbers in its VALUES clause; kept as
# documentation of the type filter. Confirm before removing.
WIKIDATA_TYPES = {
'MUSEUM': ['Q33506'], # museum
'LIBRARY': ['Q7075'], # library
'ARCHIVE': ['Q166118'], # archive
'GALLERY': ['Q1007870'], # art gallery
}
def query_wikidata_institutions(country_code: str = 'Q213') -> List[Dict]:
"""
Query Wikidata for Czech Republic heritage institutions.
Args:
country_code: Wikidata Q-number for country (Q213 = Czech Republic)
Returns:
List of dicts with: qid, label, type, location, coordinates
"""
# SPARQL query for Czech heritage institutions
query = f"""
SELECT DISTINCT ?item ?itemLabel ?typeLabel ?locationLabel ?coords ?isil ?viaf
WHERE {{
# Institution types (museum, library, archive, gallery)
VALUES ?type {{ wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 }}
# Instance of heritage institution type
?item wdt:P31/wdt:P279* ?type .
# Located in Czech Republic (or subdivisions)
?item wdt:P17 wd:{country_code} .
# Optional: specific location (city/town)
OPTIONAL {{ ?item wdt:P131 ?location }}
# Optional: coordinates
OPTIONAL {{ ?item wdt:P625 ?coords }}
# Optional: ISIL code
OPTIONAL {{ ?item wdt:P791 ?isil }}
# Optional: VIAF ID
OPTIONAL {{ ?item wdt:P214 ?viaf }}
# Get labels in Czech and English
SERVICE wikibase:label {{
bd:serviceParam wikibase:language "cs,en"
}}
}}
LIMIT 10000
"""
print("Querying Wikidata for Czech heritage institutions...")
print(f"SPARQL endpoint: {WIKIDATA_SPARQL}")
headers = {
'User-Agent': 'GLAM-Data-Extraction/0.2.0 (heritage institution research)',
'Accept': 'application/sparql-results+json'
}
try:
response = requests.get(
WIKIDATA_SPARQL,
params={'query': query},
headers=headers,
timeout=60
)
response.raise_for_status()
data = response.json()
# Parse results
institutions = []
for binding in data['results']['bindings']:
qid = binding['item']['value'].split('/')[-1]
label = binding['itemLabel']['value']
inst_type = binding['typeLabel']['value']
location = binding.get('locationLabel', {}).get('value', '')
coords = binding.get('coords', {}).get('value', '')
isil = binding.get('isil', {}).get('value', '')
viaf = binding.get('viaf', {}).get('value', '')
institutions.append({
'qid': qid,
'label': label,
'type': inst_type,
'location': location,
'coordinates': coords,
'isil': isil,
'viaf': viaf
})
print(f"Found {len(institutions)} institutions in Wikidata")
return institutions
except Exception as e:
print(f"Error querying Wikidata: {e}")
return []
def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    inst_type: str,
    wikidata_results: List[Dict],
    threshold: float = 85.0
) -> Optional[Tuple[Dict, float]]:
    """
    Find the best fuzzy match for an institution among Wikidata results.

    Args:
        inst_name: Institution name from our dataset.
        inst_city: City location (may be empty).
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, GALLERY) —
            currently informational only; Wikidata typing is too inconsistent
            to penalize mismatches.
        wikidata_results: Candidate records from the Wikidata SPARQL query.
        threshold: Minimum combined score (0-100 name score plus a possible
            +10 city bonus) required to accept a match.

    Returns:
        (matched_wikidata_record, confidence_score) tuple, or None when no
        candidate reaches the threshold.
    """
    name_lower = inst_name.lower()
    top_candidate: Optional[Dict] = None
    top_score = 0.0
    for candidate in wikidata_results:
        score = float(fuzz.ratio(name_lower, candidate['label'].lower()))
        # A matching city earns a flat +10 bonus on top of the name score.
        if inst_city and candidate['location']:
            city_similarity = fuzz.partial_ratio(
                inst_city.lower(), candidate['location'].lower()
            )
            if city_similarity > 85:
                score += 10
        if score >= threshold and score > top_score:
            top_score = score
            top_candidate = candidate
    if top_candidate is None:
        return None
    return (top_candidate, top_score)
def enrich_with_wikidata():
"""Main enrichment workflow."""
print("="*80)
print("CZECH INSTITUTIONS - WIKIDATA ENRICHMENT")
print("="*80)
print()
# Load unified dataset
print("Loading czech_unified.yaml...")
with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f)
print(f"Loaded {len(institutions)} institutions")
# Filter institutions without Wikidata Q-numbers
needs_wikidata = []
has_wikidata = 0
for inst in institutions:
has_qid = False
for identifier in inst.get('identifiers', []):
if identifier.get('identifier_scheme') == 'Wikidata':
has_qid = True
has_wikidata += 1
break
if not has_qid:
needs_wikidata.append(inst)
print(f"Institutions with Wikidata: {has_wikidata}")
print(f"Institutions needing Wikidata: {len(needs_wikidata)}")
print()
# Query Wikidata
wikidata_results = query_wikidata_institutions()
if not wikidata_results:
print("No Wikidata results found. Exiting.")
return
print()
print(f"Fuzzy matching {len(needs_wikidata)} institutions...")
print(f"Match threshold: 85% similarity")
print()
# Fuzzy match
matched = 0
low_confidence = 0
for idx, inst in enumerate(needs_wikidata, 1):
if idx % 100 == 0:
print(f" Processed {idx}/{len(needs_wikidata)} institutions...")
# Extract city from locations
city = ''
if inst.get('locations'):
city = inst['locations'][0].get('city', '')
# Fuzzy match
match_result = fuzzy_match_institution(
inst['name'],
city,
inst['institution_type'],
wikidata_results,
threshold=85.0
)
if match_result:
matched_wd, confidence = match_result
# Add Wikidata identifier
if 'identifiers' not in inst:
inst['identifiers'] = []
inst['identifiers'].append({
'identifier_scheme': 'Wikidata',
'identifier_value': matched_wd['qid'],
'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}"
})
# Add ISIL if available and not already present
if matched_wd.get('isil'):
has_isil = any(
i.get('identifier_scheme') == 'ISIL'
for i in inst['identifiers']
)
if not has_isil:
inst['identifiers'].append({
'identifier_scheme': 'ISIL',
'identifier_value': matched_wd['isil'],
'identifier_url': f"https://isil.org/{matched_wd['isil']}"
})
# Add VIAF if available and not already present
if matched_wd.get('viaf'):
has_viaf = any(
i.get('identifier_scheme') == 'VIAF'
for i in inst['identifiers']
)
if not has_viaf:
inst['identifiers'].append({
'identifier_scheme': 'VIAF',
'identifier_value': matched_wd['viaf'],
'identifier_url': f"https://viaf.org/viaf/{matched_wd['viaf']}"
})
# Update provenance
if 'enrichment_history' not in inst['provenance']:
inst['provenance']['enrichment_history'] = []
inst['provenance']['enrichment_history'].append({
'enrichment_date': datetime.now(timezone.utc).isoformat(),
'enrichment_method': 'Wikidata SPARQL query + fuzzy matching',
'match_score': confidence,
'verified': True if confidence > 95 else False
})
matched += 1
if confidence < 90:
low_confidence += 1
print(f"\n✅ Matched {matched} institutions ({matched/len(needs_wikidata)*100:.1f}%)")
print(f"⚠️ Low confidence matches (<90%): {low_confidence}")
print(f"❌ No match: {len(needs_wikidata) - matched}")
print()
# Save enriched dataset
output_path = 'data/instances/czech_unified_wikidata.yaml'
print(f"Saving enriched dataset to {output_path}...")
with open(output_path, 'w', encoding='utf-8') as f:
yaml.dump(
institutions,
f,
allow_unicode=True,
sort_keys=False,
default_flow_style=False,
width=100
)
print(f"✅ Saved {len(institutions)} institutions")
print()
print("="*80)
print("ENRICHMENT COMPLETE")
print("="*80)
print(f"Total institutions: {len(institutions)}")
print(f"With Wikidata Q-numbers: {has_wikidata + matched}")
print(f"Newly enriched: {matched}")
print(f"Enrichment rate: {(has_wikidata + matched)/len(institutions)*100:.1f}%")
# Run the enrichment only when executed as a script, not on import.
if __name__ == '__main__':
    enrich_with_wikidata()