#!/usr/bin/env python3
"""
Enrich Czech institutions with Wikidata Q-numbers.

Uses Wikidata SPARQL endpoint to find matching institutions by name,
location, and type. Adds Wikidata identifiers to czech_unified.yaml.

Process:
1. Load czech_unified.yaml (8,694 institutions)
2. Filter institutions WITHOUT Wikidata Q-numbers (estimate: ~95%)
3. Query Wikidata for Czech heritage institutions
4. Fuzzy match by name + location + type
5. Add Wikidata identifiers to records
6. Save to czech_unified_wikidata.yaml

Estimated time: 5-10 minutes (SPARQL queries + fuzzy matching)
"""
# Standard library
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple

# Third-party
import requests
import yaml
from rapidfuzz import fuzz
# Public endpoint of the Wikidata Query Service (SPARQL).
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"

# Maps our GLAM institution categories to the Wikidata class Q-numbers
# used by the SPARQL query below.
WIKIDATA_TYPES = {
    'MUSEUM': ['Q33506'],     # museum
    'LIBRARY': ['Q7075'],     # library
    'ARCHIVE': ['Q166118'],   # archive
    'GALLERY': ['Q1007870'],  # art gallery
}
def query_wikidata_institutions(country_code: str = 'Q213') -> List[Dict]:
    """
    Query Wikidata for heritage institutions in a given country.

    Args:
        country_code: Wikidata Q-number for country (Q213 = Czech Republic)

    Returns:
        List of dicts with keys: qid, label, type, location, coordinates,
        isil, viaf. Returns an empty list on any request/parse failure
        (best-effort: the caller treats an empty list as "nothing found").
    """
    # SPARQL query for heritage institutions (museum, library, archive,
    # gallery — including subclasses) located in the requested country.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?typeLabel ?locationLabel ?coords ?isil ?viaf
    WHERE {{
      # Institution types (museum, library, archive, gallery)
      VALUES ?type {{ wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1007870 }}

      # Instance of heritage institution type
      ?item wdt:P31/wdt:P279* ?type .

      # Located in the requested country (or subdivisions)
      ?item wdt:P17 wd:{country_code} .

      # Optional: specific location (city/town)
      OPTIONAL {{ ?item wdt:P131 ?location }}

      # Optional: coordinates
      OPTIONAL {{ ?item wdt:P625 ?coords }}

      # Optional: ISIL code
      OPTIONAL {{ ?item wdt:P791 ?isil }}

      # Optional: VIAF ID
      OPTIONAL {{ ?item wdt:P214 ?viaf }}

      # Get labels in Czech and English
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "cs,en"
      }}
    }}
    LIMIT 10000
    """

    print("Querying Wikidata for Czech heritage institutions...")
    print(f"SPARQL endpoint: {WIKIDATA_SPARQL}")

    headers = {
        # Wikidata requires a descriptive User-Agent for API clients.
        'User-Agent': 'GLAM-Data-Extraction/0.2.0 (heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=60
        )
        response.raise_for_status()
        data = response.json()
        institutions = _parse_sparql_bindings(data)
    # Narrowed from a bare `except Exception`: catch network/HTTP errors,
    # malformed JSON (ValueError), and unexpected result shape (KeyError).
    except (requests.RequestException, ValueError, KeyError) as e:
        print(f"Error querying Wikidata: {e}")
        return []

    print(f"Found {len(institutions)} institutions in Wikidata")
    return institutions


def _parse_sparql_bindings(data: Dict) -> List[Dict]:
    """Flatten SPARQL JSON result bindings into plain institution dicts."""
    institutions = []
    for binding in data['results']['bindings']:
        institutions.append({
            # Entity URI looks like http://www.wikidata.org/entity/Q123 —
            # keep only the trailing Q-number.
            'qid': binding['item']['value'].split('/')[-1],
            'label': binding['itemLabel']['value'],
            'type': binding['typeLabel']['value'],
            # OPTIONAL clauses may be absent from a binding; default to ''.
            'location': binding.get('locationLabel', {}).get('value', ''),
            'coordinates': binding.get('coords', {}).get('value', ''),
            'isil': binding.get('isil', {}).get('value', ''),
            'viaf': binding.get('viaf', {}).get('value', ''),
        })
    return institutions
def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    inst_type: str,
    wikidata_results: List[Dict],
    threshold: float = 85.0
) -> Optional[Tuple[Dict, float]]:
    """
    Find the best fuzzy match for an institution among Wikidata results.

    Scoring is RapidFuzz name similarity (0-100) plus a flat +10 boost
    when the city strongly matches the Wikidata location. `inst_type` is
    accepted for API symmetry but intentionally not scored — Wikidata
    typing is too inconsistent to penalize mismatches.

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, GALLERY)
        wikidata_results: Candidate records from the SPARQL query
        threshold: Minimum combined score (0-100) required to match

    Returns:
        (matched_wikidata_record, confidence_score) for the highest-scoring
        candidate at or above `threshold`, or None when nothing qualifies.
    """
    name_lower = inst_name.lower()
    city_lower = inst_city.lower() if inst_city else ''

    winner = None
    winner_score = 0.0

    for candidate in wikidata_results:
        score = fuzz.ratio(name_lower, candidate['label'].lower())

        # Reward strong city/location agreement with a flat +10 boost.
        if city_lower and candidate['location']:
            if fuzz.partial_ratio(city_lower, candidate['location'].lower()) > 85:
                score += 10

        # Keep the highest-scoring candidate that clears the threshold.
        if score > winner_score and score >= threshold:
            winner_score = score
            winner = candidate

    return (winner, winner_score) if winner else None
def _has_identifier(inst: Dict, scheme: str) -> bool:
    """Return True when `inst` already carries an identifier of `scheme`."""
    return any(
        i.get('identifier_scheme') == scheme
        for i in inst.get('identifiers', [])
    )


def enrich_with_wikidata():
    """
    Main enrichment workflow.

    Loads data/instances/czech_unified.yaml, fuzzy-matches records lacking
    a Wikidata Q-number against a SPARQL result set, attaches
    Wikidata/ISIL/VIAF identifiers plus provenance, and writes the result
    to data/instances/czech_unified_wikidata.yaml.
    """
    print("="*80)
    print("CZECH INSTITUTIONS - WIKIDATA ENRICHMENT")
    print("="*80)
    print()

    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Split into records that already have a Wikidata Q-number and those
    # that still need one.
    needs_wikidata = []
    has_wikidata = 0

    for inst in institutions:
        if _has_identifier(inst, 'Wikidata'):
            has_wikidata += 1
        else:
            needs_wikidata.append(inst)

    print(f"Institutions with Wikidata: {has_wikidata}")
    print(f"Institutions needing Wikidata: {len(needs_wikidata)}")
    print()

    # Query Wikidata
    wikidata_results = query_wikidata_institutions()

    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        return

    print()
    print(f"Fuzzy matching {len(needs_wikidata)} institutions...")
    print(f"Match threshold: 85% similarity")
    print()

    # Fuzzy match
    matched = 0
    low_confidence = 0

    for idx, inst in enumerate(needs_wikidata, 1):
        if idx % 100 == 0:
            print(f"  Processed {idx}/{len(needs_wikidata)} institutions...")

        # Extract city from the first listed location, if any.
        city = ''
        if inst.get('locations'):
            city = inst['locations'][0].get('city', '')

        match_result = fuzzy_match_institution(
            inst['name'],
            city,
            inst['institution_type'],
            wikidata_results,
            threshold=85.0
        )

        if not match_result:
            continue

        matched_wd, confidence = match_result

        identifiers = inst.setdefault('identifiers', [])

        # Add Wikidata identifier
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': matched_wd['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}"
        })

        # Add ISIL if available and not already present.
        # ISIL codes have no universal resolver, so no identifier_url.
        if matched_wd.get('isil') and not _has_identifier(inst, 'ISIL'):
            identifiers.append({
                'identifier_scheme': 'ISIL',
                'identifier_value': matched_wd['isil'],
            })

        # Add VIAF if available and not already present.
        if matched_wd.get('viaf') and not _has_identifier(inst, 'VIAF'):
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': matched_wd['viaf'],
                'identifier_url': f"https://viaf.org/viaf/{matched_wd['viaf']}"
            })

        # Update provenance. setdefault guards against records that lack a
        # 'provenance' block entirely (previously raised KeyError).
        history = inst.setdefault('provenance', {}).setdefault(
            'enrichment_history', []
        )
        history.append({
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata SPARQL query + fuzzy matching',
            'match_score': confidence,
            # Very strong matches are auto-verified.
            'verified': confidence > 95
        })

        matched += 1

        if confidence < 90:
            low_confidence += 1

    # Guard against division by zero when every record already had a QID.
    match_rate = matched / len(needs_wikidata) * 100 if needs_wikidata else 0.0
    print(f"\n✅ Matched {matched} institutions ({match_rate:.1f}%)")
    print(f"⚠️ Low confidence matches (<90%): {low_confidence}")
    print(f"❌ No match: {len(needs_wikidata) - matched}")
    print()

    # Save enriched dataset
    output_path = 'data/instances/czech_unified_wikidata.yaml'
    print(f"Saving enriched dataset to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100
        )

    print(f"✅ Saved {len(institutions)} institutions")
    print()
    print("="*80)
    print("ENRICHMENT COMPLETE")
    print("="*80)
    print(f"Total institutions: {len(institutions)}")
    print(f"With Wikidata Q-numbers: {has_wikidata + matched}")
    print(f"Newly enriched: {matched}")
    # Avoid ZeroDivisionError on an empty dataset.
    if institutions:
        print(f"Enrichment rate: {(has_wikidata + matched)/len(institutions)*100:.1f}%")
if __name__ == '__main__':
    # Run the full enrichment pipeline when executed as a script.
    enrich_with_wikidata()