#!/usr/bin/env python3
|
|
"""
|
|
Enrich Czech custodian files with Wikidata Q-numbers using fuzzy name matching.
|
|
|
|
Uses Wikidata SPARQL endpoint to find matching institutions by name + location.
|
|
Writes enrichment data directly to individual custodian YAML files.
|
|
|
|
Process:
|
|
1. Query Wikidata for ALL Czech heritage institutions
|
|
2. Load CZ-*.yaml files without wikidata_enrichment
|
|
3. Fuzzy match by name + city location
|
|
4. Add Wikidata identifiers to matched files
|
|
5. Mark with enrichment_version: 2.1_generic
|
|
|
|
Usage:
|
|
python scripts/enrich_czech_wikidata_fuzzy.py [--limit N] [--dry-run] [--threshold N]
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import argparse
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
from datetime import datetime, timezone
|
|
from rapidfuzz import fuzz
|
|
|
|
# Wikidata SPARQL endpoint
|
|
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
|
|
|
# Czech Republic Wikidata ID
|
|
CZECHIA_QID = "Q213"
|
|
|
|
# Languages for Czech institutions (Czech primary, then English, German, Slovak)
|
|
CZECH_LANGUAGES = "cs,en,de,sk"
|
|
|
|
# Default similarity threshold
|
|
DEFAULT_THRESHOLD = 85.0
|
|
|
|
|
|
def query_wikidata_czech_institutions() -> List[Dict]:
    """
    Query Wikidata for ALL Czech heritage institutions.

    Issues one SPARQL query restricted to common heritage-institution
    types (museum, library, archive, gallery, ...) located in Czechia,
    then deduplicates the bindings by Q-number.

    Returns:
        List of dicts with keys: qid, label, type, location, isil.
        'type' is always '' — the query does not select a type label —
        and is kept only so downstream consumers see a stable record
        shape. Returns an empty list on timeout or any request/parse
        failure (errors are printed, not raised).
    """

    # Simplified SPARQL query - minimal optionals to avoid timeout
    # Czech libraries are mostly municipal/public, so focus on those
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?locationLabel ?isil
    WHERE {{
      # Direct instance of heritage institution types
      VALUES ?type {{
        wd:Q33506    # museum
        wd:Q7075     # library
        wd:Q166118   # archive
        wd:Q1007870  # art gallery
        wd:Q28564    # public library
        wd:Q207694   # art museum
      }}

      ?item wdt:P31 ?type .
      ?item wdt:P17 wd:{CZECHIA_QID} .

      # Location is key for matching
      OPTIONAL {{ ?item wdt:P131 ?location }}

      # ISIL is valuable
      OPTIONAL {{ ?item wdt:P791 ?isil }}

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "{CZECH_LANGUAGES}"
      }}
    }}
    LIMIT 10000
    """

    print("Querying Wikidata for Czech heritage institutions...")
    print(f" Endpoint: {WIKIDATA_SPARQL}")
    print(f" Languages: {CZECH_LANGUAGES}")

    # A descriptive User-Agent is required by Wikimedia's API etiquette.
    headers = {
        'User-Agent': 'GLAM-Data-Extraction/0.2.1 (Czech heritage institution research)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=180  # Generous timeout for large query
        )
        response.raise_for_status()
        data = response.json()

        # Parse results
        institutions = []
        seen_qids = set()  # Deduplicate by QID

        for binding in data['results']['bindings']:
            qid = binding['item']['value'].split('/')[-1]

            # Skip duplicates (same institution may have multiple types)
            if qid in seen_qids:
                continue
            seen_qids.add(qid)

            institutions.append({
                'qid': qid,
                'label': binding['itemLabel']['value'],
                'type': '',  # not selected by the query; kept for shape stability
                'location': binding.get('locationLabel', {}).get('value', ''),
                'isil': binding.get('isil', {}).get('value', '')
            })

        print(f" Found {len(institutions)} unique institutions in Wikidata")
        return institutions

    except requests.exceptions.Timeout:
        print("ERROR: Wikidata query timed out. Try again later.")
        return []
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Failed to query Wikidata: {e}")
        return []
    except Exception as e:
        # Deliberate best-effort: any parse/shape surprise degrades to [].
        print(f"ERROR: Unexpected error: {e}")
        return []
|
|
|
|
|
|
def fuzzy_match_institution(
    inst_name: str,
    inst_city: str,
    wikidata_results: List[Dict],
    threshold: float = DEFAULT_THRESHOLD
) -> Optional[Tuple[Dict, float]]:
    """
    Fuzzy match an institution against the Wikidata result set.

    Two-pass strategy:
    1. Candidates whose location agrees with ours outrank all candidates
       without location confirmation (high confidence).
    2. A winner without location confirmation is only accepted at a
       stricter >= 95 name score.

    Args:
        inst_name: Institution name from our dataset
        inst_city: City location
        wikidata_results: List of Wikidata query results
        threshold: Minimum similarity threshold (0-100)

    Returns:
        Tuple of (matched_wikidata_record, confidence_score) or None
    """

    # Major Czech cities that often appear inside institution labels and
    # would otherwise cause false positive matches.
    major_cities = ['praha', 'prague', 'brno', 'ostrava', 'plzeň', 'pilsen',
                    'liberec', 'olomouc', 'české budějovice', 'hradec králové',
                    'ústí nad labem', 'pardubice', 'zlín', 'havířov', 'kladno',
                    'opava', 'karviná', 'teplice', 'děčín', 'jihlava']

    query_name = inst_name.lower().strip()
    query_city = inst_city.lower().strip() if inst_city else ''

    winner = None           # best candidate record seen so far
    winner_score = 0.0      # its combined score
    winner_located = False  # whether its location was confirmed

    for candidate in wikidata_results:
        cand_label = candidate['label'].lower().strip()
        cand_location = candidate.get('location', '').lower()

        # Token-sort ratio tolerates word reordering in names.
        score = fuzz.token_sort_ratio(query_name, cand_label)

        # Location confirmation adds a graded boost.
        located = False
        boost = 0
        if query_city and cand_location:
            if query_city in cand_location:
                # Exact city name match in location
                located, boost = True, 10
            elif query_city in cand_label:
                # City name embedded in the Wikidata label itself
                located, boost = True, 8
            elif fuzz.partial_ratio(query_city, cand_location) > 90:
                # Fuzzy location match
                located, boost = True, 5

        # A DIFFERENT major city in the label suggests a wrong match:
        # apply a large penalty.
        if query_city and not located:
            for city in major_cities:
                if city in cand_label and city != query_city:
                    score = max(0, score - 20)
                    break

        combined = min(score + boost, 100)
        if combined < threshold:
            continue

        # Location-confirmed candidates beat unconfirmed ones; within the
        # same tier, the higher score wins.
        if located and not winner_located:
            accept = True
        elif located == winner_located and combined > winner_score:
            accept = True
        else:
            accept = False

        if accept:
            winner = candidate
            winner_score = combined
            winner_located = located

    # Without location confirmation, require near-perfect name similarity.
    if winner and not winner_located and winner_score < 95:
        return None

    return (winner, winner_score) if winner else None
|
|
|
|
|
|
def load_unenriched_files(custodian_dir: Path, limit: Optional[int] = None) -> List[Tuple[Path, Dict]]:
    """
    Collect CZ-*.yaml files that still lack wikidata_enrichment.

    Args:
        custodian_dir: Path to data/custodian directory
        limit: Optional limit on number of files to load

    Returns:
        List of (file_path, data_dict) tuples
    """

    selected: List[Tuple[Path, Dict]] = []
    candidates = sorted(custodian_dir.glob("CZ-*.yaml"))

    print(f"Scanning {len(candidates)} CZ-*.yaml files...")

    for path in candidates:
        if limit and len(selected) >= limit:
            break

        try:
            with open(path, 'r', encoding='utf-8') as handle:
                record = yaml.safe_load(handle)

            # Already enriched -- nothing to do.
            if record.get('wikidata_enrichment'):
                continue

            # Skip when a top-level Wikidata identifier already exists.
            # (A Wikidata entry inside original_entry.identifiers is fine
            # and may still be updated.)
            if any(
                isinstance(entry, dict) and entry.get('identifier_scheme') == 'Wikidata'
                for entry in record.get('identifiers', [])
            ):
                continue

            selected.append((path, record))

        except Exception as exc:
            # Unreadable/invalid YAML: warn and keep scanning.
            print(f" Warning: Could not load {path.name}: {exc}")

    print(f" Found {len(selected)} files needing Wikidata enrichment")
    return selected
|
|
|
|
|
|
def save_enriched_file(filepath: Path, data: Dict) -> bool:
    """Write enriched data back to its YAML file.

    Returns True on success; on failure prints the error and returns False.
    """
    dump_options = {
        'allow_unicode': True,       # keep Czech diacritics readable
        'sort_keys': False,          # preserve existing key order
        'default_flow_style': False,
        'width': 120,
    }
    try:
        with open(filepath, 'w', encoding='utf-8') as handle:
            yaml.dump(data, handle, **dump_options)
    except Exception as exc:
        print(f" ERROR saving {filepath.name}: {exc}")
        return False
    return True
|
|
|
|
|
|
def enrich_with_wikidata(
    limit: Optional[int] = None,
    dry_run: bool = False,
    threshold: float = DEFAULT_THRESHOLD
) -> None:
    """Main enrichment workflow.

    Queries Wikidata once for all Czech heritage institutions, then fuzzy
    matches every unenriched CZ-*.yaml custodian file against that result
    set and writes match data (wikidata_enrichment block, identifiers,
    provenance note) back to each matched file.

    Args:
        limit: Optional cap on the number of files to process.
        dry_run: When True, only report would-be matches; nothing is written.
        threshold: Minimum similarity (0-100) passed to fuzzy_match_institution().

    Exits the process (sys.exit(1)) when the custodian directory is missing
    or the Wikidata query returns nothing.
    """

    print("=" * 80)
    print("CZECH INSTITUTIONS - WIKIDATA FUZZY MATCHING ENRICHMENT")
    print("=" * 80)
    print()

    # Setup paths
    # NOTE(review): assumes this script lives in <repo>/scripts/ so the data
    # lives at <repo>/data/custodian -- confirm repository layout.
    custodian_dir = Path(__file__).parent.parent / "data" / "custodian"

    if not custodian_dir.exists():
        print(f"ERROR: Custodian directory not found: {custodian_dir}")
        sys.exit(1)

    # Query Wikidata for Czech institutions (single bulk query)
    wikidata_results = query_wikidata_czech_institutions()

    if not wikidata_results:
        print("No Wikidata results found. Exiting.")
        sys.exit(1)

    print()

    # Load unenriched files
    files_to_enrich = load_unenriched_files(custodian_dir, limit)

    if not files_to_enrich:
        print("No files need enrichment. Exiting.")
        return

    print()
    print(f"Fuzzy matching {len(files_to_enrich)} institutions...")
    print(f" Match threshold: {threshold}%")
    print(f" Dry run: {dry_run}")
    print()

    # Statistics
    matched = 0          # files with an accepted match
    high_confidence = 0  # matches scoring >= 95
    low_confidence = 0   # matches scoring below 95
    saved = 0            # files successfully written
    errors = 0           # files that failed to save

    # One shared UTC timestamp for the whole run (enrichment + provenance).
    timestamp = datetime.now(timezone.utc).isoformat()

    for idx, (filepath, data) in enumerate(files_to_enrich, 1):
        # Progress indicator (every 50 files and on the last one)
        if idx % 50 == 0 or idx == len(files_to_enrich):
            print(f" [{idx}/{len(files_to_enrich)}] Matched: {matched}, Saved: {saved}")

        # Extract institution name: prefer the curated claim value, fall
        # back to the original source entry's name.
        inst_name = None
        if data.get('custodian_name', {}).get('claim_value'):
            inst_name = data['custodian_name']['claim_value']
        elif data.get('original_entry', {}).get('name'):
            inst_name = data['original_entry']['name']

        if not inst_name:
            # No name at all -> cannot match this record
            continue

        # Extract city, trying progressively less-curated sources.
        inst_city = ''
        if data.get('location', {}).get('city'):
            inst_city = data['location']['city']
        elif data.get('ghcid', {}).get('location_resolution', {}).get('city_name'):
            inst_city = data['ghcid']['location_resolution']['city_name']
        elif data.get('original_entry', {}).get('locations'):
            locs = data['original_entry']['locations']
            if locs and isinstance(locs, list) and locs[0].get('city'):
                inst_city = locs[0]['city']

        # Fuzzy match against the bulk Wikidata result set
        match_result = fuzzy_match_institution(
            inst_name,
            inst_city,
            wikidata_results,
            threshold=threshold
        )

        if not match_result:
            continue

        matched_wd, confidence = match_result
        matched += 1

        if confidence >= 95:
            high_confidence += 1
        else:
            low_confidence += 1

        if dry_run:
            print(f" [DRY RUN] Would match: {inst_name}")
            print(f" -> {matched_wd['qid']} ({matched_wd['label']}) [{confidence:.1f}%]")
            continue

        # Add Wikidata enrichment block (marks the file as processed, so
        # load_unenriched_files will skip it on later runs)
        data['wikidata_enrichment'] = {
            'wikidata_id': matched_wd['qid'],
            'wikidata_label': matched_wd['label'],
            'wikidata_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}",
            'enrichment_date': timestamp,
            'enrichment_version': '2.1_generic',
            'enrichment_method': 'wikidata_fuzzy_match',
            'match_confidence': round(confidence, 1),
            'match_location': matched_wd.get('location', ''),
        }

        # Add ISIL if available from Wikidata
        if matched_wd.get('isil'):
            # Check if already has this ISIL (avoid duplicate identifiers)
            has_isil = any(
                i.get('identifier_scheme') == 'ISIL' and i.get('identifier_value') == matched_wd['isil']
                for i in data.get('identifiers', [])
                if isinstance(i, dict)
            )
            if not has_isil:
                if 'identifiers' not in data:
                    data['identifiers'] = []
                data['identifiers'].append({
                    'identifier_scheme': 'ISIL',
                    'identifier_value': matched_wd['isil'],
                    'identifier_source': 'wikidata'
                })

        # Add Wikidata identifier to top-level identifiers
        has_wd_id = any(
            i.get('identifier_scheme') == 'Wikidata'
            for i in data.get('identifiers', [])
            if isinstance(i, dict)
        )
        if not has_wd_id:
            if 'identifiers' not in data:
                data['identifiers'] = []
            data['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': matched_wd['qid'],
                'identifier_url': f"https://www.wikidata.org/wiki/{matched_wd['qid']}",
                'identifier_source': 'wikidata_fuzzy_match'
            })

        # Update provenance notes (append-only audit trail)
        if 'provenance' not in data:
            data['provenance'] = {}
        if 'notes' not in data['provenance']:
            data['provenance']['notes'] = []

        data['provenance']['notes'].append(
            f"Wikidata fuzzy match enrichment {timestamp}: "
            f"Matched to {matched_wd['qid']} ({matched_wd['label']}) "
            f"with {confidence:.1f}% confidence"
        )

        # Save file (errors are counted, not fatal)
        if save_enriched_file(filepath, data):
            saved += 1
        else:
            errors += 1

    # Final summary
    print()
    print("=" * 80)
    print("ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f" Files scanned: {len(files_to_enrich)}")
    print(f" Matched: {matched} ({matched/len(files_to_enrich)*100:.1f}%)")
    print(f" High confidence (>=95%): {high_confidence}")
    print(f" Low confidence (<95%): {low_confidence}")
    if not dry_run:
        print(f" Saved: {saved}")
        print(f" Errors: {errors}")
    else:
        print(f" [DRY RUN - no files modified]")
    print()
|
|
|
|
|
|
def main():
    """Parse command-line options and launch the enrichment workflow."""
    parser = argparse.ArgumentParser(
        description="Enrich Czech custodian files with Wikidata via fuzzy matching"
    )
    parser.add_argument('--limit', '-l', type=int, default=None,
                        help='Limit number of files to process (for testing)')
    parser.add_argument('--dry-run', '-n', action='store_true',
                        help='Show what would be matched without saving')
    parser.add_argument('--threshold', '-t', type=float, default=DEFAULT_THRESHOLD,
                        help=f'Minimum similarity threshold (default: {DEFAULT_THRESHOLD})')

    opts = parser.parse_args()

    enrich_with_wikidata(
        limit=opts.limit,
        dry_run=opts.dry_run,
        threshold=opts.threshold
    )


if __name__ == '__main__':
    main()
|