glam/scripts/enrich_denmark_wikidata.py
2025-11-19 23:25:22 +01:00

371 lines
13 KiB
Python
Executable file

"""
Wikidata Enrichment for Danish GLAM Institutions
Queries Wikidata SPARQL endpoint to find Q-numbers for Danish libraries and archives,
then enriches the denmark_complete.json dataset with Wikidata identifiers.
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.parse import quote
import requests
from rapidfuzz import fuzz
# Wikidata SPARQL endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
def query_wikidata_libraries_denmark() -> List[Dict]:
    """Fetch Danish libraries from the Wikidata SPARQL endpoint.

    Queries for every item that is an instance of library (Q7075) or any
    subclass, located in Denmark (Q35), with optional ISIL, VIAF, coordinate
    and city bindings.

    Returns:
        A list of dicts with keys 'qid', 'label', 'isil', 'viaf', 'city'.
        'label' defaults to '' when missing; the other optional fields are
        None when Wikidata has no value.
    """
    query = """
SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
# Libraries in Denmark
?item wdt:P31/wdt:P279* wd:Q7075 . # instance of library (or subclass)
?item wdt:P17 wd:Q35 . # country: Denmark
# Optional identifiers
OPTIONAL { ?item wdt:P791 ?isil } # ISIL code
OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID
OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates
OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity
SERVICE wikibase:label {
bd:serviceParam wikibase:language "da,en"
}
}
ORDER BY ?itemLabel
"""
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }
    print("Querying Wikidata for Danish libraries...")
    resp = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    resp.raise_for_status()
    rows = resp.json()['results']['bindings']
    print(f" Found {len(rows)} libraries in Wikidata")

    def opt(row: Dict, key: str):
        # SPARQL JSON omits unbound variables entirely; map them to None.
        return row.get(key, {}).get('value')

    return [
        {
            # The ?item binding is a full entity URI; the Q-number is its last segment.
            'qid': row['item']['value'].split('/')[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': opt(row, 'isil'),
            'viaf': opt(row, 'viaf'),
            'city': opt(row, 'cityLabel'),
        }
        for row in rows
    ]
def query_wikidata_archives_denmark() -> List[Dict]:
    """Fetch Danish archives from the Wikidata SPARQL endpoint.

    Matches items that are archives (Q166118, or any subclass) in Denmark,
    plus items typed as both library and archive, with optional ISIL, VIAF,
    coordinate and city bindings.

    Returns:
        A list of dicts with keys 'qid', 'label', 'isil', 'viaf', 'city'.
        'label' defaults to '' when missing; the other optional fields are
        None when Wikidata has no value.
    """
    query = """
SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
# Archives in Denmark
{
?item wdt:P31/wdt:P279* wd:Q166118 . # instance of archive (or subclass)
} UNION {
?item wdt:P31 wd:Q7075 . # or library with archival collections
?item wdt:P31 wd:Q166118 .
}
?item wdt:P17 wd:Q35 . # country: Denmark
# Optional identifiers
OPTIONAL { ?item wdt:P791 ?isil } # ISIL code
OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID
OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates
OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity
SERVICE wikibase:label {
bd:serviceParam wikibase:language "da,en"
}
}
ORDER BY ?itemLabel
"""
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }
    print("\nQuerying Wikidata for Danish archives...")
    resp = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    resp.raise_for_status()
    rows = resp.json()['results']['bindings']
    print(f" Found {len(rows)} archives in Wikidata")

    def opt(row: Dict, key: str):
        # SPARQL JSON omits unbound variables entirely; map them to None.
        return row.get(key, {}).get('value')

    return [
        {
            # The ?item binding is a full entity URI; the Q-number is its last segment.
            'qid': row['item']['value'].split('/')[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': opt(row, 'isil'),
            'viaf': opt(row, 'viaf'),
            'city': opt(row, 'cityLabel'),
        }
        for row in rows
    ]
def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier out of its stringified dict representation.

    The dataset stores identifiers as strings like
    ``Identifier({'identifier_scheme': 'ISIL', 'identifier_value': 'DK-...', ...})``;
    extract the scheme/value/url fields with regexes.

    Returns:
        A dict with keys 'scheme', 'value' and 'url' (url may be None), or
        None when the input is falsy, not a string, or lacks scheme/value.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    def _field(name: str) -> Optional[str]:
        # Single-quoted, non-empty field value, e.g. 'identifier_scheme': 'ISIL'
        found = re.search(r"'" + name + r"':\s*'([^']+)'", identifier_str)
        return found.group(1) if found else None

    scheme = _field('identifier_scheme')
    value = _field('identifier_value')
    if scheme is None or value is None:
        return None
    return {
        'scheme': scheme,
        'value': value,
        'url': _field('identifier_url'),
    }
def find_wikidata_match(
    institution: Dict,
    wikidata_institutions: List[Dict],
    threshold: int = 85
) -> Optional[Tuple[Dict, int]]:
    """
    Find best Wikidata match for an institution.

    Matching strategy:
      1. Exact ISIL code match — returns score 100. The value 100 is
         *reserved* for ISIL matches because the caller classifies matches
         by ``score == 100``.
      2. Fuzzy name match (rapidfuzz ratio) with a +10 bonus when the
         cities agree; fuzzy scores are capped at 99 so they can never be
         mistaken for an ISIL match.

    Returns:
        Tuple of (wikidata_item, match_score) if found, else None
    """
    inst_name = institution.get('name', '').lower()
    if not inst_name:
        return None
    # Extract ISIL code from institution if present
    inst_isil = None
    identifiers = institution.get('identifiers', [])
    for identifier_data in identifiers:
        identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'ISIL':
            inst_isil = identifier.get('value')
            break
    # First pass: Try exact ISIL match
    if inst_isil:
        for wd_item in wikidata_institutions:
            if wd_item.get('isil') == inst_isil:
                return (wd_item, 100)  # Perfect match via ISIL
    # Extract the institution's city once — it does not vary per candidate,
    # so hoist it out of the fuzzy-matching loop.
    inst_city = None
    locations = institution.get('locations', [])
    if locations:
        first_loc = locations[0]
        if isinstance(first_loc, str):
            city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
            if city_match:
                inst_city = city_match.group(1).lower()
        elif isinstance(first_loc, dict):
            # BUG FIX: guard against an explicit None city value —
            # dict.get('city', '') returns None (not '') when the key
            # exists with value None, and None.lower() raises.
            inst_city = (first_loc.get('city') or '').lower()
    # Second pass: Fuzzy match by name
    best_match = None
    best_score = 0
    for wd_item in wikidata_institutions:
        wd_label = wd_item.get('label', '').lower()
        if not wd_label:
            continue
        # Calculate fuzzy similarity
        score = fuzz.ratio(inst_name, wd_label)
        # Bonus points for city match
        if inst_city and wd_item.get('city'):
            wd_city = wd_item['city'].lower()
            if inst_city in wd_city or wd_city in inst_city:
                score += 10  # City match bonus
        # BUG FIX: a perfect name ratio, or the city bonus, could push a
        # fuzzy score to >= 100 and make the caller mis-count it as an
        # ISIL match. Cap fuzzy scores at 99.
        score = min(score, 99)
        if score > best_score:
            best_score = score
            best_match = wd_item
    if best_score >= threshold:
        return (best_match, best_score)
    return None
def _has_wikidata_identifier(institution: Dict) -> bool:
    """Return True when the institution already carries a 'Wikidata' identifier."""
    for identifier_data in institution.get('identifiers', []):
        identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'Wikidata':
            return True
    return False


def enrich_with_wikidata(
    institutions: List[Dict],
    wikidata_libraries: List[Dict],
    wikidata_archives: List[Dict]
) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with Wikidata Q-numbers.

    Institutions that already have a Wikidata identifier are left alone.
    LIBRARY institutions are matched against the library candidates,
    ARCHIVE institutions against the archive candidates; other types are
    skipped. Matched institutions get a Wikidata identifier appended (in
    the dataset's stringified format) plus an enrichment_history entry.
    NOTE: institutions are mutated in place; the returned list holds the
    same dict objects.

    Returns:
        Tuple of (enriched_institutions, statistics)
    """
    # Local import: only needed to timestamp the enrichment records.
    from datetime import date

    stats = {
        'total': len(institutions),
        'libraries_checked': 0,
        'archives_checked': 0,
        'matched_by_isil': 0,
        'matched_by_name': 0,
        'no_match': 0,
        'already_had_wikidata': 0
    }
    enriched = []
    for i, inst in enumerate(institutions, 1):
        if i % 100 == 0:
            print(f" Processing {i}/{len(institutions)} institutions...")
        inst_type = inst.get('institution_type')
        if _has_wikidata_identifier(inst):
            stats['already_had_wikidata'] += 1
        else:
            # Try to find Wikidata match against the candidate pool for this type
            if inst_type == 'LIBRARY':
                stats['libraries_checked'] += 1
                match = find_wikidata_match(inst, wikidata_libraries, threshold=85)
            elif inst_type == 'ARCHIVE':
                stats['archives_checked'] += 1
                match = find_wikidata_match(inst, wikidata_archives, threshold=85)
            else:
                match = None
            if match:
                wd_item, score = match
                qid = wd_item['qid']
                # Score 100 is reserved for ISIL matches by find_wikidata_match;
                # anything lower came from fuzzy name matching.
                if score == 100:
                    stats['matched_by_isil'] += 1
                else:
                    stats['matched_by_name'] += 1
                # Add Wikidata identifier (as string representation to match existing format)
                wikidata_identifier = (
                    f"Identifier({{\n"
                    f" 'identifier_scheme': 'Wikidata',\n"
                    f" 'identifier_value': '{qid}',\n"
                    f" 'identifier_url': 'https://www.wikidata.org/wiki/{qid}'\n"
                    f"}})"
                )
                if not inst.get('identifiers'):
                    inst['identifiers'] = []
                inst['identifiers'].append(wikidata_identifier)
                # Add enrichment metadata
                if not inst.get('enrichment_history'):
                    inst['enrichment_history'] = []
                inst['enrichment_history'].append({
                    # BUG FIX: was the hardcoded literal '2025-11-19', which
                    # goes stale on every later run; record the actual run date.
                    'enrichment_date': date.today().isoformat(),
                    'enrichment_method': 'Wikidata SPARQL query',
                    'enrichment_source': 'https://query.wikidata.org/sparql',
                    'match_score': score,
                    'matched_label': wd_item.get('label')
                })
            else:
                stats['no_match'] += 1
        enriched.append(inst)
    return enriched, stats
def main():
    """Entry point: load the Danish GLAM dataset, query Wikidata for library
    and archive candidates, enrich the dataset with Q-numbers, save the
    result, and print match statistics."""
    print("=" * 60)
    print("Danish GLAM Dataset → Wikidata Enrichment")
    print("=" * 60)
    # Load dataset
    input_path = Path('data/instances/denmark_complete.json')
    print(f"\nLoading dataset from {input_path}...")
    # BUG FIX: open with an explicit encoding — the platform default locale
    # may not be UTF-8, and the dataset contains non-ASCII (Danish) text.
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" Loaded {len(institutions)} institutions")
    # Query Wikidata
    try:
        wikidata_libraries = query_wikidata_libraries_denmark()
        time.sleep(2)  # Rate limiting between SPARQL requests
        wikidata_archives = query_wikidata_archives_denmark()
    except Exception as e:
        print(f"❌ Error querying Wikidata: {e}")
        return
    # Enrich dataset
    print("\nEnriching dataset with Wikidata Q-numbers...")
    enriched_institutions, stats = enrich_with_wikidata(
        institutions,
        wikidata_libraries,
        wikidata_archives
    )
    # Save enriched dataset
    output_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nSaving enriched dataset to {output_path}...")
    # BUG FIX: ensure_ascii=False emits raw non-ASCII characters, so the
    # file handle must be UTF-8 or json.dump can raise UnicodeEncodeError.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_institutions, f, indent=2, ensure_ascii=False)
    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✅ Saved ({size_mb:.2f} MB)")
    # Print statistics
    print("\n" + "=" * 60)
    print("Enrichment Statistics")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already had Wikidata: {stats['already_had_wikidata']}")
    print(f"Libraries checked: {stats['libraries_checked']}")
    print(f"Archives checked: {stats['archives_checked']}")
    print(f"Matched by ISIL: {stats['matched_by_isil']}")
    print(f"Matched by name: {stats['matched_by_name']}")
    print(f"No match found: {stats['no_match']}")
    total_new_matches = stats['matched_by_isil'] + stats['matched_by_name']
    total_with_wikidata = stats['already_had_wikidata'] + total_new_matches
    # BUG FIX: guard against ZeroDivisionError on an empty dataset.
    coverage_pct = 100 * total_with_wikidata / stats['total'] if stats['total'] else 0.0
    print(f"\n✅ Total institutions with Wikidata: {total_with_wikidata}/{stats['total']} " +
          f"({coverage_pct:.1f}%)")
    print(f"✅ New Wikidata matches added: {total_new_matches}")
    print("\n" + "=" * 60)
    print("✅ Wikidata Enrichment Complete")
    print("=" * 60)


if __name__ == '__main__':
    main()