371 lines
13 KiB
Python
Executable file
371 lines
13 KiB
Python
Executable file
"""
|
|
Wikidata Enrichment for Danish GLAM Institutions
|
|
|
|
Queries Wikidata SPARQL endpoint to find Q-numbers for Danish libraries and archives,
|
|
then enriches the denmark_complete.json dataset with Wikidata identifiers.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
from urllib.parse import quote
|
|
import requests
|
|
from rapidfuzz import fuzz
|
|
|
|
# Wikidata SPARQL endpoint
|
|
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
|
|
|
def query_wikidata_libraries_denmark() -> List[Dict]:
    """Fetch Danish libraries from the Wikidata SPARQL endpoint.

    Returns a list of dicts with keys 'qid', 'label', 'isil', 'viaf'
    and 'city' (the optional values are None when Wikidata has no
    corresponding statement).
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
      # Libraries in Denmark
      ?item wdt:P31/wdt:P279* wd:Q7075 .  # instance of library (or subclass)
      ?item wdt:P17 wd:Q35 .  # country: Denmark

      # Optional identifiers
      OPTIONAL { ?item wdt:P791 ?isil }  # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf }  # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coordinates }  # Coordinates
      OPTIONAL { ?item wdt:P131 ?city }  # Located in administrative entity

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "da,en"
      }
    }
    ORDER BY ?itemLabel
    """

    # WDQS requires a descriptive User-Agent; we also ask for JSON results.
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }

    print("Querying Wikidata for Danish libraries...")
    response = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    response.raise_for_status()

    rows = response.json()['results']['bindings']
    print(f" Found {len(rows)} libraries in Wikidata")

    # Each SPARQL binding maps variable name -> {'value': ..., ...};
    # optional variables are simply absent from the row.
    return [
        {
            # Entity URI ends in the Q-number, e.g. .../entity/Q123 -> Q123
            'qid': row['item']['value'].rsplit('/', 1)[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': row.get('isil', {}).get('value'),
            'viaf': row.get('viaf', {}).get('value'),
            'city': row.get('cityLabel', {}).get('value'),
        }
        for row in rows
    ]
|
|
|
|
def query_wikidata_archives_denmark() -> List[Dict]:
    """Fetch Danish archives from the Wikidata SPARQL endpoint.

    Returns a list of dicts with keys 'qid', 'label', 'isil', 'viaf'
    and 'city' (the optional values are None when Wikidata has no
    corresponding statement).
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE {
      # Archives in Denmark
      {
        ?item wdt:P31/wdt:P279* wd:Q166118 .  # instance of archive (or subclass)
      } UNION {
        ?item wdt:P31 wd:Q7075 .  # or library with archival collections
        ?item wdt:P31 wd:Q166118 .
      }
      ?item wdt:P17 wd:Q35 .  # country: Denmark

      # Optional identifiers
      OPTIONAL { ?item wdt:P791 ?isil }  # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf }  # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coordinates }  # Coordinates
      OPTIONAL { ?item wdt:P131 ?city }  # Located in administrative entity

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "da,en"
      }
    }
    ORDER BY ?itemLabel
    """

    # WDQS requires a descriptive User-Agent; we also ask for JSON results.
    headers = {
        'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)',
        'Accept': 'application/sparql-results+json'
    }

    print("\nQuerying Wikidata for Danish archives...")
    response = requests.get(
        WIKIDATA_SPARQL,
        params={'query': query, 'format': 'json'},
        headers=headers,
        timeout=60
    )
    response.raise_for_status()

    rows = response.json()['results']['bindings']
    print(f" Found {len(rows)} archives in Wikidata")

    # Each SPARQL binding maps variable name -> {'value': ..., ...};
    # optional variables are simply absent from the row.
    return [
        {
            # Entity URI ends in the Q-number, e.g. .../entity/Q123 -> Q123
            'qid': row['item']['value'].rsplit('/', 1)[-1],
            'label': row.get('itemLabel', {}).get('value', ''),
            'isil': row.get('isil', {}).get('value'),
            'viaf': row.get('viaf', {}).get('value'),
            'city': row.get('cityLabel', {}).get('value'),
        }
        for row in rows
    ]
|
|
|
|
def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier from its string representation.

    The dataset stores identifiers as stringified dicts such as
    "Identifier({'identifier_scheme': 'ISIL', 'identifier_value': ...})".

    Returns:
        Dict with 'scheme', 'value' and 'url' ('url' may be None) when
        both scheme and value can be extracted, otherwise None.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    patterns = (
        ('scheme', r"'identifier_scheme':\s*'([^']+)'"),
        ('value', r"'identifier_value':\s*'([^']+)'"),
        ('url', r"'identifier_url':\s*'([^']+)'"),
    )
    extracted = {}
    for key, pattern in patterns:
        found = re.search(pattern, identifier_str)
        extracted[key] = found.group(1) if found else None

    # Scheme and value are mandatory; url stays None when absent.
    if extracted['scheme'] and extracted['value']:
        return extracted
    return None
|
|
|
|
def find_wikidata_match(
    institution: Dict,
    wikidata_institutions: List[Dict],
    threshold: int = 85
) -> Optional[Tuple[Dict, int]]:
    """
    Find best Wikidata match for an institution.

    Matching strategy: an exact ISIL-code match wins outright (score 100);
    otherwise the candidate labels are fuzzy-matched against the institution
    name, with a +10 bonus when the cities overlap.

    Returns:
        Tuple of (wikidata_item, match_score) if best score >= threshold,
        else None
    """
    inst_name = institution.get('name', '').lower()
    if not inst_name:
        return None

    # Extract ISIL code from institution if present
    inst_isil = None
    for identifier_data in institution.get('identifiers', []):
        identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'ISIL':
            inst_isil = identifier.get('value')
            break

    # First pass: Try exact ISIL match
    if inst_isil:
        for wd_item in wikidata_institutions:
            if wd_item.get('isil') == inst_isil:
                return (wd_item, 100)  # Perfect match via ISIL

    # Extract the institution's city once — it is loop-invariant, so there is
    # no reason to re-parse it for every Wikidata candidate.
    inst_city = None
    locations = institution.get('locations', [])
    if locations:
        first_loc = locations[0]
        if isinstance(first_loc, str):
            city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
            if city_match:
                inst_city = city_match.group(1).lower()
        elif isinstance(first_loc, dict):
            # `or ''` guards against an explicit 'city': None, which would
            # otherwise crash on .lower()
            inst_city = (first_loc.get('city') or '').lower()

    # Second pass: Fuzzy match by name
    best_match = None
    best_score = 0

    for wd_item in wikidata_institutions:
        wd_label = wd_item.get('label', '').lower()
        if not wd_label:
            continue

        # Calculate fuzzy similarity
        score = fuzz.ratio(inst_name, wd_label)

        # Bonus points for city match (substring match either way covers
        # e.g. "Aarhus" vs "Aarhus Municipality")
        if inst_city and wd_item.get('city'):
            wd_city = wd_item['city'].lower()
            if inst_city in wd_city or wd_city in inst_city:
                score += 10  # City match bonus

        if score > best_score:
            best_score = score
            best_match = wd_item

    if best_score >= threshold:
        return (best_match, best_score)

    return None
|
|
|
|
def enrich_with_wikidata(
    institutions: List[Dict],
    wikidata_libraries: List[Dict],
    wikidata_archives: List[Dict]
) -> Tuple[List[Dict], Dict]:
    """
    Enrich institutions with Wikidata Q-numbers.

    LIBRARY records are matched against `wikidata_libraries` and ARCHIVE
    records against `wikidata_archives`; records that already carry a
    Wikidata identifier (or have another institution type) are passed
    through unchanged apart from the statistics bookkeeping.

    Returns:
        Tuple of (enriched_institutions, statistics)
    """
    stats = {
        'total': len(institutions),
        'libraries_checked': 0,
        'archives_checked': 0,
        'matched_by_isil': 0,
        'matched_by_name': 0,
        'no_match': 0,
        'already_had_wikidata': 0
    }

    enriched = []

    for position, record in enumerate(institutions, 1):
        if position % 100 == 0:
            print(f" Processing {position}/{len(institutions)} institutions...")

        kind = record.get('institution_type')

        # Records that already carry a Wikidata identifier are left as-is.
        already_tagged = False
        for raw_identifier in record.get('identifiers', []):
            parsed = (
                parse_identifier_string(raw_identifier)
                if isinstance(raw_identifier, str)
                else raw_identifier
            )
            if parsed and isinstance(parsed, dict) and parsed.get('scheme') == 'Wikidata':
                already_tagged = True
                stats['already_had_wikidata'] += 1
                break

        if not already_tagged:
            # Pick the candidate pool by institution type.
            if kind == 'LIBRARY':
                stats['libraries_checked'] += 1
                match = find_wikidata_match(record, wikidata_libraries, threshold=85)
            elif kind == 'ARCHIVE':
                stats['archives_checked'] += 1
                match = find_wikidata_match(record, wikidata_archives, threshold=85)
            else:
                match = None

            if match:
                wd_item, score = match
                qid = wd_item['qid']

                # Score 100 is reserved for exact ISIL hits.
                bucket = 'matched_by_isil' if score == 100 else 'matched_by_name'
                stats[bucket] += 1

                # Add Wikidata identifier (as string representation to match
                # the existing format in the dataset)
                wikidata_identifier = (
                    f"Identifier({{\n"
                    f" 'identifier_scheme': 'Wikidata',\n"
                    f" 'identifier_value': '{qid}',\n"
                    f" 'identifier_url': 'https://www.wikidata.org/wiki/{qid}'\n"
                    f"}})"
                )

                # `or []` replaces a missing/empty/None identifier list and is
                # a no-op when a non-empty list already exists.
                record['identifiers'] = record.get('identifiers') or []
                record['identifiers'].append(wikidata_identifier)

                # Record provenance of the enrichment.
                record['enrichment_history'] = record.get('enrichment_history') or []
                record['enrichment_history'].append({
                    'enrichment_date': '2025-11-19',
                    'enrichment_method': 'Wikidata SPARQL query',
                    'enrichment_source': 'https://query.wikidata.org/sparql',
                    'match_score': score,
                    'matched_label': wd_item.get('label')
                })
            else:
                stats['no_match'] += 1

        enriched.append(record)

    return enriched, stats
|
|
|
|
def main():
    """Run the full pipeline: load dataset, query Wikidata, enrich, save, report."""
    print("=" * 60)
    print("Danish GLAM Dataset → Wikidata Enrichment")
    print("=" * 60)

    # Load dataset. Explicit UTF-8: the dataset contains Danish characters
    # and the platform's default locale encoding may mis-decode them.
    input_path = Path('data/instances/denmark_complete.json')
    print(f"\nLoading dataset from {input_path}...")
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" Loaded {len(institutions)} institutions")

    # Query Wikidata
    try:
        wikidata_libraries = query_wikidata_libraries_denmark()
        time.sleep(2)  # Rate limiting between WDQS requests
        wikidata_archives = query_wikidata_archives_denmark()
    except Exception as e:
        print(f"❌ Error querying Wikidata: {e}")
        return

    # Enrich dataset
    print("\nEnriching dataset with Wikidata Q-numbers...")
    enriched_institutions, stats = enrich_with_wikidata(
        institutions,
        wikidata_libraries,
        wikidata_archives
    )

    # Save enriched dataset. encoding='utf-8' is required here: with
    # ensure_ascii=False the dump contains raw Danish characters, which
    # would raise UnicodeEncodeError under a non-UTF-8 default encoding.
    output_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nSaving enriched dataset to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(enriched_institutions, f, indent=2, ensure_ascii=False)

    size_mb = output_path.stat().st_size / (1024 * 1024)
    print(f" ✅ Saved ({size_mb:.2f} MB)")

    # Print statistics
    print("\n" + "=" * 60)
    print("Enrichment Statistics")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already had Wikidata: {stats['already_had_wikidata']}")
    print(f"Libraries checked: {stats['libraries_checked']}")
    print(f"Archives checked: {stats['archives_checked']}")
    print(f"Matched by ISIL: {stats['matched_by_isil']}")
    print(f"Matched by name: {stats['matched_by_name']}")
    print(f"No match found: {stats['no_match']}")

    total_new_matches = stats['matched_by_isil'] + stats['matched_by_name']
    total_with_wikidata = stats['already_had_wikidata'] + total_new_matches

    # Guard against an empty dataset so the percentage doesn't divide by zero.
    coverage_pct = 100 * total_with_wikidata / stats['total'] if stats['total'] else 0.0
    print(f"\n✅ Total institutions with Wikidata: {total_with_wikidata}/{stats['total']} " +
          f"({coverage_pct:.1f}%)")
    print(f"✅ New Wikidata matches added: {total_new_matches}")

    print("\n" + "=" * 60)
    print("✅ Wikidata Enrichment Complete")
    print("=" * 60)
|