""" Wikidata Enrichment for Danish GLAM Institutions Queries Wikidata SPARQL endpoint to find Q-numbers for Danish libraries and archives, then enriches the denmark_complete.json dataset with Wikidata identifiers. """ import json import re import time from pathlib import Path from typing import Dict, List, Optional, Tuple from urllib.parse import quote import requests from rapidfuzz import fuzz # Wikidata SPARQL endpoint WIKIDATA_SPARQL = "https://query.wikidata.org/sparql" def query_wikidata_libraries_denmark() -> List[Dict]: """Query Wikidata for libraries in Denmark.""" query = """ SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE { # Libraries in Denmark ?item wdt:P31/wdt:P279* wd:Q7075 . # instance of library (or subclass) ?item wdt:P17 wd:Q35 . # country: Denmark # Optional identifiers OPTIONAL { ?item wdt:P791 ?isil } # ISIL code OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity SERVICE wikibase:label { bd:serviceParam wikibase:language "da,en" } } ORDER BY ?itemLabel """ headers = { 'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)', 'Accept': 'application/sparql-results+json' } print("Querying Wikidata for Danish libraries...") response = requests.get( WIKIDATA_SPARQL, params={'query': query, 'format': 'json'}, headers=headers, timeout=60 ) response.raise_for_status() results = response.json() bindings = results['results']['bindings'] print(f" Found {len(bindings)} libraries in Wikidata") libraries = [] for binding in bindings: lib = { 'qid': binding['item']['value'].split('/')[-1], 'label': binding.get('itemLabel', {}).get('value', ''), 'isil': binding.get('isil', {}).get('value'), 'viaf': binding.get('viaf', {}).get('value'), 'city': binding.get('cityLabel', {}).get('value') } libraries.append(lib) return libraries def query_wikidata_archives_denmark() -> List[Dict]: """Query Wikidata for archives in Denmark.""" query = """ SELECT DISTINCT ?item ?itemLabel ?isil ?viaf ?coordinates ?city ?cityLabel WHERE { # Archives in Denmark { ?item wdt:P31/wdt:P279* wd:Q166118 . # instance of archive (or subclass) } UNION { ?item wdt:P31 wd:Q7075 . # or library with archival collections ?item wdt:P31 wd:Q166118 . } ?item wdt:P17 wd:Q35 . # country: Denmark # Optional identifiers OPTIONAL { ?item wdt:P791 ?isil } # ISIL code OPTIONAL { ?item wdt:P214 ?viaf } # VIAF ID OPTIONAL { ?item wdt:P625 ?coordinates } # Coordinates OPTIONAL { ?item wdt:P131 ?city } # Located in administrative entity SERVICE wikibase:label { bd:serviceParam wikibase:language "da,en" } } ORDER BY ?itemLabel """ headers = { 'User-Agent': 'GLAM-Data-Enrichment/0.1 (https://github.com/example/glam-data)', 'Accept': 'application/sparql-results+json' } print("\nQuerying Wikidata for Danish archives...") response = requests.get( WIKIDATA_SPARQL, params={'query': query, 'format': 'json'}, headers=headers, timeout=60 ) response.raise_for_status() results = response.json() bindings = results['results']['bindings'] print(f" Found {len(bindings)} archives in Wikidata") archives = [] for binding in bindings: archive = { 'qid': binding['item']['value'].split('/')[-1], 'label': binding.get('itemLabel', {}).get('value', ''), 'isil': binding.get('isil', {}).get('value'), 'viaf': binding.get('viaf', {}).get('value'), 'city': binding.get('cityLabel', {}).get('value') } archives.append(archive) return archives def parse_identifier_string(identifier_str: str) -> Optional[Dict]: """Parse identifier from string representation.""" if not identifier_str or not isinstance(identifier_str, str): return None scheme_match = re.search(r"'identifier_scheme':\s*'([^']+)'", identifier_str) value_match = re.search(r"'identifier_value':\s*'([^']+)'", identifier_str) url_match = re.search(r"'identifier_url':\s*'([^']+)'", identifier_str) if scheme_match and value_match: return { 'scheme': scheme_match.group(1), 'value': value_match.group(1), 'url': url_match.group(1) if url_match else None } return None def find_wikidata_match( institution: Dict, wikidata_institutions: List[Dict], threshold: int = 85 ) -> Optional[Tuple[Dict, int]]: """ Find best Wikidata match for an institution. Returns: Tuple of (wikidata_item, match_score) if found, else None """ inst_name = institution.get('name', '').lower() if not inst_name: return None # Extract ISIL code from institution if present inst_isil = None identifiers = institution.get('identifiers', []) for identifier_data in identifiers: identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'ISIL': inst_isil = identifier.get('value') break # First pass: Try exact ISIL match if inst_isil: for wd_item in wikidata_institutions: if wd_item.get('isil') == inst_isil: return (wd_item, 100) # Perfect match via ISIL # Second pass: Fuzzy match by name best_match = None best_score = 0 for wd_item in wikidata_institutions: wd_label = wd_item.get('label', '').lower() if not wd_label: continue # Calculate fuzzy similarity score = fuzz.ratio(inst_name, wd_label) # Bonus points for city match inst_city = None locations = institution.get('locations', []) if locations: first_loc = locations[0] if isinstance(first_loc, str): city_match = re.search(r"'city':\s*'([^']*)'", first_loc) if city_match: inst_city = city_match.group(1).lower() elif isinstance(first_loc, dict): inst_city = first_loc.get('city', '').lower() if inst_city and wd_item.get('city'): wd_city = wd_item['city'].lower() if inst_city in wd_city or wd_city in inst_city: score += 10 # City match bonus if score > best_score: best_score = score best_match = wd_item if best_score >= threshold: return (best_match, best_score) return None def enrich_with_wikidata( institutions: List[Dict], wikidata_libraries: List[Dict], wikidata_archives: List[Dict] ) -> Tuple[List[Dict], Dict]: """ Enrich institutions with Wikidata Q-numbers. Returns: Tuple of (enriched_institutions, statistics) """ stats = { 'total': len(institutions), 'libraries_checked': 0, 'archives_checked': 0, 'matched_by_isil': 0, 'matched_by_name': 0, 'no_match': 0, 'already_had_wikidata': 0 } enriched = [] for i, inst in enumerate(institutions, 1): if i % 100 == 0: print(f" Processing {i}/{len(institutions)} institutions...") inst_type = inst.get('institution_type') # Check if already has Wikidata ID has_wikidata = False identifiers = inst.get('identifiers', []) for identifier_data in identifiers: identifier = parse_identifier_string(identifier_data) if isinstance(identifier_data, str) else identifier_data if identifier and isinstance(identifier, dict) and identifier.get('scheme') == 'Wikidata': has_wikidata = True stats['already_had_wikidata'] += 1 break if not has_wikidata: # Try to find Wikidata match if inst_type == 'LIBRARY': stats['libraries_checked'] += 1 match = find_wikidata_match(inst, wikidata_libraries, threshold=85) elif inst_type == 'ARCHIVE': stats['archives_checked'] += 1 match = find_wikidata_match(inst, wikidata_archives, threshold=85) else: match = None if match: wd_item, score = match qid = wd_item['qid'] # Determine if it was ISIL or name match if score == 100: stats['matched_by_isil'] += 1 else: stats['matched_by_name'] += 1 # Add Wikidata identifier (as string representation to match existing format) wikidata_identifier = ( f"Identifier({{\n" f" 'identifier_scheme': 'Wikidata',\n" f" 'identifier_value': '{qid}',\n" f" 'identifier_url': 'https://www.wikidata.org/wiki/{qid}'\n" f"}})" ) if not inst.get('identifiers'): inst['identifiers'] = [] inst['identifiers'].append(wikidata_identifier) # Add enrichment metadata if not inst.get('enrichment_history'): inst['enrichment_history'] = [] inst['enrichment_history'].append({ 'enrichment_date': '2025-11-19', 'enrichment_method': 'Wikidata SPARQL query', 'enrichment_source': 'https://query.wikidata.org/sparql', 'match_score': score, 'matched_label': wd_item.get('label') }) else: stats['no_match'] += 1 enriched.append(inst) return enriched, stats def main(): print("=" * 60) print("Danish GLAM Dataset → Wikidata Enrichment") print("=" * 60) # Load dataset input_path = Path('data/instances/denmark_complete.json') print(f"\nLoading dataset from {input_path}...") with open(input_path, 'r') as f: institutions = json.load(f) print(f" Loaded {len(institutions)} institutions") # Query Wikidata try: wikidata_libraries = query_wikidata_libraries_denmark() time.sleep(2) # Rate limiting wikidata_archives = query_wikidata_archives_denmark() except Exception as e: print(f"❌ Error querying Wikidata: {e}") return # Enrich dataset print("\nEnriching dataset with Wikidata Q-numbers...") enriched_institutions, stats = enrich_with_wikidata( institutions, wikidata_libraries, wikidata_archives ) # Save enriched dataset output_path = Path('data/instances/denmark_complete_enriched.json') print(f"\nSaving enriched dataset to {output_path}...") with open(output_path, 'w') as f: json.dump(enriched_institutions, f, indent=2, ensure_ascii=False) size_mb = output_path.stat().st_size / (1024 * 1024) print(f" ✅ Saved ({size_mb:.2f} MB)") # Print statistics print("\n" + "=" * 60) print("Enrichment Statistics") print("=" * 60) print(f"Total institutions: {stats['total']}") print(f"Already had Wikidata: {stats['already_had_wikidata']}") print(f"Libraries checked: {stats['libraries_checked']}") print(f"Archives checked: {stats['archives_checked']}") print(f"Matched by ISIL: {stats['matched_by_isil']}") print(f"Matched by name: {stats['matched_by_name']}") print(f"No match found: {stats['no_match']}") total_new_matches = stats['matched_by_isil'] + stats['matched_by_name'] total_with_wikidata = stats['already_had_wikidata'] + total_new_matches print(f"\n✅ Total institutions with Wikidata: {total_with_wikidata}/{stats['total']} " + f"({100*total_with_wikidata/stats['total']:.1f}%)") print(f"✅ New Wikidata matches added: {total_new_matches}") print("\n" + "=" * 60) print("✅ Wikidata Enrichment Complete") print("=" * 60) if __name__ == '__main__': main()