#!/usr/bin/env python3
"""
Bulgarian ISIL Registry Enrichment Script

Enriches Bulgarian institutions with OSM, Wikidata, and VIAF data.
"""
import json
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import requests
import yaml
from rapidfuzz import fuzz

# Minimum token_sort_ratio score for a fuzzy name match to be accepted.
FUZZY_MATCH_THRESHOLD = 75
# Score at or above which a fuzzy match counts as high confidence.
HIGH_CONFIDENCE_THRESHOLD = 85


def _pct(part: int, total: int) -> float:
    """Return *part* as a percentage of *total*; 0.0 when total is zero."""
    return part / total * 100 if total else 0.0


def _primary_location(inst: Dict) -> Dict:
    """Return the institution's first location dict, or {} when the
    'locations' key is missing or an empty list (avoids IndexError)."""
    locations = inst.get('locations') or [{}]
    return locations[0]


def fetch_osm_libraries() -> List[Dict]:
    """Fetch Bulgarian libraries/archives/museums from OpenStreetMap.

    Returns:
        A list of dicts with name, coordinates, address and contact
        fields; an empty list when the Overpass request fails.
    """
    print("Fetching OSM data for Bulgaria...")
    overpass_url = "https://overpass-api.de/api/interpreter"
    # NOTE: "out center;" makes Overpass attach a representative 'center'
    # coordinate to ways/relations.  The previous "out body; >; out skel qt;"
    # form never produced one, so the parser below silently dropped every
    # non-node feature.
    query = """
    [out:json][timeout:90];
    area["ISO3166-1"="BG"][admin_level=2]->.searchArea;
    (
      node["amenity"="library"](area.searchArea);
      way["amenity"="library"](area.searchArea);
      relation["amenity"="library"](area.searchArea);
      node["amenity"="archive"](area.searchArea);
      way["amenity"="archive"](area.searchArea);
      node["tourism"="museum"](area.searchArea);
      way["tourism"="museum"](area.searchArea);
    );
    out center;
    """
    try:
        response = requests.post(overpass_url, data={'data': query}, timeout=120)
        response.raise_for_status()
        data = response.json()

        libraries = []
        for element in data.get('elements', []):
            tags = element.get('tags', {})
            name = tags.get('name') or tags.get('name:bg') or tags.get('name:en')
            if not name:
                continue
            # Nodes carry lat/lon directly; ways/relations carry 'center'.
            lat = element.get('lat') or element.get('center', {}).get('lat')
            lon = element.get('lon') or element.get('center', {}).get('lon')
            libraries.append({
                'name': name,
                'name_bg': tags.get('name:bg'),
                'name_en': tags.get('name:en'),
                'latitude': lat,
                'longitude': lon,
                'city': tags.get('addr:city'),
                'street': tags.get('addr:street'),
                'website': tags.get('website') or tags.get('contact:website'),
                'wikidata': tags.get('wikidata'),
                'osm_id': element.get('id')
            })

        print(f"✓ Found {len(libraries)} locations in OSM")
        return libraries
    except Exception as e:
        # Best-effort fetch: downstream matching degrades gracefully on [].
        print(f"✗ OSM fetch failed: {e}")
        return []


def fetch_wikidata_institutions() -> List[Dict]:
    """Query Wikidata for Bulgarian heritage institutions.

    Returns:
        A list of dicts with QID, labels, ISIL, VIAF, website,
        coordinates and address; an empty list on query failure.
    """
    print("Querying Wikidata for Bulgarian institutions...")
    endpoint = "https://query.wikidata.org/sparql"
    # Types: library, archive, museum and related GLAM classes; P17 wd:Q219
    # restricts results to Bulgaria.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords ?address WHERE {
      VALUES ?type { wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400 wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318 }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q219 .
      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P625 ?coords }
      OPTIONAL { ?item wdt:P6375 ?address }
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "bg,en,ru".
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    """
    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=60
        )
        response.raise_for_status()
        data = response.json()

        institutions = []
        for result in data['results']['bindings']:
            item_id = result['item']['value'].split('/')[-1]

            # P625 literals look like "Point(lon lat)" (WKT: x=lon, y=lat).
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    lon, lat = float(parts[0]), float(parts[1])

            # Alt labels arrive comma-separated; strip each entry so fuzzy
            # matching doesn't see leading spaces.
            alt_raw = result.get('itemAltLabel', {}).get('value', '')
            alt_labels = [s.strip() for s in alt_raw.split(',') if s.strip()]

            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'address': result.get('address', {}).get('value')
            })

        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions
    except Exception as e:
        print(f"✗ Wikidata query failed: {e}")
        return []


def fuzzy_match_institution(inst_name: str, candidates: List[Dict],
                            name_fields: List[str]) -> Optional[tuple]:
    """Fuzzy-match an institution name against candidate records.

    Args:
        inst_name: Name to look up.
        candidates: Candidate dicts (OSM or Wikidata records).
        name_fields: Keys of each candidate holding either a name string
            or a list of name strings.

    Returns:
        (best_candidate, score) when the best token_sort_ratio score is
        at least FUZZY_MATCH_THRESHOLD, otherwise None.
    """
    best_match = None
    best_score = 0
    inst_name_clean = inst_name.lower().strip()

    for candidate in candidates:
        # Collect every available name variant for this candidate.
        names_to_try = []
        for field in name_fields:
            value = candidate.get(field)
            if value:
                if isinstance(value, list):
                    names_to_try.extend(value)
                else:
                    names_to_try.append(value)

        for name in names_to_try:
            if not name:
                continue
            score = fuzz.token_sort_ratio(inst_name_clean, name.lower().strip())
            if score > best_score:
                best_score = score
                best_match = candidate

    if best_score >= FUZZY_MATCH_THRESHOLD:
        return (best_match, best_score)
    return None


def enrich_institutions(base_file: str, osm_data: List[Dict],
                        wikidata_data: List[Dict]) -> tuple:
    """Enrich Bulgarian institutions with external data.

    Each institution from *base_file* is matched against Wikidata (exact
    ISIL first, then fuzzy name) and OSM (fuzzy name); VIAF ids,
    coordinates and websites missing from the base record are collected.

    Returns:
        (enrichments, stats): enrichment dicts for institutions with at
        least one match, and a counters dict.
    """
    print(f"\nLoading base institutions from {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✓ Loaded {len(institutions)} institutions")

    print("\nEnriching institutions...")
    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0
    }

    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')
        enrichment = {
            'id': inst.get('id'),
            'name': inst_name,
            'isil': next((i['identifier_value']
                          for i in inst.get('identifiers', [])
                          if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }

        # --- Wikidata: exact ISIL match is authoritative (score 100).
        wikidata_match = None
        inst_isil = enrichment['isil']
        if inst_isil:
            for wd in wikidata_data:
                if wd.get('isil') == inst_isil:
                    wikidata_match = wd
                    enrichment['matches'].append({
                        'source': 'wikidata',
                        'match_type': 'isil_exact',
                        'score': 100,
                        'qid': wd['qid']
                    })
                    stats['wikidata_matched'] += 1
                    stats['high_confidence'] += 1
                    break

        # Fuzzy name match only when no ISIL match was found.
        if not wikidata_match:
            result = fuzzy_match_institution(inst_name, wikidata_data,
                                             ['label', 'alt_labels'])
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= HIGH_CONFIDENCE_THRESHOLD:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1

        # --- Copy Wikidata attributes the base record lacks.
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1
            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1
            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                # Only fill coordinates the base record does not have.
                if not _primary_location(inst).get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1

        # --- OSM: fuzzy name match; fills gaps Wikidata left.
        osm_result = fuzzy_match_institution(inst_name, osm_data,
                                             ['name', 'name_bg', 'name_en'])
        if osm_result:
            osm_match, osm_score = osm_result
            enrichment['matches'].append({
                'source': 'osm',
                'match_type': 'name_fuzzy',
                'score': osm_score,
                'osm_id': osm_match['osm_id']
            })
            stats['osm_matched'] += 1

            if osm_match.get('latitude') and osm_match.get('longitude'):
                if (not _primary_location(inst).get('latitude')
                        and 'latitude' not in enrichment):
                    enrichment['latitude'] = osm_match['latitude']
                    enrichment['longitude'] = osm_match['longitude']
                    stats['coords_added'] += 1
            if (osm_match.get('website') and not inst.get('homepage')
                    and 'website' not in enrichment):
                enrichment['website'] = osm_match['website']
                stats['website_added'] += 1

        if enrichment['matches']:
            enrichments.append(enrichment)

        if idx % 10 == 0:
            print(f"  Processed {idx}/{stats['total']} institutions...")

    # Percentages via _pct so an empty base file cannot divide by zero.
    total = stats['total']
    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {total}")
    print(f"  Wikidata matches: {stats['wikidata_matched']} "
          f"({_pct(stats['wikidata_matched'], total):.1f}%)")
    print(f"    - High confidence (≥85%): {stats['high_confidence']}")
    print(f"    - Medium confidence (75-84%): {stats['medium_confidence']}")
    print(f"  OSM matches: {stats['osm_matched']} "
          f"({_pct(stats['osm_matched'], total):.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {len(enrichments)} "
          f"({_pct(len(enrichments), total):.1f}%)")

    return enrichments, stats


def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to the base YAML registry and save the result.

    Args:
        base_file: Path to the source YAML registry.
        enrichments: Output of enrich_institutions().
        output_file: Path for the enriched YAML file.
    """
    print(f"\nApplying enrichments to {base_file}...")
    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    enrichment_map = {e['id']: e for e in enrichments}
    enriched_count = 0

    for inst in institutions:
        enrich_data = enrichment_map.get(inst.get('id'))
        if not enrich_data:
            continue

        identifiers = inst.setdefault('identifiers', [])
        existing_schemes = {i.get('identifier_scheme') for i in identifiers}

        # Wikidata QID: taken from the first match that carries one.
        qid = next((m['qid'] for m in enrich_data.get('matches', [])
                    if m.get('qid')), None)
        if qid and 'Wikidata' not in existing_schemes:
            identifiers.append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })

        if 'viaf' in enrich_data and 'VIAF' not in existing_schemes:
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': enrich_data['viaf'],
                'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
            })

        if 'latitude' in enrich_data and 'longitude' in enrich_data:
            locations = inst.setdefault('locations', [])
            if not locations:
                # Guard: an existing empty 'locations' list would otherwise
                # raise IndexError on locations[0].
                locations.append({})
            if not locations[0].get('latitude'):
                locations[0]['latitude'] = enrich_data['latitude']
                locations[0]['longitude'] = enrich_data['longitude']

        if 'website' in enrich_data and not inst.get('homepage'):
            inst['homepage'] = enrich_data['website']
            if 'Website' not in existing_schemes:
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': enrich_data['website'],
                    'identifier_url': enrich_data['website']
                })

        enriched_count += 1

    # Save enriched YAML with a comment header (provenance + counts).
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Bulgarian ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data to {output_file}")
    print(f"  Enriched {enriched_count}/{len(institutions)} institutions")


def main():
    """Run the full pipeline: fetch OSM + Wikidata, enrich, apply, save."""
    print("=" * 70)
    print("Bulgarian ISIL Registry Enrichment")
    print("=" * 70)

    base_file = "data/instances/bulgaria_isil_libraries.yaml"
    output_file = "data/instances/bulgaria_complete.yaml"

    # Ensure the cache directory exists before writing intermediate files.
    cache_dir = Path('data/isil/bulgaria')
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Step 1: Fetch OSM data
    osm_data = fetch_osm_libraries()
    with open(cache_dir / 'bulgaria_osm_libraries.json', 'w',
              encoding='utf-8') as f:
        json.dump(osm_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved OSM data to data/isil/bulgaria/bulgaria_osm_libraries.json")

    # Step 2: Fetch Wikidata (pause between external services for politeness)
    time.sleep(2)  # Rate limiting
    wikidata_data = fetch_wikidata_institutions()
    with open(cache_dir / 'bulgaria_wikidata_institutions.json', 'w',
              encoding='utf-8') as f:
        json.dump(wikidata_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved Wikidata data to data/isil/bulgaria/bulgaria_wikidata_institutions.json")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, osm_data, wikidata_data)
    with open(cache_dir / 'bulgaria_enrichments.json', 'w',
              encoding='utf-8') as f:
        json.dump({
            'enrichments': enrichments,
            'stats': stats,
            'generated': datetime.now(timezone.utc).isoformat()
        }, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enrichment data to data/isil/bulgaria/bulgaria_enrichments.json")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)

    print("\n" + "=" * 70)
    print("✓ Bulgarian ISIL enrichment complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()