#!/usr/bin/env python3
"""
Austrian Heritage Institution Data Consolidator
Merges ISIL registry, Wikidata, and OpenStreetMap data for Austria

This script consolidates multiple Austrian data sources:
1. ISIL page files (194 files, ~1,920 institutions)
2. Wikidata SPARQL results (~4,863 institutions)
3. OpenStreetMap libraries (~748 libraries)

Outputs:
- Consolidated JSON with deduplication
- Statistics report
- Ready for LinkML conversion

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""

import json
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Set

from rapidfuzz import fuzz

# Configuration
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")

# Compute the timestamp once so both output files share the same suffix
# (two separate now() calls could straddle a second boundary).
_RUN_TIMESTAMP = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
OUTPUT_FILE = DATA_DIR / f"austrian_institutions_consolidated_{_RUN_TIMESTAMP}.json"
STATS_FILE = DATA_DIR / f"consolidation_stats_{_RUN_TIMESTAMP}.json"

# Fuzzy matching threshold for deduplication (0-100 name-similarity score)
FUZZY_THRESHOLD = 85


def parse_isil_pages() -> List[Dict]:
    """Parse all page_XXX_data.json files in DATA_DIR.

    Returns a list of flat dicts with keys: name, isil_code, data_source,
    source_file.  Entries without a name are skipped.
    """
    institutions = []
    page_files = sorted(DATA_DIR.glob("page_*_data.json"))

    print(f"šŸ“„ Parsing {len(page_files)} ISIL page files...")

    for filepath in page_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle two formats:
            # 1. Direct array: [{name, isil_code}, ...]
            # 2. Wrapped object: {institutions: [{name, isil}, ...]}
            if isinstance(data, dict) and 'institutions' in data:
                items = data['institutions']
            elif isinstance(data, list):
                items = data
            else:
                print(f"āš ļø Unknown format in {filepath}")
                continue

            for inst in items:
                # Handle both string and dict formats
                if isinstance(inst, str):
                    continue  # Skip string entries
                if not isinstance(inst, dict):
                    continue

                name = inst.get('name')
                # Check both 'isil_code' and 'isil' fields
                isil = inst.get('isil_code') or inst.get('isil')

                # Skip entries with no name
                if not name:
                    continue

                institutions.append({
                    'name': name.strip() if name else '',
                    'isil_code': isil.strip() if isil else None,
                    'data_source': 'ISIL_REGISTRY',
                    'source_file': Path(filepath).name
                })
        except Exception as e:
            # Best-effort: a single corrupt page file must not abort the run.
            print(f"āš ļø Error reading {filepath}: {e}")

    print(f"āœ… Parsed {len(institutions)} institutions from ISIL pages")
    return institutions


def parse_wikidata() -> List[Dict]:
    """Parse Wikidata SPARQL results (standard SPARQL-JSON bindings format).

    Returns a list of flat dicts; coordinates are split into latitude /
    longitude floats when the WKT ``Point(lon lat)`` literal parses.
    """
    institutions = []
    wikidata_file = DATA_DIR / "austria_wikidata_institutions.json"

    print(f"šŸ“„ Parsing Wikidata SPARQL results...")

    try:
        with open(wikidata_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        bindings = data.get('results', {}).get('bindings', [])

        for binding in bindings:
            item = binding.get('item', {}).get('value', '')
            # Entity URI looks like http://www.wikidata.org/entity/Q123
            q_number = item.split('/')[-1] if item else None

            name = binding.get('itemLabel', {}).get('value', '')
            # Skip if name is just Q-number (no proper label)
            if name.startswith('Q') and name[1:].isdigit():
                continue

            description = binding.get('itemDescription', {}).get('value', '')
            inst_type = binding.get('typeLabel', {}).get('value', '')
            website = binding.get('website', {}).get('value', '')
            viaf = binding.get('viaf', {}).get('value', '')
            isil = binding.get('isil', {}).get('value', '')
            coords = binding.get('coord', {}).get('value', '')
            city = binding.get('cityLabel', {}).get('value', '')

            # Parse coordinates if present; WKT order is "Point(lon lat)"
            lat, lon = None, None
            if coords and coords.startswith('Point('):
                try:
                    coords_clean = coords.replace('Point(', '').replace(')', '')
                    lon, lat = map(float, coords_clean.split())
                except ValueError:
                    # Malformed point literal: leave lat/lon as None
                    pass

            institutions.append({
                'name': name.strip(),
                'wikidata_id': q_number,
                'description': description,
                'institution_type': inst_type,
                'website': website,
                'viaf': viaf,
                'isil_code': isil,
                'city': city,
                'latitude': lat,
                'longitude': lon,
                'data_source': 'WIKIDATA',
                'source_file': 'austria_wikidata_institutions.json'
            })

        print(f"āœ… Parsed {len(institutions)} institutions from Wikidata")
    except Exception as e:
        print(f"āš ļø Error reading Wikidata file: {e}")

    return institutions


def parse_osm() -> List[Dict]:
    """Parse OpenStreetMap library data (Overpass API JSON elements).

    Unnamed elements fall back to the ``operator`` tag; elements with
    neither are skipped.
    """
    institutions = []
    osm_file = DATA_DIR / "austria_osm_libraries.json"

    print(f"šŸ“„ Parsing OpenStreetMap data...")

    try:
        with open(osm_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        elements = data.get('elements', [])

        for element in elements:
            tags = element.get('tags', {})
            name = tags.get('name', tags.get('operator', ''))

            if not name:
                continue

            institutions.append({
                'name': name.strip(),
                'institution_type': 'library',
                'latitude': element.get('lat'),
                'longitude': element.get('lon'),
                'street': tags.get('addr:street'),
                'housenumber': tags.get('addr:housenumber'),
                'postcode': tags.get('addr:postcode'),
                'city': tags.get('addr:city'),
                # OSM tagging uses both plain and "contact:"-prefixed keys
                'website': tags.get('website') or tags.get('contact:website'),
                'phone': tags.get('phone') or tags.get('contact:phone'),
                'email': tags.get('email') or tags.get('contact:email'),
                'osm_id': element.get('id'),
                'osm_type': element.get('type'),
                'data_source': 'OPENSTREETMAP',
                'source_file': 'austria_osm_libraries.json'
            })

        print(f"āœ… Parsed {len(institutions)} libraries from OSM")
    except Exception as e:
        print(f"āš ļø Error reading OSM file: {e}")

    return institutions


def fuzzy_match_name(name1: str, name2: str) -> float:
    """Return the case-insensitive fuzzy match score (0-100) for two names.

    Note: rapidfuzz's ``fuzz.ratio`` returns a float, so the annotation is
    ``float`` (the original ``int`` annotation was inaccurate).
    """
    if not name1 or not name2:
        return 0
    return fuzz.ratio(name1.lower(), name2.lower())


def _merge_fields(target: Dict, source: Dict) -> None:
    """Copy fields from *source* into *target*, keeping existing non-empty values.

    The per-record 'data_source' key is skipped; merged records track
    provenance in a 'data_sources' list instead.
    """
    for key, value in source.items():
        if key == 'data_source':
            continue
        if key not in target or not target[key]:
            target[key] = value


def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """
    Deduplicate institutions using ISIL codes and fuzzy name matching.

    Priority:
    1. ISIL_REGISTRY (most authoritative for ISIL codes)
    2. WIKIDATA (rich metadata)
    3. OPENSTREETMAP (geocoding)

    The priority is implicit: the caller concatenates sources in that
    order and the merge keeps the first non-empty value per field.
    """
    print(f"\nšŸ” Deduplicating {len(institutions)} institutions...")

    # Index by ISIL code
    by_isil: Dict[str, List[Dict]] = defaultdict(list)
    no_isil: List[Dict] = []

    for inst in institutions:
        # Handle None or empty/whitespace-only strings uniformly
        isil = (inst.get('isil_code') or '').strip()
        if isil:
            by_isil[isil].append(inst)
        else:
            no_isil.append(inst)

    print(f" - {len(by_isil)} unique ISIL codes")
    print(f" - {len(no_isil)} institutions without ISIL")

    # Merge institutions with same ISIL code
    merged: List[Dict] = []
    for isil, group in by_isil.items():
        if len(group) == 1:
            merged.append(group[0])
        else:
            # Merge metadata from all sources sharing this ISIL
            base: Dict = {'isil_code': isil, 'data_sources': []}
            for inst in group:
                base['data_sources'].append(inst.get('data_source'))
                _merge_fields(base, inst)
            merged.append(base)

    # Fuzzy match institutions without ISIL
    print(f" - Fuzzy matching {len(no_isil)} institutions...")

    matched_indices: Set[int] = set()
    for i, inst1 in enumerate(no_isil):
        if i in matched_indices:
            continue

        # Try to match with existing merged institutions
        best_match = None
        best_score = 0
        for candidate in merged:
            score = fuzzy_match_name(inst1.get('name', ''), candidate.get('name', ''))
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = candidate

        if best_match is not None:
            # Merge into existing institution
            best_match.setdefault('data_sources', [best_match.get('data_source')])
            best_match['data_sources'].append(inst1.get('data_source'))
            _merge_fields(best_match, inst1)
            matched_indices.add(i)
        else:
            # Try to match with later no_isil institutions; inst1 absorbs them
            for j in range(i + 1, len(no_isil)):
                if j in matched_indices:
                    continue
                score = fuzzy_match_name(inst1.get('name', ''),
                                         no_isil[j].get('name', ''))
                if score >= FUZZY_THRESHOLD:
                    # BUG FIX: append each absorbed source; the original
                    # reassigned the list per match, dropping earlier ones.
                    inst1.setdefault('data_sources', [inst1.get('data_source')])
                    inst1['data_sources'].append(no_isil[j].get('data_source'))
                    _merge_fields(inst1, no_isil[j])
                    matched_indices.add(j)

    # Add unmatched no_isil institutions (including absorbers from above)
    for i, inst in enumerate(no_isil):
        if i not in matched_indices:
            # BUG FIX: setdefault preserves a data_sources list accumulated
            # in the absorb branch; the original overwrote it.
            inst.setdefault('data_sources', [inst.get('data_source')])
            merged.append(inst)

    print(f"āœ… Deduplicated to {len(merged)} unique institutions")
    return merged


def generate_statistics(institutions: List[Dict]) -> Dict:
    """Generate consolidation statistics for the deduplicated records.

    Returns a JSON-serializable dict with per-source / per-type counts,
    feature coverage counters, and the top-20 cities by record count.
    """
    stats = {
        'total_institutions': len(institutions),
        'by_source': defaultdict(int),
        'by_type': defaultdict(int),
        'with_isil': 0,
        'with_wikidata': 0,
        'with_geocoding': 0,
        'with_website': 0,
        'multi_source': 0,
        'cities': defaultdict(int),
        'generation_date': datetime.now(timezone.utc).isoformat()
    }

    for inst in institutions:
        # Count sources (fall back to the single-source key if never merged)
        sources = inst.get('data_sources', [inst.get('data_source')])
        for source in sources:
            stats['by_source'][source] += 1
        if len(sources) > 1:
            stats['multi_source'] += 1

        # Count features
        if inst.get('isil_code'):
            stats['with_isil'] += 1
        if inst.get('wikidata_id'):
            stats['with_wikidata'] += 1
        if inst.get('latitude') and inst.get('longitude'):
            stats['with_geocoding'] += 1
        if inst.get('website'):
            stats['with_website'] += 1

        # Count by type; empty strings count as 'unknown' (the original
        # created a separate '' bucket for records with an empty type)
        inst_type = inst.get('institution_type') or 'unknown'
        stats['by_type'][inst_type] += 1

        # Count by city; missing AND empty both map to 'unknown' (the
        # original skipped empty-string cities but counted missing ones)
        city = inst.get('city') or 'unknown'
        stats['cities'][city] += 1

    # Convert defaultdicts to regular dicts for JSON serialization
    stats['by_source'] = dict(stats['by_source'])
    stats['by_type'] = dict(stats['by_type'])
    stats['cities'] = dict(sorted(stats['cities'].items(),
                                  key=lambda x: x[1],
                                  reverse=True)[:20])  # Top 20 cities

    return stats


def main():
    """Main consolidation workflow: parse, merge, dedupe, export, report."""
    print("šŸ‡¦šŸ‡¹ Austrian Heritage Institution Data Consolidation")
    print("=" * 60)

    # Parse all data sources
    isil_institutions = parse_isil_pages()
    wikidata_institutions = parse_wikidata()
    osm_institutions = parse_osm()

    # Combine all sources (order establishes merge priority: ISIL > WD > OSM)
    all_institutions = isil_institutions + wikidata_institutions + osm_institutions
    print(f"\nšŸ“Š Total raw institutions: {len(all_institutions)}")

    # Deduplicate
    consolidated = deduplicate_institutions(all_institutions)

    # Generate statistics
    print(f"\nšŸ“ˆ Generating statistics...")
    stats = generate_statistics(consolidated)

    # Export consolidated data
    print(f"\nšŸ’¾ Exporting consolidated data...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(consolidated, f, indent=2, ensure_ascii=False)
    print(f"āœ… Saved to: {OUTPUT_FILE}")
    print(f" Size: {OUTPUT_FILE.stat().st_size / 1024:.1f} KB")

    # Export statistics
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"āœ… Statistics saved to: {STATS_FILE}")

    # Print summary
    print(f"\n" + "=" * 60)
    print(f"šŸ“Š CONSOLIDATION SUMMARY")
    print(f"=" * 60)
    print(f"Total unique institutions: {stats['total_institutions']}")
    print(f"")
    print(f"By source:")
    for source, count in stats['by_source'].items():
        print(f" - {source}: {count}")
    print(f"")
    print(f"Multi-source records: {stats['multi_source']}")
    print(f"")
    print(f"Coverage:")
    total = stats['total_institutions']
    print(f" - With ISIL codes: {stats['with_isil']} ({stats['with_isil']/total*100:.1f}%)")
    print(f" - With Wikidata IDs: {stats['with_wikidata']} ({stats['with_wikidata']/total*100:.1f}%)")
    print(f" - With geocoding: {stats['with_geocoding']} ({stats['with_geocoding']/total*100:.1f}%)")
    print(f" - With websites: {stats['with_website']} ({stats['with_website']/total*100:.1f}%)")
    print(f"")
    print(f"Top 5 cities:")
    for i, (city, count) in enumerate(list(stats['cities'].items())[:5], 1):
        print(f" {i}. {city}: {count}")
    print(f"=" * 60)


if __name__ == "__main__":
    main()