# glam/scripts/scrapers/consolidate_austrian_data.py
# 2025-11-19 23:25:22 +01:00
# 416 lines · 15 KiB · Python
#!/usr/bin/env python3
"""
Austrian Heritage Institution Data Consolidator
Merges ISIL registry, Wikidata, and OpenStreetMap data for Austria
This script consolidates multiple Austrian data sources:
1. ISIL page files (194 files, ~1,920 institutions)
2. Wikidata SPARQL results (~4,863 institutions)
3. OpenStreetMap libraries (~748 libraries)
Outputs:
- Consolidated JSON with deduplication
- Statistics report
- Ready for LinkML conversion
Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
import json
import glob
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Set
from collections import defaultdict
from rapidfuzz import fuzz
# Configuration
# NOTE(review): hard-coded absolute path to one developer's machine — consider
# reading this from a CLI argument or environment variable before sharing.
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")
# Timestamped output paths; the UTC timestamp is frozen once, at import time,
# so both files from a single run share the same suffix.
OUTPUT_FILE = DATA_DIR / f"austrian_institutions_consolidated_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"
STATS_FILE = DATA_DIR / f"consolidation_stats_{datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')}.json"
# Minimum fuzz.ratio score (0-100 scale) for two institution names to be
# treated as the same institution during deduplication.
FUZZY_THRESHOLD = 85
def parse_isil_pages() -> List[Dict]:
    """Parse all page_XXX_data.json files under DATA_DIR.

    Returns a list of dicts with keys: ``name``, ``isil_code`` (may be
    None), ``data_source`` ('ISIL_REGISTRY') and ``source_file``.
    Unreadable files or files with an unknown structure are skipped with
    a warning rather than aborting the whole run.
    """
    institutions = []
    # Path.glob keeps this consistent with the pathlib usage elsewhere
    # in the script (the original round-tripped through glob.glob/str).
    page_files = sorted(DATA_DIR.glob("page_*_data.json"))
    print(f"📄 Parsing {len(page_files)} ISIL page files...")
    for filepath in page_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            # Handle two formats:
            # 1. Direct array: [{name, isil_code}, ...]
            # 2. Wrapped object: {institutions: [{name, isil}, ...]}
            if isinstance(data, dict) and 'institutions' in data:
                items = data['institutions']
            elif isinstance(data, list):
                items = data
            else:
                print(f"⚠️ Unknown format in {filepath}")
                continue
            for inst in items:
                # Only dict entries carry usable metadata; bare strings
                # and anything else are skipped.
                if not isinstance(inst, dict):
                    continue
                name = inst.get('name')
                # Different page formats use 'isil_code' or 'isil'.
                isil = inst.get('isil_code') or inst.get('isil')
                if not name:
                    continue  # entries without a name are unusable
                institutions.append({
                    'name': name.strip(),
                    'isil_code': isil.strip() if isil else None,
                    'data_source': 'ISIL_REGISTRY',
                    'source_file': filepath.name,
                })
        except Exception as e:
            # Best-effort: report and continue with the remaining pages.
            print(f"⚠️ Error reading {filepath}: {e}")
    print(f"✅ Parsed {len(institutions)} institutions from ISIL pages")
    return institutions
def parse_wikidata() -> List[Dict]:
    """Parse Wikidata SPARQL results (SPARQL 1.1 JSON bindings format).

    Returns one dict per binding with name, wikidata_id, description,
    institution_type, website, viaf, isil_code, city, and latitude /
    longitude parsed from a WKT ``Point(lon lat)`` literal when present.
    Bindings whose label is just the bare Q-number are dropped.
    """
    institutions = []
    wikidata_file = DATA_DIR / "austria_wikidata_institutions.json"
    print(f"📄 Parsing Wikidata SPARQL results...")
    try:
        with open(wikidata_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        bindings = data.get('results', {}).get('bindings', [])
        for binding in bindings:
            # Entity URI looks like http://www.wikidata.org/entity/Q123.
            item = binding.get('item', {}).get('value', '')
            q_number = item.split('/')[-1] if item else None
            name = binding.get('itemLabel', {}).get('value', '')
            # Skip if name is just Q-number (no proper label)
            if name.startswith('Q') and name[1:].isdigit():
                continue
            description = binding.get('itemDescription', {}).get('value', '')
            inst_type = binding.get('typeLabel', {}).get('value', '')
            website = binding.get('website', {}).get('value', '')
            viaf = binding.get('viaf', {}).get('value', '')
            isil = binding.get('isil', {}).get('value', '')
            coords = binding.get('coord', {}).get('value', '')
            city = binding.get('cityLabel', {}).get('value', '')
            # WKT point literal: "Point(lon lat)" — longitude comes first.
            lat, lon = None, None
            if coords and coords.startswith('Point('):
                try:
                    coords_clean = coords.replace('Point(', '').replace(')', '')
                    lon, lat = map(float, coords_clean.split())
                except ValueError:
                    # Malformed literal (non-numeric or wrong arity):
                    # leave lat/lon as None. The original bare `except:`
                    # also swallowed KeyboardInterrupt/SystemExit.
                    pass
            institutions.append({
                'name': name.strip(),
                'wikidata_id': q_number,
                'description': description,
                'institution_type': inst_type,
                'website': website,
                'viaf': viaf,
                'isil_code': isil,
                'city': city,
                'latitude': lat,
                'longitude': lon,
                'data_source': 'WIKIDATA',
                'source_file': 'austria_wikidata_institutions.json'
            })
        print(f"✅ Parsed {len(institutions)} institutions from Wikidata")
    except Exception as e:
        print(f"⚠️ Error reading Wikidata file: {e}")
    return institutions
def parse_osm() -> List[Dict]:
    """Parse OpenStreetMap library data (Overpass-style JSON export).

    Every element with a ``name`` (or fallback ``operator``) tag becomes
    one record tagged with source 'OPENSTREETMAP'; nameless elements are
    dropped. Read errors are reported and yield an empty list.
    """
    osm_file = DATA_DIR / "austria_osm_libraries.json"
    print(f"📄 Parsing OpenStreetMap data...")
    institutions = []
    try:
        with open(osm_file, 'r', encoding='utf-8') as f:
            payload = json.load(f)
        for element in payload.get('elements', []):
            tags = element.get('tags', {})
            # Fall back to the operator tag when the element has no name.
            label = tags.get('name', tags.get('operator', ''))
            if not label:
                continue
            record = {
                'name': label.strip(),
                'institution_type': 'library',
                'latitude': element.get('lat'),
                'longitude': element.get('lon'),
                'street': tags.get('addr:street'),
                'housenumber': tags.get('addr:housenumber'),
                'postcode': tags.get('addr:postcode'),
                'city': tags.get('addr:city'),
                # Both plain and contact:-prefixed tag variants occur.
                'website': tags.get('website') or tags.get('contact:website'),
                'phone': tags.get('phone') or tags.get('contact:phone'),
                'email': tags.get('email') or tags.get('contact:email'),
                'osm_id': element.get('id'),
                'osm_type': element.get('type'),
                'data_source': 'OPENSTREETMAP',
                'source_file': 'austria_osm_libraries.json',
            }
            institutions.append(record)
        print(f"✅ Parsed {len(institutions)} libraries from OSM")
    except Exception as e:
        print(f"⚠️ Error reading OSM file: {e}")
    return institutions
def fuzzy_match_name(name1: str, name2: str) -> float:
    """Return the case-insensitive fuzzy similarity of two names.

    Uses rapidfuzz's Levenshtein-based ``fuzz.ratio``, which returns a
    float score in [0, 100] (the previous ``-> int`` annotation was
    incorrect). Empty or None names score 0 so they never match.
    """
    if not name1 or not name2:
        return 0
    return fuzz.ratio(name1.lower(), name2.lower())
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """
    Deduplicate institutions using ISIL codes and fuzzy name matching.

    Priority:
    1. ISIL_REGISTRY (most authoritative for ISIL codes)
    2. WIKIDATA (rich metadata)
    3. OPENSTREETMAP (geocoding)

    Records sharing an ISIL code are merged outright; the remainder are
    merged by fuzzy name similarity (>= FUZZY_THRESHOLD). Merged records
    carry a ``data_sources`` list recording every contributing source.
    """
    print(f"\n🔍 Deduplicating {len(institutions)} institutions...")
    # Index by ISIL code
    by_isil: Dict[str, List[Dict]] = defaultdict(list)
    no_isil: List[Dict] = []
    for inst in institutions:
        isil = inst.get('isil_code')
        # Handle None or whitespace-only strings
        if isil:
            isil = isil.strip()
        if isil:
            by_isil[isil].append(inst)
        else:
            no_isil.append(inst)
    print(f" - {len(by_isil)} unique ISIL codes")
    print(f" - {len(no_isil)} institutions without ISIL")
    # Merge institutions with same ISIL code
    merged = []
    for isil, group in by_isil.items():
        if len(group) == 1:
            merged.append(group[0])
        else:
            # Merge metadata from all sources, first non-empty value wins.
            base = {'isil_code': isil, 'data_sources': []}
            for inst in group:
                base['data_sources'].append(inst.get('data_source'))
                for key, value in inst.items():
                    if key == 'data_source':
                        continue
                    if key not in base or not base[key]:
                        base[key] = value
            merged.append(base)
    # Fuzzy match institutions without ISIL
    print(f" - Fuzzy matching {len(no_isil)} institutions...")
    matched_indices: Set[int] = set()
    for i, inst1 in enumerate(no_isil):
        if i in matched_indices:
            continue
        # Try to match with existing merged institutions
        best_match = None
        best_score = 0
        for candidate in merged:
            score = fuzzy_match_name(inst1.get('name', ''), candidate.get('name', ''))
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = candidate
        if best_match:
            # Merge into existing institution
            if 'data_sources' not in best_match:
                best_match['data_sources'] = [best_match.get('data_source')]
            best_match['data_sources'].append(inst1.get('data_source'))
            for key, value in inst1.items():
                if key == 'data_source':
                    continue
                if key not in best_match or not best_match[key]:
                    best_match[key] = value
            matched_indices.add(i)
        else:
            # Try to match with other no_isil institutions. Append to the
            # provenance list instead of reassigning it (the original
            # overwrote data_sources on every additional match, keeping
            # only the last duplicate's source).
            for j in range(i + 1, len(no_isil)):
                if j in matched_indices:
                    continue
                score = fuzzy_match_name(inst1.get('name', ''), no_isil[j].get('name', ''))
                if score >= FUZZY_THRESHOLD:
                    inst1.setdefault('data_sources', [inst1.get('data_source')])
                    inst1['data_sources'].append(no_isil[j].get('data_source'))
                    for key, value in no_isil[j].items():
                        if key == 'data_source':
                            continue
                        if key not in inst1 or not inst1[key]:
                            inst1[key] = value
                    matched_indices.add(j)
    # Add unmatched no_isil institutions. setdefault preserves any
    # data_sources list built in the intra-no_isil merge above (the
    # original unconditionally overwrote it, losing merge provenance).
    for i, inst in enumerate(no_isil):
        if i not in matched_indices:
            inst.setdefault('data_sources', [inst.get('data_source')])
            merged.append(inst)
    print(f"✅ Deduplicated to {len(merged)} unique institutions")
    return merged
def generate_statistics(institutions: List[Dict]) -> Dict:
    """Build a summary dict for the consolidated institution list.

    Tallies records per source, per type and per city, plus coverage of
    key identifiers (ISIL, Wikidata ID, coordinates, website) and the
    number of multi-source merges. Only the 20 most frequent cities are
    retained, sorted by descending count.
    """
    stats = {
        'total_institutions': len(institutions),
        'by_source': defaultdict(int),
        'by_type': defaultdict(int),
        'with_isil': 0,
        'with_wikidata': 0,
        'with_geocoding': 0,
        'with_website': 0,
        'multi_source': 0,
        'cities': defaultdict(int),
        'generation_date': datetime.now(timezone.utc).isoformat()
    }
    for record in institutions:
        # Merged records carry 'data_sources'; raw ones only 'data_source'.
        source_list = record.get('data_sources', [record.get('data_source')])
        for src in source_list:
            stats['by_source'][src] += 1
        if len(source_list) > 1:
            stats['multi_source'] += 1
        # Identifier / metadata coverage.
        if record.get('isil_code'):
            stats['with_isil'] += 1
        if record.get('wikidata_id'):
            stats['with_wikidata'] += 1
        if record.get('latitude') and record.get('longitude'):
            stats['with_geocoding'] += 1
        if record.get('website'):
            stats['with_website'] += 1
        stats['by_type'][record.get('institution_type', 'unknown')] += 1
        city = record.get('city', 'unknown')
        if city:
            stats['cities'][city] += 1
    # Plain dicts serialize cleanly to JSON; keep only the top 20 cities.
    stats['by_source'] = dict(stats['by_source'])
    stats['by_type'] = dict(stats['by_type'])
    top_cities = sorted(stats['cities'].items(), key=lambda kv: kv[1], reverse=True)[:20]
    stats['cities'] = dict(top_cities)
    return stats
def main():
    """Main consolidation workflow: parse, deduplicate, export, report."""
    print("🇦🇹 Austrian Heritage Institution Data Consolidation")
    print("=" * 60)
    # Parse all data sources
    isil_institutions = parse_isil_pages()
    wikidata_institutions = parse_wikidata()
    osm_institutions = parse_osm()
    # Combine all sources
    all_institutions = isil_institutions + wikidata_institutions + osm_institutions
    print(f"\n📊 Total raw institutions: {len(all_institutions)}")
    # Deduplicate
    consolidated = deduplicate_institutions(all_institutions)
    # Generate statistics
    print(f"\n📈 Generating statistics...")
    stats = generate_statistics(consolidated)
    # Export consolidated data
    print(f"\n💾 Exporting consolidated data...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(consolidated, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved to: {OUTPUT_FILE}")
    print(f" Size: {OUTPUT_FILE.stat().st_size / 1024:.1f} KB")
    # Export statistics
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"✅ Statistics saved to: {STATS_FILE}")
    # Print summary
    total = stats['total_institutions']
    # Guard against an empty result set — the original divided by
    # total_institutions unconditionally and crashed with
    # ZeroDivisionError when no source files were found.
    denom = total or 1
    print(f"\n" + "=" * 60)
    print(f"📊 CONSOLIDATION SUMMARY")
    print(f"=" * 60)
    print(f"Total unique institutions: {total}")
    print(f"")
    print(f"By source:")
    for source, count in stats['by_source'].items():
        print(f" - {source}: {count}")
    print(f"")
    print(f"Multi-source records: {stats['multi_source']}")
    print(f"")
    print(f"Coverage:")
    print(f" - With ISIL codes: {stats['with_isil']} ({stats['with_isil']/denom*100:.1f}%)")
    print(f" - With Wikidata IDs: {stats['with_wikidata']} ({stats['with_wikidata']/denom*100:.1f}%)")
    print(f" - With geocoding: {stats['with_geocoding']} ({stats['with_geocoding']/denom*100:.1f}%)")
    print(f" - With websites: {stats['with_website']} ({stats['with_website']/denom*100:.1f}%)")
    print(f"")
    print(f"Top 5 cities:")
    for i, (city, count) in enumerate(list(stats['cities'].items())[:5], 1):
        print(f" {i}. {city}: {count}")
    print(f"=" * 60)
if __name__ == "__main__":
    main()