#!/usr/bin/env python3
"""
Austrian Heritage Institution Data Consolidator

Merges ISIL registry, Wikidata, and OpenStreetMap data for Austria.

This script consolidates multiple Austrian data sources:
1. ISIL page files (194 files, ~1,920 institutions)
2. Wikidata SPARQL results (~4,863 institutions)
3. OpenStreetMap libraries (~748 libraries)

Outputs:
- Consolidated JSON with deduplication
- Statistics report
- Ready for LinkML conversion

Author: OpenCode + MCP Tools
Date: 2025-11-19
"""
|
|
|
|
import json
|
|
import glob
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Set
|
|
from collections import defaultdict
|
|
from rapidfuzz import fuzz
|
|
|
|
# Configuration
DATA_DIR = Path("/Users/kempersc/apps/glam/data/isil/austria")

# Capture the run timestamp once so both output files share the same
# suffix; calling datetime.now() separately per filename could straddle
# a second boundary and produce mismatched names.
_RUN_STAMP = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
OUTPUT_FILE = DATA_DIR / f"austrian_institutions_consolidated_{_RUN_STAMP}.json"
STATS_FILE = DATA_DIR / f"consolidation_stats_{_RUN_STAMP}.json"

# Fuzzy matching threshold for deduplication (rapidfuzz ratio, 0-100)
FUZZY_THRESHOLD = 85
|
|
def parse_isil_pages() -> List[Dict]:
    """Read every page_XXX_data.json file and collect institution records."""
    results: List[Dict] = []
    page_files = sorted(glob.glob(str(DATA_DIR / "page_*_data.json")))

    print(f"📄 Parsing {len(page_files)} ISIL page files...")

    for filepath in page_files:
        try:
            with open(filepath, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            # Two shapes occur in the dumps: a bare list of records, or an
            # object wrapping them under an "institutions" key.
            if isinstance(payload, dict) and 'institutions' in payload:
                entries = payload['institutions']
            elif isinstance(payload, list):
                entries = payload
            else:
                print(f"⚠️ Unknown format in {filepath}")
                continue

            source_name = Path(filepath).name
            for entry in entries:
                # Only dict entries carry usable fields; bare strings and
                # anything else are skipped silently.
                if not isinstance(entry, dict):
                    continue

                name = entry.get('name')
                if not name:
                    continue  # a record without a name is unusable

                # The registry uses both 'isil_code' and 'isil' field names.
                isil = entry.get('isil_code') or entry.get('isil')

                results.append({
                    'name': name.strip(),
                    'isil_code': isil.strip() if isil else None,
                    'data_source': 'ISIL_REGISTRY',
                    'source_file': source_name,
                })
        except Exception as exc:
            print(f"⚠️ Error reading {filepath}: {exc}")

    print(f"✅ Parsed {len(results)} institutions from ISIL pages")
    return results
|
|
|
|
|
|
def parse_wikidata() -> List[Dict]:
    """Parse Wikidata SPARQL results.

    Reads austria_wikidata_institutions.json (standard SPARQL JSON results
    format: results.bindings[*].<var>.value) and flattens each binding into
    a plain institution dict.

    Returns:
        List of dicts tagged with data_source='WIKIDATA'. Empty on read error.
    """
    institutions = []
    wikidata_file = DATA_DIR / "austria_wikidata_institutions.json"

    print("📄 Parsing Wikidata SPARQL results...")

    try:
        with open(wikidata_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        bindings = data.get('results', {}).get('bindings', [])

        for binding in bindings:
            # Entity URI looks like http://www.wikidata.org/entity/Q42;
            # the Q-number is the last path segment.
            item = binding.get('item', {}).get('value', '')
            q_number = item.split('/')[-1] if item else None

            name = binding.get('itemLabel', {}).get('value', '')
            # Skip if the label is just the Q-number (no proper label)
            if name.startswith('Q') and name[1:].isdigit():
                continue

            description = binding.get('itemDescription', {}).get('value', '')
            inst_type = binding.get('typeLabel', {}).get('value', '')
            website = binding.get('website', {}).get('value', '')
            viaf = binding.get('viaf', {}).get('value', '')
            isil = binding.get('isil', {}).get('value', '')
            coords = binding.get('coord', {}).get('value', '')
            city = binding.get('cityLabel', {}).get('value', '')

            # Coordinates arrive as WKT "Point(lon lat)"; parse if present.
            lat, lon = None, None
            if coords and coords.startswith('Point('):
                try:
                    coords_clean = coords.replace('Point(', '').replace(')', '')
                    lon, lat = map(float, coords_clean.split())
                except ValueError:
                    # BUG FIX: was a bare `except:` which swallowed every
                    # exception (even KeyboardInterrupt). Malformed WKT only
                    # raises ValueError (bad float or wrong token count);
                    # leave coordinates unset in that case.
                    pass

            institutions.append({
                'name': name.strip(),
                'wikidata_id': q_number,
                'description': description,
                'institution_type': inst_type,
                'website': website,
                'viaf': viaf,
                'isil_code': isil,
                'city': city,
                'latitude': lat,
                'longitude': lon,
                'data_source': 'WIKIDATA',
                'source_file': 'austria_wikidata_institutions.json'
            })

        print(f"✅ Parsed {len(institutions)} institutions from Wikidata")
    except Exception as e:
        print(f"⚠️ Error reading Wikidata file: {e}")

    return institutions
|
|
|
|
|
|
def parse_osm() -> List[Dict]:
    """Parse OpenStreetMap library data."""
    libraries: List[Dict] = []
    osm_file = DATA_DIR / "austria_osm_libraries.json"

    print(f"📄 Parsing OpenStreetMap data...")

    try:
        with open(osm_file, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)

        for element in payload.get('elements', []):
            tags = element.get('tags', {})
            # Fall back to the operator tag when no name is present.
            label = tags.get('name', tags.get('operator', ''))
            if not label:
                continue

            record = {
                'name': label.strip(),
                'institution_type': 'library',
                'latitude': element.get('lat'),
                'longitude': element.get('lon'),
                'street': tags.get('addr:street'),
                'housenumber': tags.get('addr:housenumber'),
                'postcode': tags.get('addr:postcode'),
                'city': tags.get('addr:city'),
                # OSM data uses both bare and contact:-prefixed keys.
                'website': tags.get('website') or tags.get('contact:website'),
                'phone': tags.get('phone') or tags.get('contact:phone'),
                'email': tags.get('email') or tags.get('contact:email'),
                'osm_id': element.get('id'),
                'osm_type': element.get('type'),
                'data_source': 'OPENSTREETMAP',
                'source_file': 'austria_osm_libraries.json'
            }
            libraries.append(record)

        print(f"✅ Parsed {len(libraries)} libraries from OSM")
    except Exception as exc:
        print(f"⚠️ Error reading OSM file: {exc}")

    return libraries
|
|
|
|
|
|
def fuzzy_match_name(name1: str, name2: str) -> int:
    """Score similarity of two institution names (0-100, case-insensitive)."""
    if name1 and name2:
        return fuzz.ratio(name1.lower(), name2.lower())
    # Either name missing/empty: no basis for a match.
    return 0
|
|
|
|
|
|
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """
    Deduplicate institutions using ISIL codes and fuzzy name matching.

    Priority:
    1. ISIL_REGISTRY (most authoritative for ISIL codes)
    2. WIKIDATA (rich metadata)
    3. OPENSTREETMAP (geocoding)

    Records sharing an ISIL code are merged outright. Records without an
    ISIL are folded into the best name match at or above FUZZY_THRESHOLD,
    first against the merged set, then against each other. Merging never
    overwrites a non-empty field; every surviving record carries a
    'data_sources' list of contributing sources.
    """
    print(f"\n🔍 Deduplicating {len(institutions)} institutions...")

    # Index by ISIL code
    by_isil: Dict[str, List[Dict]] = defaultdict(list)
    no_isil: List[Dict] = []

    for inst in institutions:
        isil = inst.get('isil_code')
        # Normalize: treat None and whitespace-only codes as "no ISIL".
        if isil:
            isil = isil.strip()

        if isil:
            by_isil[isil].append(inst)
        else:
            no_isil.append(inst)

    print(f" - {len(by_isil)} unique ISIL codes")
    print(f" - {len(no_isil)} institutions without ISIL")

    # Merge institutions with same ISIL code
    merged = []
    for isil, group in by_isil.items():
        if len(group) == 1:
            merged.append(group[0])
        else:
            # Merge metadata from all sources (first non-empty value wins)
            base = {'isil_code': isil, 'data_sources': []}

            for inst in group:
                base['data_sources'].append(inst.get('data_source'))
                for key, value in inst.items():
                    if key == 'data_source':
                        continue
                    if key not in base or not base[key]:
                        base[key] = value

            merged.append(base)

    # Fuzzy match institutions without ISIL
    print(f" - Fuzzy matching {len(no_isil)} institutions...")
    matched_indices: Set[int] = set()

    for i, inst1 in enumerate(no_isil):
        if i in matched_indices:
            continue

        # Find the best-scoring merged institution above the threshold
        best_match = None
        best_score = 0

        for candidate in merged:
            score = fuzzy_match_name(inst1.get('name', ''), candidate.get('name', ''))
            if score > best_score and score >= FUZZY_THRESHOLD:
                best_score = score
                best_match = candidate

        if best_match:
            # Fold inst1 into the existing institution
            if 'data_sources' not in best_match:
                best_match['data_sources'] = [best_match.get('data_source')]
            best_match['data_sources'].append(inst1.get('data_source'))

            for key, value in inst1.items():
                if key == 'data_source':
                    continue
                if key not in best_match or not best_match[key]:
                    best_match[key] = value

            matched_indices.add(i)
        else:
            # Try to match with the remaining no-ISIL institutions
            for j in range(i + 1, len(no_isil)):
                if j in matched_indices:
                    continue

                other = no_isil[j]
                score = fuzzy_match_name(inst1.get('name', ''), other.get('name', ''))
                if score >= FUZZY_THRESHOLD:
                    # BUG FIX: append to an existing data_sources list
                    # instead of rebuilding it, so a record that absorbs
                    # several others keeps every source tag.
                    inst1.setdefault('data_sources', [inst1.get('data_source')])
                    inst1['data_sources'].append(other.get('data_source'))
                    for key, value in other.items():
                        if key == 'data_source':
                            continue
                        if key not in inst1 or not inst1[key]:
                            inst1[key] = value
                    matched_indices.add(j)

    # Add unmatched no_isil institutions
    for i, inst in enumerate(no_isil):
        if i not in matched_indices:
            # BUG FIX: setdefault preserves a data_sources list already
            # built above (when this record absorbed other no-ISIL
            # records); the previous unconditional assignment clobbered it.
            inst.setdefault('data_sources', [inst.get('data_source')])
            merged.append(inst)

    print(f"✅ Deduplicated to {len(merged)} unique institutions")
    return merged
|
|
|
|
|
|
def generate_statistics(institutions: List[Dict]) -> Dict:
    """Compute summary statistics over the consolidated institution list."""
    by_source: Dict = {}
    by_type: Dict = {}
    city_counts: Dict = {}
    with_isil = with_wikidata = with_geocoding = with_website = 0
    multi_source = 0

    for inst in institutions:
        # A merged record carries a 'data_sources' list; a single-source
        # record only has 'data_source'.
        sources = inst.get('data_sources', [inst.get('data_source')])
        for source in sources:
            by_source[source] = by_source.get(source, 0) + 1
        if len(sources) > 1:
            multi_source += 1

        # Feature-coverage counters
        if inst.get('isil_code'):
            with_isil += 1
        if inst.get('wikidata_id'):
            with_wikidata += 1
        if inst.get('latitude') and inst.get('longitude'):
            with_geocoding += 1
        if inst.get('website'):
            with_website += 1

        # Count by institution type (missing key counts as 'unknown')
        kind = inst.get('institution_type', 'unknown')
        by_type[kind] = by_type.get(kind, 0) + 1

        # Count by city; empty-string cities are not counted
        city = inst.get('city', 'unknown')
        if city:
            city_counts[city] = city_counts.get(city, 0) + 1

    # Keep only the 20 most frequent cities, largest first.
    top_cities = dict(
        sorted(city_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]
    )

    return {
        'total_institutions': len(institutions),
        'by_source': by_source,
        'by_type': by_type,
        'with_isil': with_isil,
        'with_wikidata': with_wikidata,
        'with_geocoding': with_geocoding,
        'with_website': with_website,
        'multi_source': multi_source,
        'cities': top_cities,
        'generation_date': datetime.now(timezone.utc).isoformat(),
    }
|
|
|
|
|
|
def main():
    """Main consolidation workflow: parse, deduplicate, export, summarize."""
    print("🇦🇹 Austrian Heritage Institution Data Consolidation")
    print("=" * 60)

    # Parse all data sources
    isil_institutions = parse_isil_pages()
    wikidata_institutions = parse_wikidata()
    osm_institutions = parse_osm()

    # Combine all sources
    all_institutions = isil_institutions + wikidata_institutions + osm_institutions
    print(f"\n📊 Total raw institutions: {len(all_institutions)}")

    # Deduplicate
    consolidated = deduplicate_institutions(all_institutions)

    # Generate statistics
    print("\n📈 Generating statistics...")
    stats = generate_statistics(consolidated)

    # Export consolidated data
    print("\n💾 Exporting consolidated data...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(consolidated, f, indent=2, ensure_ascii=False)
    print(f"✅ Saved to: {OUTPUT_FILE}")
    print(f" Size: {OUTPUT_FILE.stat().st_size / 1024:.1f} KB")

    # Export statistics
    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(stats, f, indent=2, ensure_ascii=False)
    print(f"✅ Statistics saved to: {STATS_FILE}")

    # Print summary
    print("\n" + "=" * 60)
    print("📊 CONSOLIDATION SUMMARY")
    print("=" * 60)
    print(f"Total unique institutions: {stats['total_institutions']}")
    print("")
    print("By source:")
    for source, count in stats['by_source'].items():
        print(f" - {source}: {count}")
    print("")
    print(f"Multi-source records: {stats['multi_source']}")
    print("")
    print("Coverage:")
    # BUG FIX: guard against ZeroDivisionError when no institutions were
    # found (e.g. all data files missing) — percentages then read 0.0%.
    total = stats['total_institutions'] or 1
    print(f" - With ISIL codes: {stats['with_isil']} ({stats['with_isil']/total*100:.1f}%)")
    print(f" - With Wikidata IDs: {stats['with_wikidata']} ({stats['with_wikidata']/total*100:.1f}%)")
    print(f" - With geocoding: {stats['with_geocoding']} ({stats['with_geocoding']/total*100:.1f}%)")
    print(f" - With websites: {stats['with_website']} ({stats['with_website']/total*100:.1f}%)")
    print("")
    print("Top 5 cities:")
    for i, (city, count) in enumerate(list(stats['cities'].items())[:5], 1):
        print(f" {i}. {city}: {count}")
    print("=" * 60)
|
|
|
|
|
|
# Entry point: run the full consolidation workflow when executed directly.
if __name__ == "__main__":
    main()
|