#!/usr/bin/env python3
"""
Merge Austrian ISIL page files into a single dataset.

Handles two JSON formats:
1. Array format: [{"name": "...", "isil_code": "..."}]
2. Metadata format: {"page": N, "institutions": [{"name": "...", "isil": "..."}]}

Normalizes both to a consistent format, with deduplication.
"""

import json
from pathlib import Path
from typing import List, Dict, Set
from datetime import datetime, timezone


def load_page_data(page_file: Path) -> List[Dict[str, str]]:
    """Load institutions from a page file, handling both formats."""
    with open(page_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        # Array format: the file is a bare list of institution records.
        records = data
    elif isinstance(data, dict):
        # Metadata format: records live under the "institutions" key.
        records = data.get('institutions', [])
    else:
        records = []

    # Normalize both formats to {'name', 'isil_code'}; the metadata format
    # stores the code under "isil" rather than "isil_code".
    return [
        {
            'name': inst.get('name', ''),
            'isil_code': inst.get('isil_code') or inst.get('isil', ''),
        }
        for inst in records
    ]
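
# A small smoke test for load_page_data(). Not part of the original pipeline:
# the file names, institution names, and ISIL codes below are invented sample
# data, used only to illustrate the two accepted input shapes. Run it manually
# against any scratch directory, e.g.
#   _demo_load_page_data(Path(tempfile.mkdtemp()))
def _demo_load_page_data(tmp_dir: Path) -> None:
    """Write one file in each supported shape and check both normalize alike."""
    # Array format: a bare JSON list using the "isil_code" key.
    array_file = tmp_dir / "page_001_data.json"
    array_file.write_text(
        json.dumps([{"name": "Beispielbibliothek Wien", "isil_code": "AT-XXXX"}]),
        encoding='utf-8')

    # Metadata format: a wrapper object using the "isil" key.
    meta_file = tmp_dir / "page_002_data.json"
    meta_file.write_text(
        json.dumps({"page": 2, "institutions": [
            {"name": "Beispielarchiv Graz", "isil": "AT-YYYY"}]}),
        encoding='utf-8')

    # Both shapes come back as {'name', 'isil_code'} records.
    assert load_page_data(array_file) == [
        {'name': 'Beispielbibliothek Wien', 'isil_code': 'AT-XXXX'}]
    assert load_page_data(meta_file) == [
        {'name': 'Beispielarchiv Graz', 'isil_code': 'AT-YYYY'}]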


def merge_austrian_isil_pages(data_dir: str, output_file: str,
                              start_page: int = 1, end_page: int = 194):
    """
    Merge Austrian ISIL page files into a single dataset.

    Args:
        data_dir: Directory containing page_NNN_data.json files
        output_file: Output path for merged JSON
        start_page: First page to process
        end_page: Last page to process
    """
    data_path = Path(data_dir)

    institutions_with_isil = []
    institutions_without_isil = []
    seen_isil_codes: Set[str] = set()
    seen_names: Set[str] = set()
    duplicates = []

    stats = {
        'pages_processed': 0,
        'institutions_extracted': 0,
        'institutions_with_isil': 0,
        'institutions_without_isil': 0,
        'duplicates_found': 0,
        'missing_pages': [],
    }

    print(f"Merging Austrian ISIL pages {start_page}-{end_page}")
    print(f"Input directory: {data_path}")
    print(f"Output file: {output_file}")
    print()

    for page_num in range(start_page, end_page + 1):
        page_file = data_path / f"page_{page_num:03d}_data.json"

        if not page_file.exists():
            print(f"⚠️ Missing: page {page_num}")
            stats['missing_pages'].append(page_num)
            continue

        try:
            institutions = load_page_data(page_file)
            page_count = len(institutions)

            for inst in institutions:
                isil_code = inst['isil_code']
                name = inst['name']

                # Handle institutions WITH ISIL codes
                if isil_code:
                    if isil_code in seen_isil_codes:
                        print(f"⚠️ Duplicate ISIL code: {isil_code} (page {page_num})")
                        duplicates.append({
                            'isil_code': isil_code,
                            'name': name,
                            'page': page_num
                        })
                        stats['duplicates_found'] += 1
                    else:
                        seen_isil_codes.add(isil_code)
                        institutions_with_isil.append(inst)
                        stats['institutions_with_isil'] += 1
                # Handle institutions WITHOUT ISIL codes (departments, branches)
                else:
                    # Deduplicate by name to avoid exact duplicates
                    if name not in seen_names:
                        seen_names.add(name)
                        institutions_without_isil.append(inst)
                        stats['institutions_without_isil'] += 1

            stats['pages_processed'] += 1
            stats['institutions_extracted'] += page_count
            print(f"✅ Page {page_num:3d}: {page_count:3d} institutions")

        except Exception as e:
            print(f"❌ Error processing page {page_num}: {e}")

    # Sort institutions
    institutions_with_isil.sort(key=lambda x: x['isil_code'])
    institutions_without_isil.sort(key=lambda x: x['name'])

    # Combine all institutions
    all_institutions = institutions_with_isil + institutions_without_isil

    # Create output with metadata
    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Registry (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'pages_scraped': f"{start_page}-{end_page}",
            'total_institutions': len(all_institutions),
            'institutions_with_isil': len(institutions_with_isil),
            'institutions_without_isil': len(institutions_without_isil),
            'duplicates_removed': stats['duplicates_found'],
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'format_version': '2.0',
            'notes': ('Institutions without ISIL codes are typically '
                      'departments or branches of main institutions')
        },
        'statistics': stats,
        'duplicates': duplicates,
        'institutions_with_isil': institutions_with_isil,
        'institutions_without_isil': institutions_without_isil,
        'all_institutions': all_institutions
    }

    # Write merged output
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print()
    print("=" * 60)
    print("MERGE COMPLETE")
    print("=" * 60)
    print(f"Pages processed: {stats['pages_processed']}/{end_page - start_page + 1}")
    print(f"Institutions extracted: {stats['institutions_extracted']}")
    print(f"  - With ISIL codes: {stats['institutions_with_isil']}")
    print(f"  - Without ISIL codes (departments/branches): {stats['institutions_without_isil']}")
    print(f"Duplicates removed: {stats['duplicates_found']}")
    print(f"Total unique institutions: {len(all_institutions)}")
    print(f"Missing pages: {len(stats['missing_pages'])}")
    if stats['missing_pages']:
        print(f"  {stats['missing_pages']}")
    print()
    print(f"✅ Output saved to: {output_path}")

    return output_data


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Merge Austrian ISIL page files')
    parser.add_argument('--data-dir', default='data/isil/austria',
                        help='Directory containing page JSON files')
    parser.add_argument('--output', default='data/isil/austria/austrian_isil_merged.json',
                        help='Output file path')
    parser.add_argument('--start', type=int, default=1,
                        help='First page to process')
    parser.add_argument('--end', type=int, default=194,
                        help='Last page to process')

    args = parser.parse_args()

    merge_austrian_isil_pages(
        data_dir=args.data_dir,
        output_file=args.output,
        start_page=args.start,
        end_page=args.end
    )
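
# Example invocation (illustrative: the script filename is whatever this file
# is saved as, and the paths assume the defaults above):
#
#   python merge_austrian_isil_pages.py \
#       --data-dir data/isil/austria \
#       --output data/isil/austria/austrian_isil_merged.json \
#       --start 1 --end 194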