glam/scripts/merge_austrian_isil_pages.py
#!/usr/bin/env python3
"""
Merge Austrian ISIL page files into a single dataset.
Handles two JSON formats:
1. Array format: [{"name": "...", "isil_code": "..."}]
2. Metadata format: {"page": N, "institutions": [{"name": "...", "isil": "..."}]}
Normalizes to consistent format with deduplication.
"""
import json
from pathlib import Path
from typing import List, Dict, Set
from datetime import datetime, timezone
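

# A minimal illustration of the normalization performed below
# (names and codes are hypothetical, not real registry entries):
#   array format:    [{"name": "Example Library", "isil_code": "AT-EX-1"}]
#   metadata format: {"page": 3, "institutions": [{"name": "Example Branch", "isil": ""}]}
#   both yield:      [{"name": "...", "isil_code": "..."}], with 'isil' mapped to 'isil_code'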
def load_page_data(page_file: Path) -> List[Dict[str, str]]:
    """Load institutions from a page file, handling both formats."""
    with open(page_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        # Array format: the file is a bare list of records
        records = data
    elif isinstance(data, dict):
        # Metadata format: records sit under the 'institutions' key
        records = data.get('institutions', [])
    else:
        records = []

    # Normalize to a consistent shape, accepting either 'isil_code' or 'isil'
    institutions = []
    for inst in records:
        institutions.append({
            'name': inst.get('name', ''),
            'isil_code': inst.get('isil_code') or inst.get('isil', '')
        })
    return institutions


def merge_austrian_isil_pages(data_dir: str, output_file: str,
                              start_page: int = 1, end_page: int = 194):
    """
    Merge Austrian ISIL page files into a single dataset.

    Args:
        data_dir: Directory containing page_NNN_data.json files
        output_file: Output path for merged JSON
        start_page: First page to process
        end_page: Last page to process
    """
    data_path = Path(data_dir)
    institutions_with_isil = []
    institutions_without_isil = []
    seen_isil_codes: Set[str] = set()
    seen_names: Set[str] = set()
    duplicates = []
    stats = {
        'pages_processed': 0,
        'institutions_extracted': 0,
        'institutions_with_isil': 0,
        'institutions_without_isil': 0,
        'duplicates_found': 0,
        'missing_pages': [],
    }

    print(f"Merging Austrian ISIL pages {start_page}-{end_page}")
    print(f"Input directory: {data_path}")
    print(f"Output file: {output_file}")
    print()

    for page_num in range(start_page, end_page + 1):
        page_file = data_path / f"page_{page_num:03d}_data.json"
        if not page_file.exists():
            print(f"⚠️ Missing: page {page_num}")
            stats['missing_pages'].append(page_num)
            continue
        try:
            institutions = load_page_data(page_file)
            page_count = len(institutions)
            for inst in institutions:
                isil_code = inst['isil_code']
                name = inst['name']
                if isil_code:
                    # Institutions WITH ISIL codes: deduplicate by code
                    if isil_code in seen_isil_codes:
                        print(f"⚠️ Duplicate ISIL code: {isil_code} (page {page_num})")
                        duplicates.append({
                            'isil_code': isil_code,
                            'name': name,
                            'page': page_num
                        })
                        stats['duplicates_found'] += 1
                    else:
                        seen_isil_codes.add(isil_code)
                        institutions_with_isil.append(inst)
                        stats['institutions_with_isil'] += 1
                else:
                    # Institutions WITHOUT ISIL codes (departments, branches):
                    # deduplicate by name to avoid exact duplicates
                    if name not in seen_names:
                        seen_names.add(name)
                        institutions_without_isil.append(inst)
                        stats['institutions_without_isil'] += 1
            stats['pages_processed'] += 1
            stats['institutions_extracted'] += page_count
print(f"✅ Page {page_num:2d}: {page_count:2d} institutions")
        except Exception as e:
            print(f"❌ Error processing page {page_num}: {e}")

    # Sort institutions: coded entries by ISIL, the rest by name
    institutions_with_isil.sort(key=lambda x: x['isil_code'])
    institutions_without_isil.sort(key=lambda x: x['name'])

    # Combine all institutions
    all_institutions = institutions_with_isil + institutions_without_isil

    # Create output with metadata
    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Registry (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'pages_scraped': f"{start_page}-{end_page}",
            'total_institutions': len(all_institutions),
            'institutions_with_isil': len(institutions_with_isil),
            'institutions_without_isil': len(institutions_without_isil),
            'duplicates_removed': stats['duplicates_found'],
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'format_version': '2.0',
            'notes': 'Institutions without ISIL codes are typically departments or branches of main institutions'
        },
        'statistics': stats,
        'duplicates': duplicates,
        'institutions_with_isil': institutions_with_isil,
        'institutions_without_isil': institutions_without_isil,
        'all_institutions': all_institutions
    }

    # Write merged output
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print()
    print("=" * 60)
    print("MERGE COMPLETE")
    print("=" * 60)
    print(f"Pages processed: {stats['pages_processed']}/{end_page - start_page + 1}")
    print(f"Institutions extracted: {stats['institutions_extracted']}")
    print(f"  - With ISIL codes: {stats['institutions_with_isil']}")
    print(f"  - Without ISIL codes (departments/branches): {stats['institutions_without_isil']}")
    print(f"Duplicates removed: {stats['duplicates_found']}")
    print(f"Total unique institutions: {len(all_institutions)}")
    print(f"Missing pages: {len(stats['missing_pages'])}")
    if stats['missing_pages']:
        print(f"  {stats['missing_pages']}")
    print()
    print(f"✅ Output saved to: {output_path}")
    return output_data
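

# Programmatic use: the function writes the merged file and also returns the
# payload. A minimal sketch, reusing the CLI defaults below as paths:
#   merged = merge_austrian_isil_pages('data/isil/austria',
#                                      'data/isil/austria/austrian_isil_merged.json')
#   print(merged['metadata']['total_institutions'])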


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Merge Austrian ISIL page files')
    parser.add_argument('--data-dir', default='data/isil/austria',
                        help='Directory containing page JSON files')
    parser.add_argument('--output', default='data/isil/austria/austrian_isil_merged.json',
                        help='Output file path')
    parser.add_argument('--start', type=int, default=1,
                        help='First page to process')
    parser.add_argument('--end', type=int, default=194,
                        help='Last page to process')
    args = parser.parse_args()

    merge_austrian_isil_pages(
        data_dir=args.data_dir,
        output_file=args.output,
        start_page=args.start,
        end_page=args.end,
    )
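
# Example invocation (a sketch; the flags mirror the defaults above):
#   python3 glam/scripts/merge_austrian_isil_pages.py \
#       --data-dir data/isil/austria \
#       --output data/isil/austria/austrian_isil_merged.json \
#       --start 1 --end 194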