#!/usr/bin/env python3
"""
Merge Austrian ISIL page files into a single dataset.

Handles two JSON formats:
1. Array format: [{"name": "...", "isil_code": "..."}]
2. Metadata format: {"page": N, "institutions": [{"name": "...", "isil": "..."}]}

Normalizes records to a consistent format and deduplicates them.
"""

import json
from pathlib import Path
from typing import List, Dict, Set
from datetime import datetime, timezone


def load_page_data(page_file: Path) -> List[Dict[str, str]]:
    """Load institutions from a page file, handling both formats."""
    with open(page_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = []

    # Accept both page formats: a bare array of records, or a metadata
    # object with the records under the "institutions" key
    if isinstance(data, list):
        records = data
    elif isinstance(data, dict):
        records = data.get('institutions', [])
    else:
        records = []

    for inst in records:
        institutions.append({
            'name': inst.get('name', ''),
            # Page files use either "isil_code" or "isil" as the key
            'isil_code': inst.get('isil_code') or inst.get('isil', '')
        })

    return institutions
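

# A quick sanity example for load_page_data (the records here are made up for
# illustration; they are not real registry entries):
#
#   [{"name": "Musterbibliothek", "isil_code": "AT-XXX"}]                          -> array format
#   {"page": 1, "institutions": [{"name": "Musterbibliothek", "isil": "AT-XXX"}]}  -> metadata format
#
# Both normalize to [{'name': 'Musterbibliothek', 'isil_code': 'AT-XXX'}].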


def merge_austrian_isil_pages(data_dir: str, output_file: str, start_page: int = 1, end_page: int = 194):
    """
    Merge Austrian ISIL page files into a single dataset.

    Args:
        data_dir: Directory containing page_NNN_data.json files
        output_file: Output path for merged JSON
        start_page: First page to process
        end_page: Last page to process
    """
    data_path = Path(data_dir)
    institutions_with_isil = []
    institutions_without_isil = []
    seen_isil_codes: Set[str] = set()
    seen_names: Set[str] = set()
    duplicates = []
    stats = {
        'pages_processed': 0,
        'institutions_extracted': 0,
        'institutions_with_isil': 0,
        'institutions_without_isil': 0,
        'duplicates_found': 0,
        'missing_pages': [],
    }

    print(f"Merging Austrian ISIL pages {start_page}-{end_page}")
    print(f"Input directory: {data_path}")
    print(f"Output file: {output_file}")
    print()

    for page_num in range(start_page, end_page + 1):
        page_file = data_path / f"page_{page_num:03d}_data.json"

        if not page_file.exists():
            print(f"⚠️ Missing: page {page_num}")
            stats['missing_pages'].append(page_num)
            continue

        try:
            institutions = load_page_data(page_file)
            page_count = len(institutions)

            for inst in institutions:
                isil_code = inst['isil_code']
                name = inst['name']

                # Institutions WITH ISIL codes: first occurrence wins,
                # later repeats are recorded in `duplicates`
                if isil_code:
                    if isil_code in seen_isil_codes:
                        print(f"⚠️ Duplicate ISIL code: {isil_code} (page {page_num})")
                        duplicates.append({
                            'isil_code': isil_code,
                            'name': name,
                            'page': page_num
                        })
                        stats['duplicates_found'] += 1
                    else:
                        seen_isil_codes.add(isil_code)
                        institutions_with_isil.append(inst)
                        stats['institutions_with_isil'] += 1

                # Institutions WITHOUT ISIL codes (departments, branches):
                # deduplicate by exact name
                else:
                    if name not in seen_names:
                        seen_names.add(name)
                        institutions_without_isil.append(inst)
                        stats['institutions_without_isil'] += 1

            stats['pages_processed'] += 1
            stats['institutions_extracted'] += page_count
            print(f"✅ Page {page_num:3d}: {page_count:3d} institutions")

        except Exception as e:
            print(f"❌ Error processing page {page_num}: {e}")

    # Sort institutions
    institutions_with_isil.sort(key=lambda x: x['isil_code'])
    institutions_without_isil.sort(key=lambda x: x['name'])

    # Combine all institutions
    all_institutions = institutions_with_isil + institutions_without_isil

    # Create output with metadata
    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Registry (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'pages_scraped': f"{start_page}-{end_page}",
            'total_institutions': len(all_institutions),
            'institutions_with_isil': len(institutions_with_isil),
            'institutions_without_isil': len(institutions_without_isil),
            'duplicates_removed': stats['duplicates_found'],
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'format_version': '2.0',
            'notes': 'Institutions without ISIL codes are typically departments or branches of main institutions'
        },
        'statistics': stats,
        'duplicates': duplicates,
        'institutions_with_isil': institutions_with_isil,
        'institutions_without_isil': institutions_without_isil,
        'all_institutions': all_institutions
    }

    # Write merged output
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
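
    # ensure_ascii=False below keeps umlauts in Austrian institution names
    # human-readable; indent=2 keeps the merged file diff-friendly.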
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print()
    print("=" * 60)
    print("MERGE COMPLETE")
    print("=" * 60)
    print(f"Pages processed: {stats['pages_processed']}/{end_page - start_page + 1}")
    print(f"Institutions extracted: {stats['institutions_extracted']}")
    print(f" - With ISIL codes: {stats['institutions_with_isil']}")
    print(f" - Without ISIL codes (departments/branches): {stats['institutions_without_isil']}")
    print(f"Duplicates removed: {stats['duplicates_found']}")
    print(f"Total unique institutions: {len(all_institutions)}")
    print(f"Missing pages: {len(stats['missing_pages'])}")
    if stats['missing_pages']:
        print(f" {stats['missing_pages']}")
    print()
    print(f"✅ Output saved to: {output_path}")

    return output_data
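

# A minimal programmatic usage sketch (the paths here are illustrative,
# matching the CLI defaults below, not a required layout):
#
#   merged = merge_austrian_isil_pages('data/isil/austria', 'merged.json')
#   print(merged['metadata']['total_institutions'])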


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Merge Austrian ISIL page files')
    parser.add_argument('--data-dir', default='data/isil/austria',
                        help='Directory containing page JSON files')
    parser.add_argument('--output', default='data/isil/austria/austrian_isil_merged.json',
                        help='Output file path')
    parser.add_argument('--start', type=int, default=1,
                        help='First page to process')
    parser.add_argument('--end', type=int, default=194,
                        help='Last page to process')

    args = parser.parse_args()

    merge_austrian_isil_pages(
        data_dir=args.data_dir,
        output_file=args.output,
        start_page=args.start,
        end_page=args.end
    )