glam/scripts/parse_austrian_isil.py
2025-11-19 23:25:22 +01:00

236 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
Parse Austrian ISIL data into LinkML-compliant YAML format.
Converts the merged Austrian ISIL JSON into HeritageCustodian records
following the modular LinkML schema v0.2.1.
"""
import json
import yaml
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional
def infer_institution_type(name: str) -> str:
    """
    Infer the institution type from a German institution name.

    The lowercased name is tested against keyword substrings in a fixed
    priority order; the first rule with a matching keyword decides the type.
    The returned labels are the type names this script uses for the
    GLAMORCUBESFIXPHDNT taxonomy.

    Args:
        name: Institution name as it appears in the ISIL data.

    Returns:
        The type label of the first matching rule, or 'UNKNOWN' when none of
        the keywords occurs in the name.
    """
    # Rule order is significant: a "Universitätsbibliothek" is classified as
    # LIBRARY because the library rule is checked before the university rule.
    # NOTE: matching is plain substring containment, so short keywords also
    # hit inside longer words (e.g. 'amt' is contained in 'gesamt').
    rules = (
        ('ARCHIVE', ('archiv',)),
        ('LIBRARY', ('bibliothek', 'bücherei')),
        ('MUSEUM', ('museum',)),
        ('EDUCATION_PROVIDER', ('universität',)),
        # Universities of applied sciences
        ('EDUCATION_PROVIDER', ('fachhochschule',)),
        ('RESEARCH_CENTER', ('forschung',)),
        # Religious institutions
        ('HOLY_SITES', ('stift', 'kloster', 'kirch')),
        # Government institutions
        ('OFFICIAL_INSTITUTION', ('amt', 'landes')),
    )

    name_lower = name.lower()
    for inst_type, keywords in rules:
        if any(keyword in name_lower for keyword in keywords):
            return inst_type

    # No keyword matched.
    return 'UNKNOWN'
def extract_location_from_isil(isil: str) -> Dict[str, str]:
    """
    Derive location information from the structure of an ISIL code.

    Austrian ISIL codes often embed a location abbreviation (e.g. AT-WSTLA
    for Vienna). The part after the leading 'AT-' is compared against a table
    of known abbreviations (case-sensitive). The most specific match wins:
    first the longest abbreviation the suffix starts with, otherwise the
    longest abbreviation contained anywhere in the suffix.

    Args:
        isil: ISIL code, with or without the leading 'AT-' prefix.

    Returns:
        A dict that always contains 'country': 'AT' and additionally 'city'
        when an abbreviation matched.
    """
    # Common Austrian city codes in ISIL
    city_codes = {
        'W': 'Wien',
        'WIEN': 'Wien',
        'WSTLA': 'Wien',
        'SBG': 'Salzburg',
        'STAR': 'Graz',
        'STARG': 'Graz',
        'LENT': 'Linz',
        'IBK': 'Innsbruck',
        'BLA': 'Eisenstadt',
        'KLA': 'Klagenfurt',
        'VLA': 'Bregenz',
        'NOe': 'St. Pölten',
        'OOe': 'Linz',
        'SLA': 'Salzburg',
    }

    location = {'country': 'AT'}

    # Strip only the leading country prefix; a blanket replace() would also
    # remove an 'AT-' that happens to occur inside the code.
    country_prefix = 'AT-'
    if isil.startswith(country_prefix):
        isil_suffix = isil[len(country_prefix):]
    else:
        isil_suffix = isil

    # Longest codes first, so that the one-letter 'W' cannot shadow a more
    # specific abbreviation (e.g. 'SBG-W1' must resolve to Salzburg).
    codes_by_length = sorted(city_codes, key=len, reverse=True)

    matched = next(
        (code for code in codes_by_length if isil_suffix.startswith(code)),
        None,
    )
    if matched is None:
        # Fall back to containment so codes such as 'UBW' still resolve.
        matched = next(
            (code for code in codes_by_length if code in isil_suffix),
            None,
        )

    if matched is not None:
        location['city'] = city_codes[matched]
    return location
def create_ghcid(country: str, region: str, city: str, inst_type: str, abbreviation: str) -> str:
    """
    Generate a simplified GHCID string for an institution.

    The identifier has the form COUNTRY-REGION-CITY-TYPE-ABBREV, where TYPE is
    a single letter derived from the institution type. All other components
    are inserted exactly as passed in.

    Args:
        country: Country component (e.g. 'AT').
        region: Region component.
        city: City component.
        inst_type: Institution type label (e.g. 'ARCHIVE'); unrecognised
            labels are encoded as 'U'.
        abbreviation: Institution abbreviation, e.g. the ISIL code suffix.

    Returns:
        The five components joined with '-'.
    """
    # Map institution type to its single-letter code.
    type_codes = {
        'GALLERY': 'G',
        'LIBRARY': 'L',
        'ARCHIVE': 'A',
        'MUSEUM': 'M',
        'OFFICIAL_INSTITUTION': 'O',
        'RESEARCH_CENTER': 'R',
        'CORPORATION': 'C',
        'UNKNOWN': 'U',
        'BOTANICAL_ZOO': 'B',
        'EDUCATION_PROVIDER': 'E',
        'COLLECTING_SOCIETY': 'S',
        'FEATURES': 'F',
        'INTANGIBLE_HERITAGE_GROUP': 'I',
        'MIXED': 'X',
        'PERSONAL_COLLECTION': 'P',
        'HOLY_SITES': 'H',
        'DIGITAL_PLATFORM': 'D',
        'NGO': 'N',
        'TASTE_SMELL': 'T',
    }
    # Unrecognised types share the code of 'UNKNOWN'.
    type_code = type_codes.get(inst_type, 'U')

    return f"{country}-{region}-{city}-{type_code}-{abbreviation}"
def parse_institution(inst_data: dict) -> dict:
    """
    Parse a single institution entry into a LinkML-style record dict.

    Args:
        inst_data: Raw institution entry. Must contain 'name' and an ISIL
            code under either 'isil_code' or 'isil'.

    Returns:
        A dict with the keys 'id', 'name', 'institution_type', 'identifiers',
        'locations' and 'provenance'.

    Raises:
        KeyError: If 'name' is missing from inst_data.
        ValueError: If no non-empty ISIL code is present.
    """
    name = inst_data['name']
    # Handle both field names used for the ISIL code.
    isil = inst_data.get('isil_code') or inst_data.get('isil', '')
    if not isil:
        raise ValueError(f"No ISIL code found for institution: {name}")

    inst_type = infer_institution_type(name)
    location = extract_location_from_isil(isil)

    # Strip only the leading country prefix for the id slug; a blanket
    # replace('at-', '') would also delete an interior 'at-' and could make
    # distinct ISIL codes collapse onto the same id.
    slug = isil.lower()
    if slug.startswith('at-'):
        slug = slug[len('at-'):]

    return {
        'id': f"https://w3id.org/heritage/custodian/at/{slug}",
        'name': name,
        'institution_type': inst_type,
        'identifiers': [
            {
                'identifier_scheme': 'ISIL',
                'identifier_value': isil,
                'identifier_url': f"https://permalink.obvsg.at/ais/{isil}",
            }
        ],
        'locations': [location] if location else None,
        'provenance': {
            'data_source': 'CSV_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            # Timezone-aware UTC timestamp taken when the record is built.
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Playwright MCP browser automation from Austrian ISIL database (https://www.isil.at)',
            'confidence_score': 0.95,  # High confidence for official registry data
        },
    }
def main(input_file: Optional[Path] = None, output_file: Optional[Path] = None) -> None:
    """
    Convert the merged Austrian ISIL JSON file into a YAML instance file.

    Loads the 'institutions' list from the input JSON, parses every entry
    with parse_institution (entries that fail are reported and skipped),
    prints a per-type summary and writes the result, together with a
    metadata header, as YAML.

    Args:
        input_file: Path of the merged ISIL JSON. Defaults to the project's
            original hard-coded location.
        output_file: Path of the YAML file to write. Defaults to the
            project's original hard-coded location. Missing parent
            directories are created.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    if input_file is None:
        input_file = Path("/Users/kempersc/apps/glam/data/isil/austria/austrian_isil_merged.json")
    if output_file is None:
        output_file = Path("/Users/kempersc/apps/glam/data/instances/austria_isil.yaml")
    input_file = Path(input_file)
    output_file = Path(output_file)

    print("🔄 Parsing Austrian ISIL data to LinkML format...")
    print(f"📥 Input: {input_file}")
    print(f"📤 Output: {output_file}")
    print()

    # Load merged data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions = data['institutions']
    print(f"📊 Total institutions: {len(institutions)}")
    print()

    # Parse each institution; a bad entry is reported and skipped so that one
    # malformed record does not abort the whole conversion.
    parsed_records = []
    for inst in institutions:
        try:
            record = parse_institution(inst)
        except Exception as e:
            print(f"⚠️ Error parsing {inst.get('name', 'unknown')}: {e}")
        else:
            parsed_records.append(record)
    print(f"✅ Successfully parsed: {len(parsed_records)} institutions")
    print()

    # Type distribution, most frequent first.
    type_counts = Counter(record['institution_type'] for record in parsed_records)
    print("📊 Institution Type Distribution:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print()

    # Save to YAML
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Database (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(parsed_records),
            'schema_version': 'v0.2.1',
            'data_tier': 'TIER_1_AUTHORITATIVE',
        },
        'institutions': parsed_records,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✅ LinkML data saved to: {output_file}")
    print()
    print("🎉 Parsing complete!")
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()