#!/usr/bin/env python3
"""
Parse Austrian ISIL data into LinkML-compliant YAML format.

Converts the merged Austrian ISIL JSON into HeritageCustodian records
following the modular LinkML schema v0.2.1.
"""

import json
import re
from collections import Counter
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional

# Default locations used when main() is called without arguments.
DEFAULT_INPUT_FILE = Path("/Users/kempersc/apps/glam/data/isil/austria/austrian_isil_merged.json")
DEFAULT_OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/austria_isil.yaml")

# Ordered (keywords, institution type) rules: the first rule with a keyword
# contained in the lower-cased name wins, so order is significant (e.g.
# "Universitätsbibliothek" is a LIBRARY because libraries are checked first).
# NOTE(review): plain substring matching also hits unrelated words such as
# "Stiftung" ('stift') or "gesamt" ('amt') - confirm whether that is intended.
_TYPE_RULES = (
    (('archiv',), 'ARCHIVE'),
    (('bibliothek', 'bücherei'), 'LIBRARY'),
    (('museum',), 'MUSEUM'),
    (('universität',), 'EDUCATION_PROVIDER'),
    (('fachhochschule',), 'EDUCATION_PROVIDER'),
    (('forschung',), 'RESEARCH_CENTER'),
    (('stift', 'kloster', 'kirch'), 'HOLY_SITES'),
    (('amt', 'landes'), 'OFFICIAL_INSTITUTION'),
)

# Common Austrian city codes found in ISIL identifiers.
_CITY_CODES = {
    'W': 'Wien',
    'WIEN': 'Wien',
    'WSTLA': 'Wien',
    'SBG': 'Salzburg',
    'STAR': 'Graz',
    'STARG': 'Graz',
    'LENT': 'Linz',
    'IBK': 'Innsbruck',
    'BLA': 'Eisenstadt',
    'KLA': 'Klagenfurt',
    'VLA': 'Bregenz',
    'NOe': 'St. Pölten',
    'OOe': 'Linz',
    'SLA': 'Salzburg',
}

# Same codes ordered longest first, so specific codes ('WSTLA') are tried
# before the generic ones they contain ('W'). sorted() is stable, so codes of
# equal length keep their declaration order.
_CITY_CODES_LONGEST_FIRST = sorted(_CITY_CODES.items(), key=lambda item: -len(item[0]))

# Single-letter GHCID codes for each institution type.
_GHCID_TYPE_CODES = {
    'GALLERY': 'G',
    'LIBRARY': 'L',
    'ARCHIVE': 'A',
    'MUSEUM': 'M',
    'OFFICIAL_INSTITUTION': 'O',
    'RESEARCH_CENTER': 'R',
    'CORPORATION': 'C',
    'UNKNOWN': 'U',
    'BOTANICAL_ZOO': 'B',
    'EDUCATION_PROVIDER': 'E',
    'COLLECTING_SOCIETY': 'S',
    'FEATURES': 'F',
    'INTANGIBLE_HERITAGE_GROUP': 'I',
    'MIXED': 'X',
    'PERSONAL_COLLECTION': 'P',
    'HOLY_SITES': 'H',
    'DIGITAL_PLATFORM': 'D',
    'NGO': 'N',
    'TASTE_SMELL': 'T',
}


def _strip_prefix(value: str, prefix: str) -> str:
    """Return *value* without a leading *prefix*; later occurrences are kept."""
    return value[len(prefix):] if value.startswith(prefix) else value


def infer_institution_type(name: str) -> str:
    """
    Infer institution type from German name.

    Maps to GLAMORCUBESFIXPHDNT taxonomy. The name is lower-cased and checked
    against the keyword rules in order; the first matching rule decides.

    Args:
        name: Institution name.

    Returns:
        The institution type string, or 'UNKNOWN' when no keyword matches.
    """
    name_lower = name.lower()
    for keywords, inst_type in _TYPE_RULES:
        if any(keyword in name_lower for keyword in keywords):
            return inst_type
    return 'UNKNOWN'


def extract_location_from_isil(isil: str) -> Dict[str, str]:
    """
    Extract location information from ISIL code structure.

    Austrian ISIL codes often encode location (e.g., AT-WSTLA for Vienna).
    A city code at the start of the local part takes precedence over one
    found elsewhere in it, and longer codes take precedence over shorter
    ones, so a stray 'W' cannot override a more specific code.

    Args:
        isil: ISIL identifier, e.g. 'AT-WSTLA'.

    Returns:
        A dict that always has 'country': 'AT' and additionally 'city' when
        one of the known city codes occurs in the identifier.
    """
    location = {'country': 'AT'}

    # Drop only the leading country prefix; an 'AT-' sequence inside the
    # local part (e.g. 'AT-STAT-01') belongs to the identifier.
    isil_suffix = _strip_prefix(isil, 'AT-')

    # Pass 1: the local part starts with a city code.
    for code, city in _CITY_CODES_LONGEST_FIRST:
        if isil_suffix.startswith(code):
            location['city'] = city
            return location

    # Pass 2: fall back to a city code anywhere in the local part.
    for code, city in _CITY_CODES_LONGEST_FIRST:
        if code in isil_suffix:
            location['city'] = city
            break

    return location


def create_ghcid(country: str, region: str, city: str, inst_type: str, abbreviation: str) -> str:
    """
    Generate GHCID following the project standard.

    Format: COUNTRY-REGION-CITY-TYPE-ABBREV, where TYPE is the single-letter
    code for *inst_type* ('U' when the type is not in the code table).

    Args:
        country: Country code.
        region: Region code.
        city: City code.
        inst_type: Institution type name, e.g. 'LIBRARY'.
        abbreviation: Institution abbreviation (for Austrian institutions the
            ISIL code suffix is meant to be used).

    Returns:
        The assembled GHCID string.
    """
    type_code = _GHCID_TYPE_CODES.get(inst_type, 'U')
    return f"{country}-{region}-{city}-{type_code}-{abbreviation}"


def parse_institution(inst_data: dict) -> dict:
    """
    Parse a single institution into LinkML format.

    Args:
        inst_data: Source record; must contain 'name' and an ISIL code under
            either 'isil_code' or 'isil'.

    Returns:
        A record dict with id, name, institution_type, identifiers,
        locations and provenance.

    Raises:
        KeyError: If 'name' is missing.
        ValueError: If no ISIL code is present.
    """
    name = inst_data['name']
    # Handle both field names
    isil = inst_data.get('isil_code') or inst_data.get('isil', '')

    if not isil:
        raise ValueError(f"No ISIL code found for institution: {name}")

    inst_type = infer_institution_type(name)
    location = extract_location_from_isil(isil)

    # Strip only the leading country prefix: removing every 'at-' would
    # mangle identifiers such as 'AT-STAT-01' and could make ids collide.
    local_id = _strip_prefix(isil.lower(), 'at-')

    return {
        'id': f"https://w3id.org/heritage/custodian/at/{local_id}",
        'name': name,
        'institution_type': inst_type,
        'identifiers': [
            {
                'identifier_scheme': 'ISIL',
                'identifier_value': isil,
                'identifier_url': f"https://permalink.obvsg.at/ais/{isil}",
            }
        ],
        # The location dict always carries at least the country.
        'locations': [location],
        'provenance': {
            'data_source': 'CSV_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Playwright MCP browser automation from Austrian ISIL database (https://www.isil.at)',
            'confidence_score': 0.95,  # High confidence for official registry data
        },
    }


def main(input_file: Optional[Path] = None, output_file: Optional[Path] = None) -> None:
    """
    Convert the merged Austrian ISIL JSON file into a LinkML YAML file.

    Records that cannot be parsed are reported on stdout and skipped.

    Args:
        input_file: JSON file with an 'institutions' list; defaults to
            DEFAULT_INPUT_FILE.
        output_file: Destination YAML file; defaults to DEFAULT_OUTPUT_FILE.
            Missing parent directories are created.
    """
    # Imported here because only the YAML export needs PyYAML; the parsing
    # helpers above stay importable without it.
    import yaml

    input_file = DEFAULT_INPUT_FILE if input_file is None else Path(input_file)
    output_file = DEFAULT_OUTPUT_FILE if output_file is None else Path(output_file)

    print("🔄 Parsing Austrian ISIL data to LinkML format...")
    print(f"📥 Input: {input_file}")
    print(f"📤 Output: {output_file}")
    print()

    # Load merged data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data['institutions']
    print(f"📊 Total institutions: {len(institutions)}")
    print()

    # Parse each institution; one bad record must not abort the whole run.
    parsed_records = []
    for inst in institutions:
        try:
            parsed_records.append(parse_institution(inst))
        except Exception as e:
            label = inst.get('name', 'unknown') if isinstance(inst, dict) else 'unknown'
            print(f"⚠️ Error parsing {label}: {e}")

    print(f"✅ Successfully parsed: {len(parsed_records)} institutions")
    print()

    # Type distribution, most frequent first
    type_counts = Counter(record['institution_type'] for record in parsed_records)
    print("📊 Institution Type Distribution:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print()

    # Save to YAML
    output_file.parent.mkdir(parents=True, exist_ok=True)

    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Database (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(parsed_records),
            'schema_version': 'v0.2.1',
            'data_tier': 'TIER_1_AUTHORITATIVE',
        },
        'institutions': parsed_records,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ LinkML data saved to: {output_file}")
    print()
    print("🎉 Parsing complete!")


if __name__ == "__main__":
    main()