236 lines
6.9 KiB
Python
236 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Parse Austrian ISIL data into LinkML-compliant YAML format.
|
|
|
|
Converts the merged Austrian ISIL JSON into HeritageCustodian records
|
|
following the modular LinkML schema v0.2.1.
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Optional
|
|
|
|
|
|
def infer_institution_type(name: str) -> str:
    """
    Classify an institution by keywords found in its German name.

    The name is lower-cased and checked against an ordered list of keyword
    groups. The first group with any keyword occurring as a substring
    decides the result, so earlier groups take precedence over later ones.

    Returns a GLAMORCUBESFIXPHDNT taxonomy label, or 'UNKNOWN' when no
    keyword matches.
    """
    # Order matters: e.g. "Landesarchiv" must resolve to ARCHIVE before the
    # 'landes' keyword of the government rule gets a chance to match.
    keyword_rules = (
        (('archiv',), 'ARCHIVE'),                            # archives
        (('bibliothek', 'bücherei'), 'LIBRARY'),             # libraries
        (('museum',), 'MUSEUM'),                             # museums
        (('universität',), 'EDUCATION_PROVIDER'),            # universities
        (('fachhochschule',), 'EDUCATION_PROVIDER'),         # applied sciences
        (('forschung',), 'RESEARCH_CENTER'),                 # research
        (('stift', 'kloster', 'kirch'), 'HOLY_SITES'),       # religious
        (('amt', 'landes'), 'OFFICIAL_INSTITUTION'),         # government
    )

    lowered = name.lower()
    for keywords, label in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return label

    # Nothing matched.
    return 'UNKNOWN'
|
|
|
|
|
|
def extract_location_from_isil(isil: str) -> Dict[str, str]:
    """
    Extract location information from ISIL code structure.

    Austrian ISIL codes often encode location (e.g., AT-WSTLA for Vienna).
    The part after the leading 'AT-' is compared against a table of known
    city codes. When several codes occur in the ISIL, the best one is chosen:
    a code the suffix *starts with* beats one that merely occurs somewhere
    inside it, and among those the longest code wins. This keeps the
    single-letter code 'W' from shadowing more specific codes such as 'SBG'.

    Args:
        isil: ISIL identifier, e.g. 'AT-WSTLA'.

    Returns:
        A dict that always contains 'country': 'AT' and additionally
        'city' when a known city code was found in the ISIL.
    """
    location = {'country': 'AT'}

    # Common Austrian city codes in ISIL
    city_codes = {
        'W': 'Wien',
        'WIEN': 'Wien',
        'WSTLA': 'Wien',
        'SBG': 'Salzburg',
        'STAR': 'Graz',
        'STARG': 'Graz',
        'LENT': 'Linz',
        'IBK': 'Innsbruck',
        'BLA': 'Eisenstadt',
        'KLA': 'Klagenfurt',
        'VLA': 'Bregenz',
        'NOe': 'St. Pölten',
        'OOe': 'Linz',
        'SLA': 'Salzburg',
    }

    # Strip only the leading country prefix; str.replace would also remove
    # any later 'AT-' occurrence inside the identifier.
    country_prefix = 'AT-'
    if isil.startswith(country_prefix):
        isil_suffix = isil[len(country_prefix):]
    else:
        isil_suffix = isil

    # Every code that occurs anywhere in the suffix is a candidate (this also
    # covers prefix matches, since a prefix is a substring).
    candidates = [code for code in city_codes if code in isil_suffix]
    if candidates:
        # Rank: prefix match first, then longest code. max() keeps the first
        # of equally ranked candidates, i.e. table order breaks ties.
        # NOTE(review): a lone substring hit on the one-letter code 'W' is a
        # weak signal; it is kept as a last resort for backward compatibility.
        best = max(
            candidates,
            key=lambda code: (isil_suffix.startswith(code), len(code)),
        )
        location['city'] = city_codes[best]

    return location
|
|
|
|
|
|
def create_ghcid(country: str, region: str, city: str, inst_type: str, abbreviation: str) -> str:
    """
    Build a GHCID string of the form COUNTRY-REGION-CITY-TYPE-ABBREV.

    The institution type is reduced to a single letter via a fixed lookup
    table; a type that is not in the table is encoded as 'U'. All other
    parts are inserted unchanged.
    """
    # One-letter code per institution type.
    letter_by_type = dict(
        GALLERY='G',
        LIBRARY='L',
        ARCHIVE='A',
        MUSEUM='M',
        OFFICIAL_INSTITUTION='O',
        RESEARCH_CENTER='R',
        CORPORATION='C',
        UNKNOWN='U',
        BOTANICAL_ZOO='B',
        EDUCATION_PROVIDER='E',
        COLLECTING_SOCIETY='S',
        FEATURES='F',
        INTANGIBLE_HERITAGE_GROUP='I',
        MIXED='X',
        PERSONAL_COLLECTION='P',
        HOLY_SITES='H',
        DIGITAL_PLATFORM='D',
        NGO='N',
        TASTE_SMELL='T',
    )

    try:
        letter = letter_by_type[inst_type]
    except KeyError:
        # Unrecognised types fall back to the 'unknown' letter.
        letter = 'U'

    geo_part = f"{country}-{region}-{city}"
    return f"{geo_part}-{letter}-{abbreviation}"
|
|
|
|
|
|
def parse_institution(inst_data: dict) -> dict:
    """
    Parse a single institution into LinkML format.

    Args:
        inst_data: Raw institution entry. Must contain 'name' and an ISIL
            code under either 'isil_code' or 'isil'.

    Returns:
        A record dict with 'id', 'name', 'institution_type', 'identifiers',
        'locations' and 'provenance' keys.

    Raises:
        KeyError: If 'name' is missing from ``inst_data``.
        ValueError: If no non-empty ISIL code is present.
    """
    name = inst_data['name']
    isil = inst_data.get('isil_code') or inst_data.get('isil', '')  # Handle both field names

    if not isil:
        raise ValueError(f"No ISIL code found for institution: {name}")

    # Infer institution type from the name and location from the ISIL code.
    inst_type = infer_institution_type(name)
    location = extract_location_from_isil(isil)

    # Build the id slug from the lower-cased ISIL without its country prefix.
    # Only the leading 'at-' is removed: a blanket str.replace would also eat
    # later occurrences (e.g. 'AT-STAT-1' would collapse to 'st1').
    slug = isil.lower()
    if slug.startswith('at-'):
        slug = slug[len('at-'):]

    record = {
        'id': f"https://w3id.org/heritage/custodian/at/{slug}",
        'name': name,
        'institution_type': inst_type,
        'identifiers': [
            {
                'identifier_scheme': 'ISIL',
                'identifier_value': isil,
                'identifier_url': f"https://permalink.obvsg.at/ais/{isil}",
            }
        ],
        'locations': [location] if location else None,
        'provenance': {
            'data_source': 'CSV_REGISTRY',
            'data_tier': 'TIER_1_AUTHORITATIVE',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Playwright MCP browser automation from Austrian ISIL database (https://www.isil.at)',
            'confidence_score': 0.95,  # High confidence for official registry data
        },
    }

    return record
|
|
|
|
|
|
def main(input_file: Optional[Path] = None, output_file: Optional[Path] = None) -> None:
    """
    Convert the merged Austrian ISIL JSON file into a LinkML YAML file.

    Loads the JSON document, parses every entry under its 'institutions'
    key with ``parse_institution``, prints progress and a per-type summary,
    and writes the parsed records plus a metadata header as YAML.

    Args:
        input_file: Path of the merged JSON input. Defaults to the
            project's ``austrian_isil_merged.json`` location.
        output_file: Path of the YAML output. Defaults to the project's
            ``austria_isil.yaml`` location. Missing parent directories
            are created.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    if input_file is None:
        input_file = Path("/Users/kempersc/apps/glam/data/isil/austria/austrian_isil_merged.json")
    else:
        input_file = Path(input_file)

    if output_file is None:
        output_file = Path("/Users/kempersc/apps/glam/data/instances/austria_isil.yaml")
    else:
        output_file = Path(output_file)

    print("🔄 Parsing Austrian ISIL data to LinkML format...")
    print(f"📥 Input: {input_file}")
    print(f"📤 Output: {output_file}")
    print()

    # Load merged data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = data['institutions']
    print(f"📊 Total institutions: {len(institutions)}")
    print()

    # Parse each institution. This is a deliberate best-effort loop: a bad
    # entry is reported and skipped so one record cannot abort the whole run.
    parsed_records = []
    for inst in institutions:
        try:
            record = parse_institution(inst)
        except Exception as e:
            print(f"⚠️ Error parsing {inst.get('name', 'unknown')}: {e}")
        else:
            parsed_records.append(record)

    print(f"✅ Successfully parsed: {len(parsed_records)} institutions")
    print()

    # Type distribution, most frequent first (ties keep first-seen order).
    type_counts = Counter(record['institution_type'] for record in parsed_records)

    print("📊 Institution Type Distribution:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print()

    # Save to YAML
    output_file.parent.mkdir(parents=True, exist_ok=True)

    output_data = {
        'metadata': {
            'source': 'Austrian ISIL Database (https://www.isil.at)',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'total_institutions': len(parsed_records),
            'schema_version': 'v0.2.1',
            'data_tier': 'TIER_1_AUTHORITATIVE',
        },
        'institutions': parsed_records,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(output_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"✅ LinkML data saved to: {output_file}")
    print()
    print("🎉 Parsing complete!")
|
|
|
|
|
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|