#!/usr/bin/env python3
"""
Convert Swiss ISIL data to LinkML format
Maps Swiss institution types to GLAMORCUBESFIXPHDNT taxonomy

Author: GLAM Data Extraction Project
Date: November 2025
"""

import json
import re
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

# NOTE: PyYAML is imported inside convert_all(), the only function that needs
# it, so the pure mapping helpers below stay importable without it.

# Mapping Swiss institution types to GLAM taxonomy
SWISS_TYPE_MAPPING = {
    # Libraries
    'University and research library': 'LIBRARY',
    'Public library': 'LIBRARY',
    'Special library': 'LIBRARY',
    'Cantonal library': 'LIBRARY',
    'National library': 'LIBRARY',
    'School library': 'LIBRARY',
    'Parliamentary library': 'LIBRARY',
    # Archives
    'Municipal archives or county/local authority archives': 'ARCHIVE',
    'Municipal archives or county/local authority archi': 'ARCHIVE',  # Truncated
    'Church and religious archives': 'ARCHIVE',
    'Regional archives': 'ARCHIVE',
    'University and research archives': 'ARCHIVE',
    'Business archives': 'ARCHIVE',
    'Private persons and family archives': 'ARCHIVE',
    'Specialised non-governmental archives and archives': 'ARCHIVE',
    'Media archives': 'ARCHIVE',
    'National archives': 'ARCHIVE',
    'Cantonal archives': 'ARCHIVE',
    # Museums
    'Regional and local museums': 'MUSEUM',
    'Historical museums': 'MUSEUM',
    'Art museums': 'MUSEUM',
    'Natural science museums': 'MUSEUM',
    'Other museums': 'MUSEUM',
    'Ethnographic museums': 'MUSEUM',
    'Technical museums': 'MUSEUM',
    'Science museums': 'MUSEUM',
    # Education
    'University': 'EDUCATION_PROVIDER',
    'School': 'EDUCATION_PROVIDER',
    # Research
    'Research center': 'RESEARCH_CENTER',
    'Research institute': 'RESEARCH_CENTER',
}

# Keyword fallbacks used when no category matches SWISS_TYPE_MAPPING exactly.
# Order matters: the first group with a hit wins, so "university library"
# resolves to LIBRARY. Keywords are English, German, French and Italian.
_KEYWORD_FALLBACKS = (
    ('LIBRARY', ('library', 'bibliothek', 'bibliothèque', 'biblioteca')),
    ('ARCHIVE', ('archive', 'archiv')),  # 'archiv' also covers 'archives', 'archivio'
    ('MUSEUM', ('museum', 'musée', 'museo')),
    ('EDUCATION_PROVIDER', ('university', 'universität', 'université', 'università')),
    ('RESEARCH_CENTER', ('research', 'forschung', 'recherche', 'ricerca')),
)

_ID_BASE = "https://w3id.org/heritage/custodian/ch"
_ISIL_PREFIX = 'CH-'
_MAX_SLUG_LENGTH = 50


def _clean_categories(value: Any) -> List[str]:
    """Return *value* as a list of non-empty, whitespace-stripped strings.

    Accepts a list, a single string, or None; entries that are not strings
    (for example a JSON ``null``) are dropped instead of crashing later.
    """
    if not value:
        return []
    if isinstance(value, str):
        value = [value]
    return [item.strip() for item in value if isinstance(item, str) and item.strip()]


def _slugify(text: str) -> str:
    """Lower-case *text* and collapse every non ``[a-z0-9]`` run into one '-'."""
    return re.sub(r'[^a-z0-9]+', '-', text.lower()).strip('-')


def map_institution_type(swiss_categories: List[str]) -> str:
    """Map Swiss institution type to GLAM taxonomy

    Args:
        swiss_categories: Category labels of one institution. A single string
            or a list containing non-string entries is tolerated.

    Returns:
        The taxonomy value of the first category found in SWISS_TYPE_MAPPING,
        otherwise the first keyword-fallback hit, otherwise 'UNKNOWN'.
    """
    categories = _clean_categories(swiss_categories)
    if not categories:
        return 'UNKNOWN'

    # Exact match: the first category with a known label wins.
    for category in categories:
        if category in SWISS_TYPE_MAPPING:
            return SWISS_TYPE_MAPPING[category]

    # Fallback: substring match over all categories joined together.
    category_text = ' '.join(categories).lower()
    for glam_type, keywords in _KEYWORD_FALLBACKS:
        if any(keyword in category_text for keyword in keywords):
            return glam_type

    return 'UNKNOWN'


def generate_id(inst: Dict[str, Any]) -> str:
    """Generate a W3ID URI for the institution

    Args:
        inst: Institution record; reads 'isil_code', 'name' and 'canton'.

    Returns:
        ``<base>/<isil without CH- prefix>`` when an ISIL code is present,
        otherwise ``<base>/<canton>/<name slug>``.
    """
    # Use ISIL code if available
    isil_code = inst.get('isil_code')
    if isil_code:
        isil = str(isil_code).strip()
        # Drop only the leading country prefix, never later occurrences.
        if isil.upper().startswith(_ISIL_PREFIX):
            isil = isil[len(_ISIL_PREFIX):]
        return f"{_ID_BASE}/{isil.lower()}"

    # Otherwise create from name and canton. `or` also covers keys that are
    # present but null in the source JSON.
    name_slug = _slugify(str(inst.get('name') or ''))
    # Slug the canton too so values such as "St. Gallen" stay URI-safe;
    # plain codes like "ZH" come out as before ("zh").
    canton = _slugify(str(inst.get('canton') or '')) or 'unknown'

    # Limit slug length, cutting back to the previous word boundary.
    # The name-slug algorithm is kept as-is so existing IDs remain stable.
    if len(name_slug) > _MAX_SLUG_LENGTH:
        name_slug = name_slug[:_MAX_SLUG_LENGTH].rsplit('-', 1)[0]

    return f"{_ID_BASE}/{canton}/{name_slug}"


def convert_to_linkml(inst: Dict[str, Any],
                      extraction_date: Optional[str] = None) -> Dict[str, Any]:
    """Convert Swiss ISIL institution to LinkML HeritageCustodian format

    Args:
        inst: One institution record from the scraped Swiss ISIL JSON.
        extraction_date: ISO timestamp stored in the provenance block.
            Defaults to the current local time; pass one value for a whole
            batch so every record carries the same timestamp.

    Returns:
        A dict with 'id', 'name', 'institution_type', 'locations' and
        'provenance', plus the optional sections for which *inst* has data.
        Key insertion order is significant: it is the output field order.
    """
    categories = _clean_categories(inst.get('categories'))

    # Build LinkML record
    record: Dict[str, Any] = {
        'id': generate_id(inst),
        'name': inst.get('name') or '',
        'institution_type': map_institution_type(categories),
    }

    # Alternative names
    if inst.get('alternative_name'):
        record['alternative_names'] = [inst['alternative_name']]

    # Description
    if inst.get('description'):
        record['description'] = inst['description']

    # Location: copy whichever address parts are present. The country is
    # always set, so a location entry is always emitted.
    location: Dict[str, Any] = {}
    for source_key, target_key in (
        ('city', 'city'),
        ('postal_code', 'postal_code'),
        ('street', 'street_address'),
        ('canton', 'region'),
    ):
        if inst.get(source_key):
            location[target_key] = inst[source_key]
    location['country'] = 'CH'  # Switzerland
    record['locations'] = [location]

    # A null or malformed 'contact' is treated as "no contact data".
    contact = inst.get('contact')
    if not isinstance(contact, dict):
        contact = {}
    website = contact.get('website')

    # Identifiers
    identifiers = []
    isil_code = inst.get('isil_code')
    if isil_code:
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil_code,
            'identifier_url': f"https://www.isil.nb.admin.ch/en/?isil={isil_code}"
        })
    # Website as identifier
    if website:
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': website,
            'identifier_url': website
        })
    if identifiers:
        record['identifiers'] = identifiers

    # Contact information
    contact_info = {}
    if contact.get('email'):
        contact_info['email'] = contact['email']
    if contact.get('phone'):
        contact_info['telephone'] = contact['phone']
    if contact_info:
        record['contact_info'] = contact_info

    # Homepage
    if website:
        record['homepage'] = website

    # Digital platforms (if detail URL available)
    if inst.get('detail_url'):
        record['digital_platforms'] = [{
            'platform_name': 'Swiss ISIL Directory',
            'platform_url': inst['detail_url'],
            'platform_type': 'DISCOVERY_PORTAL'
        }]

    # Provenance
    if extraction_date is None:
        extraction_date = datetime.now().isoformat()
    record['provenance'] = {
        'data_source': 'CSV_REGISTRY',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'extraction_date': extraction_date,
        'extraction_method': 'Web scraping from Swiss National Library ISIL directory (https://www.isil.nb.admin.ch)',
        'confidence_score': 0.95,
        'source_url': 'https://www.isil.nb.admin.ch/en/',
        'notes': f"Scraped from Swiss ISIL directory. Swiss categories: {'; '.join(categories)}"
    }

    return record


def convert_all(json_file: Path, yaml_file: Path):
    """Convert all Swiss ISIL institutions to LinkML YAML

    Reads a JSON array of institution records from *json_file*, writes the
    converted records to *yaml_file* and to a sibling ``.jsonld`` file, and
    prints a per-type summary.

    Args:
        json_file: Path of the input JSON file.
        yaml_file: Path of the YAML output; its directory is created if
            missing.

    Raises:
        ValueError: If the JSON document is not a list.
    """
    import yaml  # third-party (PyYAML); only needed here

    json_file = Path(json_file)
    yaml_file = Path(yaml_file)

    print(f"Loading data from {json_file}")
    with open(json_file, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a JSON array of institutions in {json_file}, "
            f"got {type(institutions).__name__}"
        )

    print(f"Loaded {len(institutions)} institutions")
    print("Converting to LinkML format...")

    # Convert all institutions, stamping the batch with a single timestamp.
    batch_timestamp = datetime.now().isoformat()
    linkml_records = [
        convert_to_linkml(inst, extraction_date=batch_timestamp)
        for inst in institutions
    ]
    type_counts = Counter(record['institution_type'] for record in linkml_records)

    print(f"\nConverted {len(linkml_records)} institutions")
    print("\nInstitution type distribution:")
    for inst_type, count in type_counts.most_common():
        print(f"  {inst_type:25s}: {count:4d} ({count/len(linkml_records)*100:5.1f}%)")

    # Write YAML
    yaml_file.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nWriting LinkML YAML to {yaml_file}")
    with open(yaml_file, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_records, f, allow_unicode=True, default_flow_style=False,
                  sort_keys=False, width=100)

    print(f"✓ Successfully exported {len(linkml_records)} institutions to LinkML YAML")

    # Also write as JSON-LD
    jsonld_file = yaml_file.with_suffix('.jsonld')
    print(f"\nWriting JSON-LD to {jsonld_file}")

    jsonld_data = {
        '@context': 'https://w3id.org/heritage/custodian/context.jsonld',
        '@graph': linkml_records
    }

    with open(jsonld_file, 'w', encoding='utf-8') as f:
        json.dump(jsonld_data, f, ensure_ascii=False, indent=2)

    print("✓ Successfully exported to JSON-LD")


def main(json_file: Optional[Path] = None, yaml_file: Optional[Path] = None) -> int:
    """Main conversion function

    Args:
        json_file: Input JSON path; defaults to the project's Swiss ISIL dump.
        yaml_file: Output YAML path; defaults to the project's instances
            directory.

    Returns:
        0 on success, 1 when the input file does not exist.
    """
    base_dir = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    instances_dir = Path("/Users/kempersc/apps/glam/data/instances")

    if json_file is None:
        json_file = base_dir / "swiss_isil_complete_final.json"
    if yaml_file is None:
        yaml_file = instances_dir / "switzerland_isil.yaml"
    json_file = Path(json_file)
    yaml_file = Path(yaml_file)

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return 1

    # Create instances directory if needed (including missing parents)
    yaml_file.parent.mkdir(parents=True, exist_ok=True)

    convert_all(json_file, yaml_file)

    print("\n✓ LinkML conversion complete!")
    print(f"  YAML output: {yaml_file}")
    print(f"  JSON-LD output: {yaml_file.with_suffix('.jsonld')}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())