glam/scripts/convert_switzerland_linkml.py
2025-11-19 23:25:22 +01:00

275 lines
9 KiB
Python

#!/usr/bin/env python3
"""
Convert Swiss ISIL data to LinkML format
Maps Swiss institution types to GLAMORCUBESFIXPHDNT taxonomy
Author: GLAM Data Extraction Project
Date: November 2025
"""
import json
import yaml
from pathlib import Path
from typing import List, Dict, Any, Optional
from datetime import datetime
import re
# Swiss ISIL category labels, grouped by the GLAM taxonomy class each one maps to.
# Group order and label order are kept stable so the flattened dict below has a
# deterministic insertion order.
_SWISS_LABELS_BY_GLAM_TYPE = (
    ('LIBRARY', (
        'University and research library',
        'Public library',
        'Special library',
        'Cantonal library',
        'National library',
        'School library',
        'Parliamentary library',
    )),
    ('ARCHIVE', (
        'Municipal archives or county/local authority archives',
        # Truncated variant of the label above (cut at 50 characters)
        'Municipal archives or county/local authority archi',
        'Church and religious archives',
        'Regional archives',
        'University and research archives',
        'Business archives',
        'Private persons and family archives',
        # NOTE(review): this label is also exactly 50 characters long, so it may
        # be a truncated form as well -- confirm against the source data.
        'Specialised non-governmental archives and archives',
        'Media archives',
        'National archives',
        'Cantonal archives',
    )),
    ('MUSEUM', (
        'Regional and local museums',
        'Historical museums',
        'Art museums',
        'Natural science museums',
        'Other museums',
        'Ethnographic museums',
        'Technical museums',
        'Science museums',
    )),
    ('EDUCATION_PROVIDER', (
        'University',
        'School',
    )),
    ('RESEARCH_CENTER', (
        'Research center',
        'Research institute',
    )),
)

# Flat lookup table: Swiss category label -> GLAM taxonomy class.
# Keys are compared verbatim (case-sensitive, no normalisation).
SWISS_TYPE_MAPPING = {
    label: glam_type
    for glam_type, labels in _SWISS_LABELS_BY_GLAM_TYPE
    for label in labels
}
def map_institution_type(swiss_categories: List[str]) -> str:
    """Translate Swiss ISIL category labels into a GLAM taxonomy class.

    Resolution happens in two stages:

    1. Exact lookup: the first label (in list order) that is a key of
       ``SWISS_TYPE_MAPPING`` decides the result.
    2. Keyword fallback: all labels are joined, lower-cased, and scanned for
       English/German/French keywords. Classes are tried in a fixed order
       (library, archive, museum, education, research); the first hit wins.

    Args:
        swiss_categories: Category labels of one institution. May be empty
            or ``None``.

    Returns:
        The taxonomy class name, or ``'UNKNOWN'`` when nothing matches or no
        categories were given.
    """
    if not swiss_categories:
        return 'UNKNOWN'

    # Stage 1: verbatim match against the known label table.
    exact_hit = next(
        (SWISS_TYPE_MAPPING[label]
         for label in swiss_categories
         if label in SWISS_TYPE_MAPPING),
        None,
    )
    if exact_hit is not None:
        return exact_hit

    # Stage 2: substring search over the combined, lower-cased labels.
    # Tuple order encodes precedence between the classes.
    keyword_table = (
        ('LIBRARY', ('library', 'bibliothek', 'bibliothèque')),
        ('ARCHIVE', ('archive', 'archiv')),
        ('MUSEUM', ('museum', 'musée')),
        ('EDUCATION_PROVIDER', ('university', 'universität', 'université')),
        ('RESEARCH_CENTER', ('research', 'forschung', 'recherche')),
    )
    haystack = ' '.join(swiss_categories).lower()
    for glam_type, keywords in keyword_table:
        if any(keyword in haystack for keyword in keywords):
            return glam_type

    return 'UNKNOWN'
def generate_id(inst: Dict[str, Any]) -> str:
    """Generate a W3ID URI for the institution.

    When the record carries a non-empty ``isil_code``, the URI is built from
    that code with the ``CH-`` marker removed and the rest lower-cased.
    Otherwise the URI is built from the canton and a slug of the name.

    Missing keys and explicit ``None`` values (JSON ``null``) are treated the
    same way: the name falls back to an empty string and the canton to
    ``'unknown'``. An empty canton string also falls back to ``'unknown'``.

    Args:
        inst: Raw institution record; reads ``isil_code``, ``name`` and
            ``canton``.

    Returns:
        URI of the form ``https://w3id.org/heritage/custodian/ch/<isil>`` or
        ``https://w3id.org/heritage/custodian/ch/<canton>/<name-slug>``.
    """
    base_uri = "https://w3id.org/heritage/custodian/ch"

    # Preferred: derive the ID from the ISIL code.
    isil_code = inst.get('isil_code')
    if isil_code:
        isil = isil_code.replace('CH-', '').lower()
        return f"{base_uri}/{isil}"

    # `or` instead of a .get() default: a key that is present but null would
    # otherwise return None and make .lower() raise AttributeError.
    name = inst.get('name') or ''
    canton = (inst.get('canton') or 'unknown').lower()

    # Every run of characters outside [a-z0-9] (including accented letters)
    # collapses into a single hyphen.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

    # Cap the slug at 50 characters, then drop the trailing segment so the
    # slug does not end in a word that was cut in half.
    if len(name_slug) > 50:
        name_slug = name_slug[:50].rsplit('-', 1)[0]

    return f"{base_uri}/{canton}/{name_slug}"
def convert_to_linkml(inst: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one Swiss ISIL institution record to a HeritageCustodian dict.

    Optional sections (alternative names, description, identifiers, contact
    info, homepage, digital platforms) are only emitted when the source
    record has a truthy value for them. ``locations`` and ``provenance`` are
    always emitted. ``categories`` and ``contact`` may be missing or ``None``
    (JSON ``null``); both cases are treated as empty.

    Args:
        inst: Raw institution record. Keys read: ``name``,
            ``alternative_name``, ``description``, ``city``, ``postal_code``,
            ``street``, ``canton``, ``isil_code``, ``detail_url``,
            ``categories`` (list of labels) and ``contact`` (dict with
            ``website``, ``email``, ``phone``).

    Returns:
        The record as a plain dict. Key insertion order is significant
        because the YAML writer in this file is called with
        ``sort_keys=False``.
    """
    # `or` fallbacks: a key that is present but null would otherwise yield
    # None and break .get() / str.join() further down.
    categories = inst.get('categories') or []
    contact = inst.get('contact') or {}
    website = contact.get('website')

    # Core fields
    record = {
        'id': generate_id(inst),
        'name': inst.get('name', ''),
        'institution_type': map_institution_type(categories),
    }

    # Alternative names
    if inst.get('alternative_name'):
        record['alternative_names'] = [inst['alternative_name']]

    # Description
    if inst.get('description'):
        record['description'] = inst['description']

    # Location: the country is always set, so the list is never empty.
    location = {}
    if inst.get('city'):
        location['city'] = inst['city']
    if inst.get('postal_code'):
        location['postal_code'] = inst['postal_code']
    if inst.get('street'):
        location['street_address'] = inst['street']
    if inst.get('canton'):
        location['region'] = inst['canton']
    location['country'] = 'CH'  # Switzerland
    record['locations'] = [location]

    # Identifiers: the ISIL code and, when known, the website
    identifiers = []
    if inst.get('isil_code'):
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': inst['isil_code'],
            'identifier_url': f"https://www.isil.nb.admin.ch/en/?isil={inst['isil_code']}"
        })
    if website:
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': website,
            'identifier_url': website
        })
    if identifiers:
        record['identifiers'] = identifiers

    # Contact information
    contact_info = {}
    if contact.get('email'):
        contact_info['email'] = contact['email']
    if contact.get('phone'):
        contact_info['telephone'] = contact['phone']
    if contact_info:
        record['contact_info'] = contact_info

    # Homepage (the same URL that is listed as the Website identifier)
    if website:
        record['homepage'] = website

    # Digital platforms (if detail URL available)
    if inst.get('detail_url'):
        record['digital_platforms'] = [{
            'platform_name': 'Swiss ISIL Directory',
            'platform_url': inst['detail_url'],
            'platform_type': 'DISCOVERY_PORTAL'
        }]

    # Provenance
    # NOTE(review): extraction_date is a naive local timestamp taken per
    # record, so values differ slightly between records of the same run.
    record['provenance'] = {
        'data_source': 'CSV_REGISTRY',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'extraction_date': datetime.now().isoformat(),
        'extraction_method': 'Web scraping from Swiss National Library ISIL directory (https://www.isil.nb.admin.ch)',
        'confidence_score': 0.95,
        'source_url': 'https://www.isil.nb.admin.ch/en/',
        'notes': f"Scraped from Swiss ISIL directory. Swiss categories: {'; '.join(categories)}"
    }

    return record
def convert_all(json_file: Path, yaml_file: Path):
    """Convert every institution in a JSON dump and export YAML plus JSON-LD.

    Reads the institution records from ``json_file``, converts each with
    ``convert_to_linkml``, prints a per-type distribution, and writes two
    files: the YAML document at ``yaml_file`` and a JSON-LD document at the
    same path with the suffix replaced by ``.jsonld``.

    Args:
        json_file: Path of the input JSON file (UTF-8).
        yaml_file: Path of the YAML file to write.
    """
    print(f"Loading data from {json_file}")
    with open(json_file, 'r', encoding='utf-8') as source:
        institutions = json.load(source)
    print(f"Loaded {len(institutions)} institutions")
    print("Converting to LinkML format...")

    # Convert first, then tally the resulting types in a second pass.
    linkml_records = [convert_to_linkml(entry) for entry in institutions]
    type_counts = {}
    for converted in linkml_records:
        kind = converted['institution_type']
        type_counts[kind] = type_counts.get(kind, 0) + 1

    total = len(linkml_records)
    print(f"\nConverted {total} institutions")
    print("\nInstitution type distribution:")
    # Most frequent type first; ties keep first-seen order (stable sort).
    ranked = sorted(type_counts.items(), key=lambda pair: pair[1], reverse=True)
    for inst_type, count in ranked:
        print(f" {inst_type:25s}: {count:4d} ({count/total*100:5.1f}%)")

    # YAML export
    print(f"\nWriting LinkML YAML to {yaml_file}")
    with open(yaml_file, 'w', encoding='utf-8') as yaml_out:
        yaml.dump(
            linkml_records,
            yaml_out,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=100,
        )
    print(f"✓ Successfully exported {total} institutions to LinkML YAML")

    # JSON-LD export: the same records wrapped in an @context/@graph envelope
    jsonld_file = yaml_file.with_suffix('.jsonld')
    print(f"\nWriting JSON-LD to {jsonld_file}")
    jsonld_data = {
        '@context': 'https://w3id.org/heritage/custodian/context.jsonld',
        '@graph': linkml_records,
    }
    with open(jsonld_file, 'w', encoding='utf-8') as jsonld_out:
        json.dump(jsonld_data, jsonld_out, ensure_ascii=False, indent=2)
    print("✓ Successfully exported to JSON-LD")
def main(base_dir: Optional[Path] = None, instances_dir: Optional[Path] = None):
    """Run the Swiss ISIL to LinkML conversion.

    Called without arguments it uses the original hard-coded project paths.
    Both locations can be overridden so the script also works on other
    machines and checkouts.

    Args:
        base_dir: Directory containing ``swiss_isil_complete_final.json``.
            Defaults to the project's Swiss ISIL data directory.
        instances_dir: Directory that receives ``switzerland_isil.yaml`` and
            the matching ``.jsonld`` file. Created, including missing parent
            directories, if it does not exist. Defaults to the project's
            instances directory.

    Returns:
        None. If the input JSON file is missing, an error is printed and
        nothing is created or written.
    """
    if base_dir is None:
        base_dir = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    if instances_dir is None:
        instances_dir = Path("/Users/kempersc/apps/glam/data/instances")
    # Accept plain strings as well as Path objects.
    base_dir = Path(base_dir)
    instances_dir = Path(instances_dir)

    json_file = base_dir / "swiss_isil_complete_final.json"
    yaml_file = instances_dir / "switzerland_isil.yaml"

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return

    # parents=True: do not fail when an intermediate directory is missing.
    instances_dir.mkdir(parents=True, exist_ok=True)

    convert_all(json_file, yaml_file)

    print("\n✓ LinkML conversion complete!")
    print(f" YAML output: {yaml_file}")
    print(f" JSON-LD output: {yaml_file.with_suffix('.jsonld')}")


if __name__ == "__main__":
    main()