#!/usr/bin/env python3
"""
Convert Swiss ISIL data to LinkML format.

Maps Swiss institution types to the GLAMORCUBESFIXPHDNT taxonomy.

Author: GLAM Data Extraction Project
Date: November 2025
"""
|
|
|
|
import json
import re
import unicodedata
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

import yaml
|
|
|
|
# Lookup table from Swiss ISIL directory category labels to GLAM taxonomy
# values.  Labels are grouped by the taxonomy value they share; group order
# and label order define the iteration order of the resulting dict.
SWISS_TYPE_MAPPING = {
    # Libraries
    **dict.fromkeys((
        'University and research library',
        'Public library',
        'Special library',
        'Cantonal library',
        'National library',
        'School library',
        'Parliamentary library',
    ), 'LIBRARY'),

    # Archives
    **dict.fromkeys((
        'Municipal archives or county/local authority archives',
        # Truncated variant of the label above.
        'Municipal archives or county/local authority archi',
        'Church and religious archives',
        'Regional archives',
        'University and research archives',
        'Business archives',
        'Private persons and family archives',
        # NOTE(review): this label is also exactly 50 characters long, so it
        # is presumably cut off the same way -- confirm against the source data.
        'Specialised non-governmental archives and archives',
        'Media archives',
        'National archives',
        'Cantonal archives',
    ), 'ARCHIVE'),

    # Museums
    **dict.fromkeys((
        'Regional and local museums',
        'Historical museums',
        'Art museums',
        'Natural science museums',
        'Other museums',
        'Ethnographic museums',
        'Technical museums',
        'Science museums',
    ), 'MUSEUM'),

    # Education
    **dict.fromkeys((
        'University',
        'School',
    ), 'EDUCATION_PROVIDER'),

    # Research
    **dict.fromkeys((
        'Research center',
        'Research institute',
    ), 'RESEARCH_CENTER'),
}
|
|
|
|
def map_institution_type(swiss_categories: List[str]) -> str:
    """Translate Swiss category labels into a GLAM taxonomy value.

    Each label is first looked up verbatim in ``SWISS_TYPE_MAPPING``; the
    first label that has an entry decides the result.  If no label matches,
    all labels are joined, lower-cased and scanned for English, German and
    French keywords, tested in the fixed order library, archive, museum,
    university, research.

    Args:
        swiss_categories: Category labels attached to one institution.

    Returns:
        The taxonomy value, or ``'UNKNOWN'`` when the input is empty or
        neither lookup stage finds a match.
    """
    if not swiss_categories:
        return 'UNKNOWN'

    # Stage 1: exact label lookup -- the first known label wins.
    exact_hit = next(
        (SWISS_TYPE_MAPPING[label]
         for label in swiss_categories
         if label in SWISS_TYPE_MAPPING),
        None,
    )
    if exact_hit is not None:
        return exact_hit

    # Stage 2: keyword search over the combined text.  Rule order matters:
    # an earlier rule shadows later ones when several keywords are present.
    keyword_rules = (
        ('LIBRARY', ('library', 'bibliothek', 'bibliothèque')),
        ('ARCHIVE', ('archive', 'archiv')),
        ('MUSEUM', ('museum', 'musée')),
        ('EDUCATION_PROVIDER', ('university', 'universität', 'université')),
        ('RESEARCH_CENTER', ('research', 'forschung', 'recherche')),
    )
    haystack = ' '.join(swiss_categories).lower()
    for glam_type, keywords in keyword_rules:
        if any(keyword in haystack for keyword in keywords):
            return glam_type

    return 'UNKNOWN'
|
|
|
|
def generate_id(inst: Dict[str, Any]) -> str:
    """Generate a W3ID URI for the institution.

    When the record has an ISIL code, the URI is built from the lower-cased
    code with its leading ``CH-`` country prefix removed.  Otherwise the URI
    is built from the canton and a slug derived from the institution name.

    Args:
        inst: Institution record.  Reads ``isil_code``, ``name`` and
            ``canton``; each may be missing, ``None`` or empty.

    Returns:
        ``https://w3id.org/heritage/custodian/ch/<isil>`` or
        ``https://w3id.org/heritage/custodian/ch/<canton>/<name-slug>``.
        The canton segment is ``unknown`` when no canton is given.
    """
    # Use ISIL code if available
    isil_code = inst.get('isil_code')
    if isil_code:
        isil = isil_code.lower()
        # Strip only the leading country prefix; a blanket replace() would
        # also remove any later "CH-" sequence inside the code.
        if isil.startswith('ch-'):
            isil = isil[len('ch-'):]
        return f"https://w3id.org/heritage/custodian/ch/{isil}"

    # Otherwise create from name and canton.  `or` covers keys that are
    # present but None/empty, which a .get() default does not.
    name = inst.get('name') or ''
    canton = (inst.get('canton') or 'unknown').lower()

    # Fold accented letters to their base letter (e-grave -> e, u-umlaut -> u)
    # before slugging, so they are kept instead of being turned into '-'.
    decomposed = unicodedata.normalize('NFKD', name)
    ascii_like = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    name_slug = re.sub(r'[^a-z0-9]+', '-', ascii_like.lower()).strip('-')

    # Limit slug length, cutting back to the last complete word
    if len(name_slug) > 50:
        name_slug = name_slug[:50].rsplit('-', 1)[0]

    return f"https://w3id.org/heritage/custodian/ch/{canton}/{name_slug}"
|
|
|
|
def convert_to_linkml(inst: Dict[str, Any]) -> Dict[str, Any]:
    """Convert Swiss ISIL institution to LinkML HeritageCustodian format.

    Args:
        inst: Source institution record.  Keys read: ``name``,
            ``alternative_name``, ``description``, ``city``, ``postal_code``,
            ``street``, ``canton``, ``isil_code``, ``categories``,
            ``contact`` (with ``email``, ``phone``, ``website``) and
            ``detail_url``.  Every key may be missing, ``None`` or empty.

    Returns:
        A dict that always holds ``id``, ``name``, ``institution_type``,
        ``locations`` and ``provenance``.  ``alternative_names``,
        ``description``, ``identifiers``, ``contact_info``, ``homepage`` and
        ``digital_platforms`` are added only when the source has a value
        for them.  Keys are inserted in that fixed order.
    """
    # A key that is present but null comes back from .get() as None (the
    # default only applies to a missing key), so normalise both containers
    # before calling methods on them.
    categories = inst.get('categories') or []
    contact = inst.get('contact') or {}

    # Build LinkML record
    record = {
        'id': generate_id(inst),
        'name': inst.get('name', ''),
        'institution_type': map_institution_type(categories),
    }

    # Alternative names
    if inst.get('alternative_name'):
        record['alternative_names'] = [inst['alternative_name']]

    # Description
    if inst.get('description'):
        record['description'] = inst['description']

    # Location: copy the populated address parts under their target names.
    location = {}
    for source_key, target_key in (
        ('city', 'city'),
        ('postal_code', 'postal_code'),
        ('street', 'street_address'),
        ('canton', 'region'),
    ):
        if inst.get(source_key):
            location[target_key] = inst[source_key]
    location['country'] = 'CH'  # Switzerland
    # The country is always set, so a location entry is always emitted.
    record['locations'] = [location]

    # Identifiers
    identifiers = []

    # ISIL code
    if inst.get('isil_code'):
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': inst['isil_code'],
            'identifier_url': f"https://www.isil.nb.admin.ch/en/?isil={inst['isil_code']}"
        })

    # Website as identifier
    website = contact.get('website')
    if website:
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': website,
            'identifier_url': website
        })

    if identifiers:
        record['identifiers'] = identifiers

    # Contact information
    contact_info = {}
    if contact.get('email'):
        contact_info['email'] = contact['email']
    if contact.get('phone'):
        contact_info['telephone'] = contact['phone']
    if contact_info:
        record['contact_info'] = contact_info

    # Homepage
    if website:
        record['homepage'] = website

    # Digital platforms (if detail URL available)
    if inst.get('detail_url'):
        record['digital_platforms'] = [{
            'platform_name': 'Swiss ISIL Directory',
            'platform_url': inst['detail_url'],
            'platform_type': 'DISCOVERY_PORTAL'
        }]

    # Provenance; extraction_date is the naive local time of this call.
    record['provenance'] = {
        'data_source': 'CSV_REGISTRY',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'extraction_date': datetime.now().isoformat(),
        'extraction_method': 'Web scraping from Swiss National Library ISIL directory (https://www.isil.nb.admin.ch)',
        'confidence_score': 0.95,
        'source_url': 'https://www.isil.nb.admin.ch/en/',
        'notes': f"Scraped from Swiss ISIL directory. Swiss categories: {'; '.join(categories)}"
    }

    return record
|
|
|
|
def convert_all(json_file: Path, yaml_file: Path):
    """Convert every institution in a JSON file and export the results.

    Reads the institutions from ``json_file``, converts each one with
    ``convert_to_linkml``, prints a per-type distribution, then writes the
    records to ``yaml_file`` and, wrapped in an ``@context``/``@graph``
    envelope, to a sibling file with the ``.jsonld`` suffix.

    Args:
        json_file: Path of the UTF-8 JSON input.
        yaml_file: Path of the YAML output; the JSON-LD path is derived
            from it by swapping the suffix.
    """
    print(f"Loading data from {json_file}")
    with open(json_file, 'r', encoding='utf-8') as source:
        institutions = json.load(source)
    print(f"Loaded {len(institutions)} institutions")

    print("Converting to LinkML format...")
    linkml_records = [convert_to_linkml(entry) for entry in institutions]

    # Tally how many records ended up in each institution type.
    type_counts = {}
    for converted in linkml_records:
        kind = converted['institution_type']
        type_counts[kind] = type_counts.get(kind, 0) + 1

    total = len(linkml_records)
    print(f"\nConverted {total} institutions")
    print("\nInstitution type distribution:")
    ranked = sorted(type_counts.items(), key=lambda pair: pair[1], reverse=True)
    for inst_type, count in ranked:
        share = count / total * 100
        print(f" {inst_type:25s}: {count:4d} ({share:5.1f}%)")

    # YAML export; sort_keys=False keeps each record's key order as built.
    print(f"\nWriting LinkML YAML to {yaml_file}")
    yaml_options = {
        'allow_unicode': True,
        'default_flow_style': False,
        'sort_keys': False,
        'width': 100,
    }
    with open(yaml_file, 'w', encoding='utf-8') as sink:
        yaml.dump(linkml_records, sink, **yaml_options)
    print(f"✓ Successfully exported {total} institutions to LinkML YAML")

    # JSON-LD export next to the YAML file.
    jsonld_file = yaml_file.with_suffix('.jsonld')
    print(f"\nWriting JSON-LD to {jsonld_file}")
    jsonld_data = {
        '@context': 'https://w3id.org/heritage/custodian/context.jsonld',
        '@graph': linkml_records,
    }
    with open(jsonld_file, 'w', encoding='utf-8') as sink:
        json.dump(jsonld_data, sink, ensure_ascii=False, indent=2)
    print("✓ Successfully exported to JSON-LD")
|
|
|
|
def main():
    """Run the Swiss ISIL to LinkML conversion with the project's fixed paths.

    Prints an error and returns without converting when the input JSON file
    does not exist.  Otherwise makes sure the output directory exists, runs
    ``convert_all`` and prints the locations of the two output files.
    """
    # NOTE: both locations are hard-coded absolute paths.
    base_dir = Path("/Users/kempersc/apps/glam/data/isil/switzerland")
    instances_dir = Path("/Users/kempersc/apps/glam/data/instances")

    json_file = base_dir / "swiss_isil_complete_final.json"
    yaml_file = instances_dir / "switzerland_isil.yaml"

    if not json_file.exists():
        print(f"Error: JSON file not found at {json_file}")
        return

    # Create instances directory if needed.  parents=True also creates any
    # missing ancestors; without it mkdir raises FileNotFoundError when the
    # parent directory does not exist yet.
    instances_dir.mkdir(parents=True, exist_ok=True)

    convert_all(json_file, yaml_file)

    print(f"\n✓ LinkML conversion complete!")
    print(f" YAML output: {yaml_file}")
    print(f" JSON-LD output: {yaml_file.with_suffix('.jsonld')}")


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|