glam/scripts/parse_iso20275_codes.py
kempersc fa5680f0dd Add initial versions of custodian hub UML diagrams in Mermaid and PlantUML formats
- Introduced custodian_hub_v3.mmd, custodian_hub_v4_final.mmd, and custodian_hub_v5_FINAL.mmd for Mermaid representation.
- Created custodian_hub_FINAL.puml and custodian_hub_v3.puml for PlantUML representation.
- Defined entities such as CustodianReconstruction, Identifier, TimeSpan, Agent, CustodianName, CustodianObservation, ReconstructionActivity, Appellation, ConfidenceMeasure, Custodian, LanguageCode, and SourceDocument.
- Established relationships and associations between entities, including temporal extents, observations, and reconstruction activities.
- Incorporated enumerations for various types, statuses, and classifications relevant to custodians and their activities.
2025-11-22 14:33:51 +01:00

159 lines
6.2 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Parse ISO 20275 Entity Legal Form codes from CSV and generate LinkML mappings.
"""
import csv
import yaml
from pathlib import Path
from typing import Dict, List, Any
# Only these characters are accepted as column separators when sniffing.
# Left unrestricted, csv.Sniffer may settle on a letter or digit that happens
# to occur equally often on every sampled line.
_CANDIDATE_DELIMITERS = ',;\t|'

# Number of characters read from the start of the file for delimiter sniffing.
_SNIFF_SAMPLE_SIZE = 4096

# Status values (compared case-insensitively) that mark a legal form as active.
# NOTE(review): 'ACTV' is the code believed to be used by the published GLEIF
# list ("ACTV"/"INAC") - confirm against the CSV actually shipped in the repo.
_ACTIVE_STATUSES = frozenset({'active', 'actv'})


def _detect_delimiter(sample: str) -> str:
    """Return the column delimiter guessed from *sample*, defaulting to a comma."""
    try:
        return csv.Sniffer().sniff(sample, delimiters=_CANDIDATE_DELIMITERS).delimiter
    except csv.Error:
        # Raised when no candidate delimiter is found (e.g. a single-column file).
        return ','


def _first_value(row: Dict[str, Any], *column_names: str) -> str:
    """Return the first non-empty, stripped cell found under *column_names*."""
    for column_name in column_names:
        value = row.get(column_name)
        # Short rows yield None for missing cells; skip those and blank cells.
        if isinstance(value, str) and value.strip():
            return value.strip()
    return ''


def parse_iso20275_csv(csv_path: Path) -> List[Dict[str, Any]]:
    """
    Parse the ISO 20275 ELF code CSV file.

    The delimiter is auto-detected among comma, semicolon, tab and pipe, with
    a comma used when detection fails. Column headers may vary, so each value
    is looked up under several alternative header names; the first non-empty
    match wins:

    - ELF code: 'ELF Code', 'Entity Legal Form Code', 'Code'
    - Country code: 'Country Code', 'Country', 'ISO Country Code'
    - Local name: 'Entity Legal Form name Local', 'Legal Form Name', 'Name Local'
    - Transliterated name: 'Entity Legal Form name Transliterated',
      'Name Transliterated'
    - Abbreviation: 'Abbreviation Local', 'Abbreviation'
    - Status: 'ELF Status', 'Status' (treated as active when absent or empty)

    NOTE(review): the longer aliases used below ('Country Code (ISO 3166-1)',
    'ELF Status ACTV/INAC', ...) follow the header wording believed to be used
    by the published GLEIF code list; they are only consulted as fallbacks and
    should be confirmed against the real file.

    Args:
        csv_path: Path to the CSV file (UTF-8, optionally with a BOM).

    Returns:
        One dict per active row whose ELF code is exactly 4 characters long,
        with keys 'elf_code' and 'country_code' (upper-cased), 'local_name',
        'transliterated_name' (None when empty or identical to the local
        name) and 'abbreviation' (None when empty).
    """
    legal_forms = []

    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        sample = f.read(_SNIFF_SAMPLE_SIZE)
        f.seek(0)
        if len(sample) == _SNIFF_SAMPLE_SIZE:
            # The sample may stop mid-row; a truncated final row would have a
            # different delimiter count from the others, so drop it.
            complete_rows, newline, _ = sample.rpartition('\n')
            if newline:
                sample = complete_rows + newline

        reader = csv.DictReader(f, delimiter=_detect_delimiter(sample))
        for row in reader:
            elf_code = _first_value(
                row, 'ELF Code', 'Entity Legal Form Code', 'Code')
            status = _first_value(
                row, 'ELF Status', 'Status', 'ELF Status ACTV/INAC') or 'Active'
            if len(elf_code) != 4 or status.lower() not in _ACTIVE_STATUSES:
                continue

            country_code = _first_value(
                row, 'Country Code', 'Country', 'ISO Country Code',
                'Country Code (ISO 3166-1)')
            local_name = _first_value(
                row, 'Entity Legal Form name Local', 'Legal Form Name',
                'Name Local', 'Entity Legal Form name Local name')
            transliterated = _first_value(
                row, 'Entity Legal Form name Transliterated',
                'Name Transliterated',
                'Entity Legal Form name Transliterated name (per ISO 01-140-10)')
            abbreviation = _first_value(
                row, 'Abbreviation Local', 'Abbreviation',
                'Abbreviations Local language')

            # A transliteration is only worth keeping when it adds information.
            keep_transliteration = bool(transliterated) and transliterated != local_name

            legal_forms.append({
                'elf_code': elf_code.upper(),
                'country_code': country_code.upper(),
                'local_name': local_name,
                'transliterated_name': transliterated if keep_transliteration else None,
                'abbreviation': abbreviation or None,
            })

    return legal_forms
def generate_common_mappings(legal_forms: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Generate mappings for common heritage institution legal forms.

    Only legal forms whose ELF code appears in the built-in table of forms
    commonly used by museums, archives, and libraries are kept; every other
    entry of *legal_forms* is ignored.

    Args:
        legal_forms: Dicts with at least 'elf_code', 'country_code' and
            'local_name'; 'transliterated_name' and 'abbreviation' are
            optional and copied only when non-empty.

    Returns:
        A dict with the mapping document metadata ('id', 'name', 'title',
        'description') and a 'mappings' list holding one entry per matching
        legal form, in input order.
    """
    # Common legal forms for heritage institutions, keyed by ELF code.
    heritage_forms = {
        # Netherlands
        '8888': {'ontology': ['org:FormalOrganization', 'schema:NonProfitOrganization'], 'type': 'foundation'},
        '54M6': {'ontology': ['org:FormalOrganization', 'schema:Corporation'], 'type': 'private_company'},
        # Germany
        'QS1L': {'ontology': ['org:FormalOrganization', 'schema:NonProfitOrganization'], 'type': 'foundation'},
        'HRA1': {'ontology': ['org:FormalOrganization', 'schema:Corporation'], 'type': 'gmbh'},
        # France
        'L6L1': {'ontology': ['org:FormalOrganization', 'schema:NonProfitOrganization'], 'type': 'association'},
        # UK
        'PRIV': {'ontology': ['org:FormalOrganization', 'schema:Corporation'], 'type': 'private_limited'},
        'CHAR': {'ontology': ['org:FormalOrganization', 'schema:NonProfitOrganization'], 'type': 'charity'},
        # US
        '501C': {'ontology': ['org:FormalOrganization', 'schema:NonProfitOrganization'], 'type': 'nonprofit'},
    }

    mappings = []
    for form in legal_forms:
        profile = heritage_forms.get(form['elf_code'])
        if profile is None:
            continue

        mapping = {
            'elf_code': form['elf_code'],
            'country_code': form['country_code'],
            'legal_form_name': form['local_name'],
            # Copy the list so that entries sharing an ELF code do not share
            # one list object: a shared object would let a change to one
            # mapping leak into the others, and a YAML dump would represent
            # the repeats as anchors/aliases instead of plain lists.
            'ontology_mappings': list(profile['ontology']),
            'common_type': profile['type'],
        }
        # Optional fields are emitted only when they carry a value.
        if form.get('transliterated_name'):
            mapping['transliterated_name'] = form['transliterated_name']
        if form.get('abbreviation'):
            mapping['abbreviation'] = form['abbreviation']
        mappings.append(mapping)

    return {
        'id': 'https://nde.nl/ontology/hc/mapping/ISO20275_common',
        'name': 'ISO20275_common_mappings',
        'title': 'Common ISO 20275 Legal Forms for Heritage Institutions',
        'description': 'Frequently used legal forms for museums, archives, and libraries',
        'mappings': mappings,
    }
def main() -> None:
    """
    Parse ISO 20275 codes and generate mappings.

    Reads the ELF code list CSV, prints how many active legal forms were
    found and the ten countries with the most forms, then writes the common
    heritage-institution mappings as YAML. Both paths are relative to the
    current working directory. If the CSV is missing, an error is printed
    and nothing is written.
    """
    csv_path = Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv')
    if not csv_path.exists():
        print(f"Error: CSV file not found at {csv_path}")
        return

    print(f"Parsing ISO 20275 codes from {csv_path}")
    legal_forms = parse_iso20275_csv(csv_path)
    print(f"Found {len(legal_forms)} active legal form codes")

    # Count by country, ignoring rows without a country code.
    by_country = {}
    for form in legal_forms:
        country = form['country_code']
        if country:
            by_country[country] = by_country.get(country, 0) + 1

    print("\nTop 10 countries by number of legal forms:")
    ranked = sorted(by_country.items(), key=lambda item: item[1], reverse=True)
    for country, count in ranked[:10]:
        print(f" {country}: {count} forms")

    # Generate common mappings
    mappings = generate_common_mappings(legal_forms)
    output_path = Path('schemas/20251121/linkml/modules/mappings/ISO20275_common.yaml')
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # allow_unicode=True writes non-ASCII names verbatim, so the file must be
    # opened as UTF-8 explicitly instead of with the platform default encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(mappings, f, default_flow_style=False, allow_unicode=True)

    print(f"\nGenerated common mappings: {output_path}")
    print(f"Mapped {len(mappings['mappings'])} common heritage institution legal forms")


if __name__ == '__main__':
    main()