glam/scripts/migrate_legal_form_to_iso20275.py
2025-11-21 22:12:33 +01:00

559 lines
22 KiB
Python

#!/usr/bin/env python3
"""
Data Migration Script: Generic Legal Form Enums → ISO 20275 ELF Codes

This script migrates existing heritage institution data from generic legal form
enumerations to ISO 20275 Entity Legal Forms (ELF) codes.

MIGRATION POLICY:
- Maps old generic enums to country-specific ISO 20275 codes
- Preserves original values in provenance notes
- Generates comprehensive migration report
- Handles edge cases and unknown mappings
- Validates ISO 20275 code format (^[A-Z0-9]{4}$)

ISO 20275 Reference:
- Standard: https://www.gleif.org/en/about-lei/code-lists/iso-20275-entity-legal-forms-code-list
- Data: /data/ontology/2023-09-28-elf-code-list-v1.5.csv
- Country guides: /schemas/20251121/elf_codes/{country}/README.md

Usage:
    python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml [--country NL]
    python migrate_legal_form_to_iso20275.py --input-dir data/ --output-dir migrated/ [--country NL]

Author: GLAM Ontology Project
Date: 2025-11-21
Version: 1.0.0
"""
import argparse
import csv
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass, field
import yaml
# ============================================================================
# ISO 20275 ELF CODE MAPPINGS
# ============================================================================
@dataclass
class ELFCodeMapping:
    """One mapping-table entry: a legacy enum value and the ISO 20275 ELF code replacing it."""

    # Legacy generic enum value, e.g. "STICHTING"
    old_enum: str
    # ISO 20275 ELF code the enum is migrated to
    new_elf_code: str
    # Country code of the table this entry belongs to, e.g. "NL"
    country: str
    # Name of the legal form as given in the country table
    local_name: str
    # Transliterated / short name, shown alongside local_name in migration notes
    transliterated_name: str
    # Confidence in the mapping, 0.0-1.0
    confidence: float
# Generic enum → ISO 20275 mappings by country
# Source: /schemas/20251121/elf_codes/{country}/README.md
def _country_table(country, rows):
    """Build an {old_enum: ELFCodeMapping} table for one country.

    Each row is (old_enum, elf_code, local_name, transliterated_name, confidence);
    the dict key is always the row's old_enum and insertion order follows the rows.
    """
    return {
        old_enum: ELFCodeMapping(old_enum, elf_code, country, local_name, short_name, confidence)
        for old_enum, elf_code, local_name, short_name, confidence in rows
    }


MAPPINGS_NETHERLANDS = _country_table("NL", [
    ("STICHTING", "V44D", "Stichting", "Stichting", 1.0),
    ("ASSOCIATION", "33MN", "Vereniging met volledige rechtsbevoegdheid", "Vereniging", 0.9),
    ("NGO", "33MN", "Vereniging met volledige rechtsbevoegdheid", "Vereniging", 0.7),  # Ambiguous
    ("GOVERNMENT_AGENCY", "A0W7", "Publiekrechtelijke rechtspersoon", "Public entity", 0.95),
    ("LIMITED_COMPANY", "54M6", "Besloten vennootschap met beperkte aansprakelijkheid", "BV", 0.85),
    ("COOPERATIVE", "NFFH", "Coöperatie", "Coöperatie", 1.0),
    ("TRUST", "V44D", "Stichting", "Stichting", 0.6),  # Trust ≈ stichting in NL context
])

MAPPINGS_FRANCE = _country_table("FR", [
    ("STICHTING", "9T5S", "Fondation", "Fondation", 0.8),  # Foundation equivalent
    ("ASSOCIATION", "BEWI", "Association déclarée", "Association", 1.0),
    ("NGO", "BEWI", "Association déclarée", "Association", 0.9),
    ("GOVERNMENT_AGENCY", "5RDO", "Établissement public", "Établissement public", 1.0),
    ("LIMITED_COMPANY", "KMPN", "Société à responsabilité limitée", "SARL", 0.9),
    ("COOPERATIVE", "6HB6", "Société coopérative", "SCOP", 1.0),
    ("TRUST", "9T5S", "Fondation", "Fondation", 0.7),
])

MAPPINGS_GERMANY = _country_table("DE", [
    ("STICHTING", "V2YH", "Stiftung", "Stiftung", 1.0),
    ("ASSOCIATION", "QZ3L", "Eingetragener Verein", "e.V.", 1.0),
    ("NGO", "QZ3L", "Eingetragener Verein", "e.V.", 0.9),
    ("GOVERNMENT_AGENCY", "SQKS", "Körperschaft des öffentlichen Rechts", "KdöR", 1.0),
    ("LIMITED_COMPANY", "XLWA", "Gesellschaft mit beschränkter Haftung", "GmbH", 0.9),
    ("COOPERATIVE", "XAEA", "Eingetragene Genossenschaft", "eG", 1.0),
    ("TRUST", "V2YH", "Stiftung", "Stiftung", 1.0),
])

MAPPINGS_UK = _country_table("GB", [
    ("STICHTING", "FC0R", "Trust", "Trust", 0.8),
    ("ASSOCIATION", "9HLU", "Charity", "Charity", 0.9),
    ("NGO", "9HLU", "Charity", "Charity", 0.95),
    ("GOVERNMENT_AGENCY", "AVYY", "Public corporation", "Public corporation", 1.0),
    ("LIMITED_COMPANY", "CBL2", "Private company limited by shares", "Ltd", 0.9),
    ("COOPERATIVE", "83XL", "Co-operative Society", "Co-op", 1.0),
    ("TRUST", "FC0R", "Trust", "Trust", 1.0),
])

MAPPINGS_USA = _country_table("US", [
    ("STICHTING", "QQQ0", "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.8),
    ("ASSOCIATION", "QQQ0", "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.9),
    ("NGO", "QQQ0", "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.95),
    ("GOVERNMENT_AGENCY", "W2ES", "Government Entity", "Government Entity", 1.0),
    ("LIMITED_COMPANY", "CNQ3", "Business Corporation", "Corp", 0.8),
    ("COOPERATIVE", "S63E", "Cooperative", "Cooperative", 1.0),
    ("TRUST", "7TPC", "Trust", "Trust", 1.0),
])
# Country code → mapping table. "UK" is accepted as an alias and shares
# the same table object as "GB".
COUNTRY_MAPPINGS = dict(
    NL=MAPPINGS_NETHERLANDS,
    FR=MAPPINGS_FRANCE,
    DE=MAPPINGS_GERMANY,
    GB=MAPPINGS_UK,
    UK=MAPPINGS_UK,  # Alias
    US=MAPPINGS_USA,
)
# Fallback enum → ELF code table, consulted when no country-specific
# mapping applies to a record.
DEFAULT_MAPPINGS = dict(
    STICHTING="V44D",  # Dutch stichting (most common)
    ASSOCIATION="BEWI",  # French association (generic)
    NGO="QQQ0",  # US 501(c)(3) (generic nonprofit)
    GOVERNMENT_AGENCY="5RDO",  # French établissement public (generic)
    LIMITED_COMPANY="54M6",  # Dutch BV (generic)
    COOPERATIVE="NFFH",  # Dutch coöperatie (generic)
    TRUST="7TPC",  # Australian/US trust (generic)
    OTHER="9999",  # ISO 20275 code for entities with no separate legal form
)
# ============================================================================
# ISO 20275 ELF CODE VALIDATION
# ============================================================================
def load_elf_codes(csv_path: Path) -> Dict[str, Dict[str, str]]:
    """
    Load ISO 20275 ELF codes from CSV file.

    Args:
        csv_path: Path to the ELF code list CSV. Read as UTF-8; a leading
            byte-order mark is tolerated.

    Returns:
        Dict mapping ELF code to metadata (country, country_code, local_name,
        transliterated_name, status). Rows whose ELF code is blank are skipped.
        Missing metadata cells become '' and a missing/blank status becomes 'ACTV'.

    Raises:
        KeyError: If the CSV has data rows but no 'ELF Code' column.
    """
    def cell(row: dict, column: str, default: str = '') -> str:
        # DictReader fills cells of short rows with None; never leak that out.
        value = row.get(column)
        return default if value is None else value

    elf_codes: Dict[str, Dict[str, str]] = {}
    # newline='' lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        for row in csv.DictReader(f):
            code = (row['ELF Code'] or '').strip()
            if not code:
                continue
            elf_codes[code] = {
                'country': cell(row, 'Country of formation'),
                'country_code': cell(row, 'Country Code (ISO 3166-1)'),
                'local_name': cell(row, 'Entity Legal Form name Local name'),
                'transliterated_name': cell(
                    row, 'Entity Legal Form name Transliterated name (per ISO 01-140-10)'
                ),
                # Trimmed so that validation's comparison against 'INAC' is
                # not defeated by stray whitespace in the file.
                'status': cell(row, 'ELF Status ACTV/INAC', 'ACTV').strip() or 'ACTV',
            }
    return elf_codes
def validate_elf_code(code: str, elf_codes: Dict[str, Dict[str, str]]) -> Tuple[bool, Optional[str]]:
    """
    Validate ISO 20275 ELF code.

    Args:
        code: Candidate ELF code. Non-string values are rejected as a format
            error rather than raising.
        elf_codes: Registry mapping ELF code to a metadata dict; the optional
            'status' entry marks inactive codes with 'INAC'.

    Returns:
        (is_valid, error_message) - error_message is None when the code is valid.
    """
    # Check format: exactly 4 uppercase alphanumeric characters. fullmatch is
    # used because '$' in re.match would also accept a trailing newline.
    if not isinstance(code, str) or not re.fullmatch(r'[A-Z0-9]{4}', code):
        return False, f"Invalid format: '{code}' (must be 4 uppercase alphanumeric characters)"

    # Check if code exists in official registry
    entry = elf_codes.get(code)
    if entry is None:
        return False, f"Code '{code}' not found in ISO 20275 registry"

    # Check if code is active (tolerate a missing, padded or lower-case status)
    status = (entry.get('status') or '').strip().upper()
    if status == 'INAC':
        return False, f"Code '{code}' is INACTIVE in ISO 20275 registry"

    return True, None
# ============================================================================
# MIGRATION LOGIC
# ============================================================================
@dataclass
class MigrationResult:
    """Outcome of attempting to migrate the legal_form of one record."""

    # Identifier of the record the result refers to
    record_id: str
    # legal_form value before migration (None when the record had none)
    old_value: Optional[str]
    # Proposed or applied ISO 20275 code (None when no code could be chosen)
    new_value: Optional[str]
    # Country code used for the mapping decision
    country: Optional[str]
    # Confidence of the mapping, 0.0-1.0
    confidence: float
    # One of: "migrated", "unchanged", "manual_review", "error"
    status: str
    # Free-text explanation of the decision
    notes: str = ""
@dataclass
class MigrationReport:
    """Running tally of migration outcomes plus the individual results."""

    total_records: int = 0
    migrated: int = 0
    unchanged: int = 0
    manual_review: int = 0
    errors: int = 0
    results: List[MigrationResult] = field(default_factory=list)

    def add_result(self, result: MigrationResult):
        """Record one result and bump the counter matching its status.

        A status outside the four known values only increases total_records.
        """
        self.results.append(result)
        self.total_records += 1

        # status value → name of the counter attribute it increments
        counter_by_status = {
            "migrated": "migrated",
            "unchanged": "unchanged",
            "manual_review": "manual_review",
            "error": "errors",
        }
        counter = counter_by_status.get(result.status)
        if counter is not None:
            setattr(self, counter, getattr(self, counter) + 1)

    def summary(self) -> str:
        """Return the multi-line, human-readable summary of the counters."""
        if self.total_records > 0:
            success_rate = self.migrated / self.total_records * 100
        else:
            success_rate = 0
        return f"""
Migration Report
================
Total records processed: {self.total_records}
Successfully migrated: {self.migrated}
Unchanged (already ISO 20275): {self.unchanged}
Requiring manual review: {self.manual_review}
Errors: {self.errors}
Success rate: {success_rate:.1f}%
"""
def migrate_legal_form(
    record: dict,
    country_code: Optional[str],
    elf_codes: Dict[str, Dict[str, str]],
    confidence_threshold: float = 0.7
) -> MigrationResult:
    """
    Migrate a single record's legal_form from enum to ISO 20275 code.

    The record itself is not modified; the caller applies the result.

    Decision order:
      1. No (or empty) legal_form          -> "unchanged"
      2. Value already looks like an ELF code -> "unchanged" if it validates
         against the registry, otherwise "error"
      3. Country-specific mapping found    -> "migrated" when its confidence
         reaches the threshold, otherwise "manual_review"
      4. Default mapping found             -> "manual_review" (confidence 0.5)
      5. Anything else                     -> "manual_review", no new value

    Args:
        record: Organization record (dict)
        country_code: ISO 3166-1 country code (e.g., "NL", "FR"); matched
            case-insensitively and ignoring surrounding whitespace
        elf_codes: Loaded ISO 20275 codes registry
        confidence_threshold: Minimum confidence for automatic migration

    Returns:
        MigrationResult with migration status
    """
    record_id = record.get('id', 'unknown')
    raw_value = record.get('legal_form')

    # "nl" / " NL " must select the same table as "NL".
    country = country_code.strip().upper() if isinstance(country_code, str) else country_code

    def outcome(old, new, confidence, status, notes, reported_country=country):
        return MigrationResult(
            record_id=record_id,
            old_value=old,
            new_value=new,
            country=reported_country,
            confidence=confidence,
            status=status,
            notes=notes,
        )

    # Skip if no legal_form field
    if not raw_value:
        return outcome(None, None, 0.0, "unchanged", "No legal_form field present")

    # YAML parses an unquoted all-digit code such as 9999 as an int; work on
    # its text form so the regex and table lookups below cannot raise.
    old_value = raw_value if isinstance(raw_value, str) else str(raw_value)

    # Skip if already ISO 20275 format (fullmatch: '$' would accept a trailing newline)
    if re.fullmatch(r'[A-Z0-9]{4}', old_value):
        is_valid, error = validate_elf_code(old_value, elf_codes)
        if is_valid:
            return outcome(old_value, old_value, 1.0, "unchanged", "Already valid ISO 20275 code")
        return outcome(old_value, None, 0.0, "error", f"Invalid ISO 20275 code: {error}")

    # Attempt country-specific mapping
    mapping = COUNTRY_MAPPINGS.get(country, {}).get(old_value) if country else None
    if mapping is not None:
        if mapping.confidence >= confidence_threshold:
            return outcome(
                old_value, mapping.new_elf_code, mapping.confidence, "migrated",
                f"Mapped to {mapping.local_name} ({mapping.transliterated_name})",
            )
        return outcome(
            old_value, mapping.new_elf_code, mapping.confidence, "manual_review",
            f"Low confidence ({mapping.confidence:.2f}) - verify: {mapping.local_name}",
        )

    # Fallback: Use default mapping
    if old_value in DEFAULT_MAPPINGS:
        if country:
            # The country is known but has no table entry for this value;
            # do not claim the country is unknown.
            reason = f"no {country}-specific mapping for '{old_value}'"
        else:
            reason = "country unknown"
        return outcome(
            old_value, DEFAULT_MAPPINGS[old_value], 0.5, "manual_review",
            f"Default mapping used ({reason}). Verify correctness.",
            reported_country=country or "unknown",
        )

    # Unknown enum value
    return outcome(
        old_value, None, 0.0, "manual_review",
        f"Unknown enum value: '{old_value}'. Manual mapping required.",
    )
def _infer_country(record: dict, default_country: Optional[str]) -> Optional[str]:
    """Return the country of the record's first location, else default_country."""
    locations = record.get('locations')
    if isinstance(locations, list) and locations:
        first_location = locations[0]
        # Only a mapping with a non-empty country overrides the default.
        if isinstance(first_location, dict) and first_location.get('country'):
            return first_location['country']
    return default_country


def _append_provenance_note(record: dict, note: str) -> None:
    """Append note to record['provenance']['notes'], creating either level if absent."""
    provenance = record.get('provenance')
    if not isinstance(provenance, dict):
        # `provenance:` with no value loads as None. A scalar value is kept
        # as the start of the notes text instead of being discarded.
        provenance = {} if provenance is None else {'notes': str(provenance)}
        record['provenance'] = provenance

    existing = provenance.get('notes')
    if isinstance(existing, list):
        # `list += str` would splice in single characters; add one entry instead.
        existing.append(note.strip())
    else:
        provenance['notes'] = ('' if existing is None else str(existing)) + note


def migrate_yaml_file(
    input_path: Path,
    output_path: Path,
    elf_codes: Dict[str, Dict[str, str]],
    default_country: Optional[str] = None,
    dry_run: bool = False,
    confidence_threshold: float = 0.7
) -> MigrationReport:
    """
    Migrate legal_form values in a YAML file.

    The output keeps the shape of the input: a single mapping stays a mapping
    and a list stays a list (including empty and one-element lists). Only
    records whose result status is "migrated" are modified.

    Args:
        input_path: Input YAML file
        output_path: Output YAML file (migrated data)
        elf_codes: Loaded ISO 20275 codes registry
        default_country: Default country code if not in data
        dry_run: If True, don't write output file
        confidence_threshold: Minimum confidence for automatic migration

    Returns:
        MigrationReport summarizing changes (empty if the YAML root is neither
        a mapping nor a list)
    """
    report = MigrationReport()

    # Load input YAML
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both single record and list of records
    if isinstance(data, dict):
        records = [data]
    elif isinstance(data, list):
        records = data
    else:
        print(f"ERROR: Unexpected YAML structure in {input_path}", file=sys.stderr)
        return report

    # Migrate each record in place, so dumping `data` preserves the input shape
    for index, record in enumerate(records):
        if not isinstance(record, dict):
            report.add_result(MigrationResult(
                record_id=f"index {index}",
                old_value=None,
                new_value=None,
                country=default_country,
                confidence=0.0,
                status="error",
                notes=f"Record is not a mapping (found {type(record).__name__}); left untouched",
            ))
            continue

        country_code = _infer_country(record, default_country)
        result = migrate_legal_form(record, country_code, elf_codes, confidence_threshold)
        report.add_result(result)

        if result.status == "migrated":
            record['legal_form'] = result.new_value
            # Keep the original value traceable in the provenance notes
            migration_note = (
                f"\n[MIGRATION {datetime.now(timezone.utc).isoformat()}] "
                f"legal_form migrated: '{result.old_value}' → '{result.new_value}' (ISO 20275). "
                f"Country: {result.country}. Confidence: {result.confidence:.2f}. {result.notes}"
            )
            _append_provenance_note(record, migration_note)

    # Write output YAML
    if not dry_run:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✓ Migrated {input_path} → {output_path}")

    return report


# ============================================================================
# CLI
# ============================================================================
def main():
    """Command-line entry point: parse arguments, run the migration, write the report."""
    parser = argparse.ArgumentParser(
        description="Migrate legal_form enums to ISO 20275 ELF codes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Migrate single file (Dutch institutions)
python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml --country NL
# Migrate entire directory (auto-detect country from data)
python migrate_legal_form_to_iso20275.py --input-dir data/ --output-dir migrated/
# Dry run (preview changes without writing)
python migrate_legal_form_to_iso20275.py --input data.yaml --output /dev/null --dry-run
# Generate report only
python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml --report-only
"""
    )
    parser.add_argument('--input', type=Path, help='Input YAML file')
    parser.add_argument('--output', type=Path, help='Output YAML file')
    parser.add_argument('--input-dir', type=Path, help='Input directory (batch mode)')
    parser.add_argument('--output-dir', type=Path, help='Output directory (batch mode)')
    parser.add_argument('--country', type=str, help='Default country code (ISO 3166-1, e.g., NL, FR, DE, GB, US)')
    parser.add_argument('--elf-codes', type=Path,
                        default=Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv'),
                        help='Path to ISO 20275 ELF codes CSV file')
    parser.add_argument('--confidence-threshold', type=float, default=0.7,
                        help='Minimum confidence for automatic migration (0.0-1.0)')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without writing files')
    parser.add_argument('--report-only', action='store_true', help='Generate report only, no file output')
    parser.add_argument('--report-path', type=Path, default=Path('migration_report.txt'),
                        help='Path to save migration report')
    args = parser.parse_args()

    # Validate arguments
    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")
    if args.input and not args.output:
        parser.error("--output required when using --input")
    if args.input_dir and not args.output_dir:
        parser.error("--output-dir required when using --input-dir")
    if not 0.0 <= args.confidence_threshold <= 1.0:
        parser.error("--confidence-threshold must be between 0.0 and 1.0")

    # Load ISO 20275 ELF codes
    if not args.elf_codes.exists():
        print(f"ERROR: ELF codes file not found: {args.elf_codes}", file=sys.stderr)
        print("Download from: https://www.gleif.org/en/about-lei/code-lists/iso-20275-entity-legal-forms-code-list",
              file=sys.stderr)
        sys.exit(1)
    print(f"Loading ISO 20275 ELF codes from {args.elf_codes}...")
    elf_codes = load_elf_codes(args.elf_codes)
    print(f"✓ Loaded {len(elf_codes)} ELF codes")

    # --report-only behaves like --dry-run: migrated data is never written
    skip_write = args.dry_run or args.report_only

    # Single file mode
    if args.input:
        print(f"\nMigrating {args.input}...")
        report = migrate_yaml_file(
            args.input,
            args.output,
            elf_codes,
            args.country,
            skip_write,
            confidence_threshold=args.confidence_threshold,
        )
        print(report.summary())

        # Save detailed report
        with open(args.report_path, 'w', encoding='utf-8') as f:
            f.write(report.summary())
            f.write("\n\nDetailed Results:\n")
            f.write("="*80 + "\n")
            for result in report.results:
                f.write(f"\nRecord: {result.record_id}\n")
                f.write(f" Status: {result.status}\n")
                f.write(f" Old value: {result.old_value}\n")
                f.write(f" New value: {result.new_value}\n")
                f.write(f" Country: {result.country}\n")
                f.write(f" Confidence: {result.confidence:.2f}\n")
                f.write(f" Notes: {result.notes}\n")
        print(f"\nDetailed report saved to {args.report_path}")

    # Batch directory mode
    elif args.input_dir:
        print(f"\nMigrating directory {args.input_dir}...")
        # Sorted so that runs process files (and report results) in a stable order
        yaml_files = sorted(
            list(args.input_dir.glob('**/*.yaml')) + list(args.input_dir.glob('**/*.yml'))
        )
        print(f"Found {len(yaml_files)} YAML files")

        combined_report = MigrationReport()
        for input_file in yaml_files:
            relative_path = input_file.relative_to(args.input_dir)
            output_file = args.output_dir / relative_path
            print(f"\n {relative_path}...")
            report = migrate_yaml_file(
                input_file,
                output_file,
                elf_codes,
                args.country,
                skip_write,
                confidence_threshold=args.confidence_threshold,
            )
            # Merge into combined report
            for result in report.results:
                combined_report.add_result(result)

        print("\n" + "="*80)
        print(combined_report.summary())

        # Save combined report
        with open(args.report_path, 'w', encoding='utf-8') as f:
            f.write("BATCH MIGRATION REPORT\n")
            f.write("="*80 + "\n")
            f.write(f"Input directory: {args.input_dir}\n")
            f.write(f"Output directory: {args.output_dir}\n")
            f.write(f"Files processed: {len(yaml_files)}\n")
            f.write(combined_report.summary())
        print(f"\nCombined report saved to {args.report_path}")


if __name__ == '__main__':
    main()