#!/usr/bin/env python3
"""
Data Migration Script: Generic Legal Form Enums → ISO 20275 ELF Codes

This script migrates existing heritage institution data from generic legal
form enumerations to ISO 20275 Entity Legal Forms (ELF) codes.

MIGRATION POLICY:
- Maps old generic enums to country-specific ISO 20275 codes
- Preserves original values in provenance notes
- Generates comprehensive migration report
- Handles edge cases and unknown mappings
- Validates ISO 20275 code format (^[A-Z0-9]{4}$)

ISO 20275 Reference:
- Standard: https://www.gleif.org/en/about-lei/code-lists/iso-20275-entity-legal-forms-code-list
- Data: /data/ontology/2023-09-28-elf-code-list-v1.5.csv
- Country guides: /schemas/20251121/elf_codes/{country}/README.md

Usage:
    python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml [--country NL]
    python migrate_legal_form_to_iso20275.py --input-dir data/ --output-dir migrated/ [--country NL]

Author: GLAM Ontology Project
Date: 2025-11-21
Version: 1.0.0
"""

import argparse
import csv
import re
import sys
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, TextIO, Tuple

try:
    import yaml
except ImportError:
    # PyYAML is only needed for reading/writing data files.  Deferring the
    # failure keeps the pure mapping/validation helpers importable without it;
    # migrate_yaml_file() and main() report the missing dependency explicitly.
    yaml = None


# ============================================================================
# ISO 20275 ELF CODE MAPPINGS
# ============================================================================

@dataclass
class ELFCodeMapping:
    """Represents a mapping from old enum to ISO 20275 ELF code.

    Attributes:
        old_enum: Generic legal form enum value being replaced.
        new_elf_code: Four-character ISO 20275 ELF code it maps to.
        country: Country code the mapping applies to.
        local_name: Legal form name in the local language.
        transliterated_name: Short/transliterated legal form name.
        confidence: Mapping confidence in the range 0.0-1.0; compared against
            the confidence threshold to decide between automatic migration
            and manual review.
    """
    old_enum: str
    new_elf_code: str
    country: str
    local_name: str
    transliterated_name: str
    confidence: float  # 0.0-1.0


# Generic enum → ISO 20275 mappings by country
# Source: /schemas/20251121/elf_codes/{country}/README.md
MAPPINGS_NETHERLANDS = {
    "STICHTING": ELFCodeMapping(
        "STICHTING", "V44D", "NL", "Stichting", "Stichting", 1.0),
    "ASSOCIATION": ELFCodeMapping(
        "ASSOCIATION", "33MN", "NL",
        "Vereniging met volledige rechtsbevoegdheid", "Vereniging", 0.9),
    "NGO": ELFCodeMapping(
        "NGO", "33MN", "NL",
        "Vereniging met volledige rechtsbevoegdheid", "Vereniging", 0.7),  # Ambiguous
    "GOVERNMENT_AGENCY": ELFCodeMapping(
        "GOVERNMENT_AGENCY", "A0W7", "NL",
        "Publiekrechtelijke rechtspersoon", "Public entity", 0.95),
    "LIMITED_COMPANY": ELFCodeMapping(
        "LIMITED_COMPANY", "54M6", "NL",
        "Besloten vennootschap met beperkte aansprakelijkheid", "BV", 0.85),
    "COOPERATIVE": ELFCodeMapping(
        "COOPERATIVE", "NFFH", "NL", "Coöperatie", "Coöperatie", 1.0),
    "TRUST": ELFCodeMapping(
        "TRUST", "V44D", "NL", "Stichting", "Stichting", 0.6),  # Trust ≈ stichting in NL context
}

MAPPINGS_FRANCE = {
    "STICHTING": ELFCodeMapping(
        "STICHTING", "9T5S", "FR", "Fondation", "Fondation", 0.8),  # Foundation equivalent
    "ASSOCIATION": ELFCodeMapping(
        "ASSOCIATION", "BEWI", "FR", "Association déclarée", "Association", 1.0),
    "NGO": ELFCodeMapping(
        "NGO", "BEWI", "FR", "Association déclarée", "Association", 0.9),
    "GOVERNMENT_AGENCY": ELFCodeMapping(
        "GOVERNMENT_AGENCY", "5RDO", "FR",
        "Établissement public", "Établissement public", 1.0),
    "LIMITED_COMPANY": ELFCodeMapping(
        "LIMITED_COMPANY", "KMPN", "FR",
        "Société à responsabilité limitée", "SARL", 0.9),
    "COOPERATIVE": ELFCodeMapping(
        "COOPERATIVE", "6HB6", "FR", "Société coopérative", "SCOP", 1.0),
    "TRUST": ELFCodeMapping(
        "TRUST", "9T5S", "FR", "Fondation", "Fondation", 0.7),
}

MAPPINGS_GERMANY = {
    "STICHTING": ELFCodeMapping(
        "STICHTING", "V2YH", "DE", "Stiftung", "Stiftung", 1.0),
    "ASSOCIATION": ELFCodeMapping(
        "ASSOCIATION", "QZ3L", "DE", "Eingetragener Verein", "e.V.", 1.0),
    "NGO": ELFCodeMapping(
        "NGO", "QZ3L", "DE", "Eingetragener Verein", "e.V.", 0.9),
    "GOVERNMENT_AGENCY": ELFCodeMapping(
        "GOVERNMENT_AGENCY", "SQKS", "DE",
        "Körperschaft des öffentlichen Rechts", "KdöR", 1.0),
    "LIMITED_COMPANY": ELFCodeMapping(
        "LIMITED_COMPANY", "XLWA", "DE",
        "Gesellschaft mit beschränkter Haftung", "GmbH", 0.9),
    "COOPERATIVE": ELFCodeMapping(
        "COOPERATIVE", "XAEA", "DE", "Eingetragene Genossenschaft", "eG", 1.0),
    "TRUST": ELFCodeMapping(
        "TRUST", "V2YH", "DE", "Stiftung", "Stiftung", 1.0),
}

MAPPINGS_UK = {
    "STICHTING": ELFCodeMapping(
        "STICHTING", "FC0R", "GB", "Trust", "Trust", 0.8),
    "ASSOCIATION": ELFCodeMapping(
        "ASSOCIATION", "9HLU", "GB", "Charity", "Charity", 0.9),
    "NGO": ELFCodeMapping(
        "NGO", "9HLU", "GB", "Charity", "Charity", 0.95),
    "GOVERNMENT_AGENCY": ELFCodeMapping(
        "GOVERNMENT_AGENCY", "AVYY", "GB",
        "Public corporation", "Public corporation", 1.0),
    "LIMITED_COMPANY": ELFCodeMapping(
        "LIMITED_COMPANY", "CBL2", "GB",
        "Private company limited by shares", "Ltd", 0.9),
    "COOPERATIVE": ELFCodeMapping(
        "COOPERATIVE", "83XL", "GB", "Co-operative Society", "Co-op", 1.0),
    "TRUST": ELFCodeMapping(
        "TRUST", "FC0R", "GB", "Trust", "Trust", 1.0),
}

MAPPINGS_USA = {
    "STICHTING": ELFCodeMapping(
        "STICHTING", "QQQ0", "US",
        "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.8),
    "ASSOCIATION": ELFCodeMapping(
        "ASSOCIATION", "QQQ0", "US",
        "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.9),
    "NGO": ELFCodeMapping(
        "NGO", "QQQ0", "US",
        "501(c)(3) Nonprofit Organization", "501(c)(3)", 0.95),
    "GOVERNMENT_AGENCY": ELFCodeMapping(
        "GOVERNMENT_AGENCY", "W2ES", "US",
        "Government Entity", "Government Entity", 1.0),
    "LIMITED_COMPANY": ELFCodeMapping(
        "LIMITED_COMPANY", "CNQ3", "US", "Business Corporation", "Corp", 0.8),
    "COOPERATIVE": ELFCodeMapping(
        "COOPERATIVE", "S63E", "US", "Cooperative", "Cooperative", 1.0),
    "TRUST": ELFCodeMapping(
        "TRUST", "7TPC", "US", "Trust", "Trust", 1.0),
}

# Country-specific mapping tables
COUNTRY_MAPPINGS = {
    "NL": MAPPINGS_NETHERLANDS,
    "FR": MAPPINGS_FRANCE,
    "DE": MAPPINGS_GERMANY,
    "GB": MAPPINGS_UK,
    "UK": MAPPINGS_UK,  # Alias
    "US": MAPPINGS_USA,
}

# Default fallback mappings (when country unknown)
DEFAULT_MAPPINGS = {
    "STICHTING": "V44D",  # Dutch stichting (most common)
    "ASSOCIATION": "BEWI",  # French association (generic)
    "NGO": "QQQ0",  # US 501(c)(3) (generic nonprofit)
    "GOVERNMENT_AGENCY": "5RDO",  # French établissement public (generic)
    "LIMITED_COMPANY": "54M6",  # Dutch BV (generic)
    "COOPERATIVE": "NFFH",  # Dutch coöperatie (generic)
    "TRUST": "7TPC",  # Australian/US trust (generic)
    "OTHER": "9999",  # ISO 20275 code for entities with no separate legal form
}

# Four uppercase alphanumeric characters.  Always used with fullmatch():
# re.match() with a trailing "$" would also accept a value ending in "\n".
ELF_CODE_PATTERN = re.compile(r'[A-Z0-9]{4}')


# ============================================================================
# ISO 20275 ELF CODE VALIDATION
# ============================================================================

def load_elf_codes(csv_path: Path) -> Dict[str, Dict[str, str]]:
    """
    Load ISO 20275 ELF codes from CSV file.

    Args:
        csv_path: Path to the ELF code list CSV (UTF-8, optional BOM).

    Returns:
        Dict mapping ELF code to metadata (country, country_code, local_name,
        transliterated_name, status). When a code occurs on several rows the
        last row wins. Rows without a code are skipped.

    Raises:
        KeyError: If the CSV has a header row without an 'ELF Code' column.
    """
    elf_codes: Dict[str, Dict[str, str]] = {}

    # newline='' lets the csv module handle quoted fields containing newlines.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        if reader.fieldnames is not None and 'ELF Code' not in reader.fieldnames:
            raise KeyError(f"'ELF Code' column not found in {csv_path}")

        for row in reader:
            # DictReader fills missing cells of short rows with None.
            code = (row.get('ELF Code') or '').strip()
            if not code:
                continue
            elf_codes[code] = {
                'country': row.get('Country of formation') or '',
                'country_code': row.get('Country Code (ISO 3166-1)') or '',
                'local_name': row.get('Entity Legal Form name Local name') or '',
                'transliterated_name': row.get(
                    'Entity Legal Form name Transliterated name (per ISO 01-140-10)') or '',
                'status': (row.get('ELF Status ACTV/INAC') or 'ACTV').strip(),
            }

    return elf_codes


def validate_elf_code(code: str, elf_codes: Dict[str, Dict[str, str]]) -> Tuple[bool, Optional[str]]:
    """
    Validate ISO 20275 ELF code.

    Checks, in order: the four-character format, presence in the registry,
    and that the registry does not mark the code as 'INAC'.

    Args:
        code: Candidate ELF code.
        elf_codes: Registry as returned by load_elf_codes().

    Returns:
        (is_valid, error_message); error_message is None when valid.
    """
    # Check format: 4-character alphanumeric uppercase
    if not isinstance(code, str) or not ELF_CODE_PATTERN.fullmatch(code):
        return False, f"Invalid format: '{code}' (must be 4 uppercase alphanumeric characters)"

    # Check if code exists in official registry
    if code not in elf_codes:
        return False, f"Code '{code}' not found in ISO 20275 registry"

    # Check if code is active
    if elf_codes[code].get('status') == 'INAC':
        return False, f"Code '{code}' is INACTIVE in ISO 20275 registry"

    return True, None


# ============================================================================
# MIGRATION LOGIC
# ============================================================================

@dataclass
class MigrationResult:
    """Result of migrating a single record"""
    record_id: str
    old_value: Optional[str]
    new_value: Optional[str]
    country: Optional[str]
    confidence: float
    status: str  # "migrated", "unchanged", "manual_review", "error"
    notes: str = ""


@dataclass
class MigrationReport:
    """Summary of migration operation"""
    total_records: int = 0
    migrated: int = 0
    unchanged: int = 0
    manual_review: int = 0
    errors: int = 0
    results: List[MigrationResult] = field(default_factory=list)

    def add_result(self, result: MigrationResult):
        """Record *result* and bump the counter matching its status.

        A result with an unrecognised status still counts towards
        total_records but towards no per-status counter.
        """
        self.results.append(result)
        self.total_records += 1

        if result.status == "migrated":
            self.migrated += 1
        elif result.status == "unchanged":
            self.unchanged += 1
        elif result.status == "manual_review":
            self.manual_review += 1
        elif result.status == "error":
            self.errors += 1

    def summary(self) -> str:
        """Return a human-readable summary of the counters."""
        success_rate = (self.migrated / self.total_records * 100) if self.total_records > 0 else 0
        return f"""
Migration Report
================
Total records processed: {self.total_records}
Successfully migrated: {self.migrated}
Unchanged (already ISO 20275): {self.unchanged}
Requiring manual review: {self.manual_review}
Errors: {self.errors}

Success rate: {success_rate:.1f}%
"""


def migrate_legal_form(
    record: dict,
    country_code: Optional[str],
    elf_codes: Dict[str, Dict[str, str]],
    confidence_threshold: float = 0.7
) -> MigrationResult:
    """
    Migrate a single record's legal_form from enum to ISO 20275 code.

    The record itself is not modified; the caller applies the result.

    Args:
        record: Organization record (dict)
        country_code: ISO 3166-1 country code (e.g., "NL", "FR"); surrounding
            whitespace and letter case are ignored
        elf_codes: Loaded ISO 20275 codes registry
        confidence_threshold: Minimum confidence for automatic migration

    Returns:
        MigrationResult with migration status:
        - "unchanged": no legal_form, or it is already a valid ELF code
        - "error": looks like an ELF code but fails registry validation
        - "migrated": country-specific mapping at or above the threshold
        - "manual_review": low confidence, mapped code rejected by the
          registry, default (country-independent) mapping, or unknown enum
    """
    record_id = record.get('id', 'unknown')

    # Normalise the country so that "nl" or " NL " select the NL table.
    if isinstance(country_code, str):
        country_code = country_code.strip().upper() or None

    # YAML loads an unquoted code such as 9999 as an int; compare as text.
    raw_value = record.get('legal_form')
    if isinstance(raw_value, str):
        old_value = raw_value.strip()
    elif raw_value:
        old_value = str(raw_value)
    else:
        old_value = ""

    # Skip if no legal_form field
    if not old_value:
        return MigrationResult(
            record_id=record_id,
            old_value=None,
            new_value=None,
            country=country_code,
            confidence=0.0,
            status="unchanged",
            notes="No legal_form field present"
        )

    # Skip if already ISO 20275 format
    if ELF_CODE_PATTERN.fullmatch(old_value):
        # Validate existing code
        is_valid, error = validate_elf_code(old_value, elf_codes)
        if is_valid:
            return MigrationResult(
                record_id=record_id,
                old_value=old_value,
                new_value=old_value,
                country=country_code,
                confidence=1.0,
                status="unchanged",
                notes="Already valid ISO 20275 code"
            )
        return MigrationResult(
            record_id=record_id,
            old_value=old_value,
            new_value=None,
            country=country_code,
            confidence=0.0,
            status="error",
            notes=f"Invalid ISO 20275 code: {error}"
        )

    # Attempt country-specific mapping
    mapping_table = COUNTRY_MAPPINGS.get(country_code) if isinstance(country_code, str) else None
    mapping = mapping_table.get(old_value) if mapping_table else None
    if mapping is not None:
        if mapping.confidence < confidence_threshold:
            return MigrationResult(
                record_id=record_id,
                old_value=old_value,
                new_value=mapping.new_elf_code,
                country=country_code,
                confidence=mapping.confidence,
                status="manual_review",
                notes=f"Low confidence ({mapping.confidence:.2f}) - verify: {mapping.local_name}"
            )

        # Never auto-write a code the loaded registry rejects.  An empty
        # registry means nothing can be checked, so the mapping is trusted.
        if elf_codes:
            is_valid, error = validate_elf_code(mapping.new_elf_code, elf_codes)
            if not is_valid:
                return MigrationResult(
                    record_id=record_id,
                    old_value=old_value,
                    new_value=mapping.new_elf_code,
                    country=country_code,
                    confidence=mapping.confidence,
                    status="manual_review",
                    notes=f"Mapped code failed registry validation: {error}"
                )

        return MigrationResult(
            record_id=record_id,
            old_value=old_value,
            new_value=mapping.new_elf_code,
            country=country_code,
            confidence=mapping.confidence,
            status="migrated",
            notes=f"Mapped to {mapping.local_name} ({mapping.transliterated_name})"
        )

    # Fallback: Use default mapping
    default_code = DEFAULT_MAPPINGS.get(old_value)
    if default_code is not None:
        # State the real reason the fallback was taken; the country is not
        # necessarily unknown.
        if country_code is None:
            reason = "country unknown"
        elif mapping_table is None:
            reason = f"no mapping table for country '{country_code}'"
        else:
            reason = f"no '{country_code}' mapping for '{old_value}'"
        return MigrationResult(
            record_id=record_id,
            old_value=old_value,
            new_value=default_code,
            country=country_code or "unknown",
            confidence=0.5,
            status="manual_review",
            notes=f"Default mapping used ({reason}). Verify correctness."
        )

    # Unknown enum value
    return MigrationResult(
        record_id=record_id,
        old_value=old_value,
        new_value=None,
        country=country_code,
        confidence=0.0,
        status="manual_review",
        notes=f"Unknown enum value: '{old_value}'. Manual mapping required."
    )


def _infer_country(record: dict, default_country: Optional[str]) -> Any:
    """Return the country of the record's first location, else *default_country*."""
    locations = record.get('locations')
    if isinstance(locations, list) and locations:
        first_location = locations[0]
        # An empty/None country in the data must not override the default.
        if isinstance(first_location, dict) and first_location.get('country'):
            return first_location['country']
    return default_country


def _append_provenance_note(record: dict, result: MigrationResult) -> None:
    """Append a timestamped migration note to record['provenance']['notes']."""
    provenance = record.get('provenance')
    if not isinstance(provenance, dict):
        # An empty `provenance:` key loads as None; any other scalar is kept
        # as the start of the notes text so nothing is lost.
        provenance = {} if provenance is None else {'notes': str(provenance)}
        record['provenance'] = provenance

    migration_note = (
        f"[MIGRATION {datetime.now(timezone.utc).isoformat()}] "
        f"legal_form migrated: '{result.old_value}' → '{result.new_value}' (ISO 20275). "
        f"Country: {result.country}. Confidence: {result.confidence:.2f}. {result.notes}"
    )

    existing_notes = provenance.get('notes')
    if isinstance(existing_notes, list):
        existing_notes.append(migration_note)
    else:
        prior_text = "" if existing_notes is None else str(existing_notes)
        provenance['notes'] = prior_text + "\n" + migration_note


def migrate_yaml_file(
    input_path: Path,
    output_path: Path,
    elf_codes: Dict[str, Dict[str, str]],
    default_country: Optional[str] = None,
    dry_run: bool = False,
    confidence_threshold: float = 0.7
) -> MigrationReport:
    """
    Migrate legal_form values in a YAML file.

    The output keeps the shape of the input: a single mapping stays a
    mapping and a list stays a list (including empty and one-element lists).
    Only records whose status is "migrated" are modified.

    Args:
        input_path: Input YAML file
        output_path: Output YAML file (migrated data)
        elf_codes: Loaded ISO 20275 codes registry
        default_country: Default country code if not in data
        dry_run: If True, don't write output file
        confidence_threshold: Minimum confidence for automatic migration

    Returns:
        MigrationReport summarizing changes. The report is empty when the
        top-level YAML value is neither a mapping nor a list; in that case
        nothing is written.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    if yaml is None:
        raise ImportError("PyYAML is required to read and write YAML files (pip install pyyaml)")

    report = MigrationReport()

    # Load input YAML
    with open(input_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both single record and list of records
    if isinstance(data, dict):
        records = [data]
    elif isinstance(data, list):
        records = data
    else:
        print(f"ERROR: Unexpected YAML structure in {input_path}", file=sys.stderr)
        return report

    # Migrate each record in place
    for index, record in enumerate(records):
        if not isinstance(record, dict):
            # Pass through untouched, but make the anomaly visible.
            report.add_result(MigrationResult(
                record_id=f"index {index}",
                old_value=None,
                new_value=None,
                country=default_country,
                confidence=0.0,
                status="error",
                notes=f"Record is not a mapping (found {type(record).__name__}); left untouched"
            ))
            continue

        country_code = _infer_country(record, default_country)
        result = migrate_legal_form(record, country_code, elf_codes, confidence_threshold)
        report.add_result(result)

        if result.status == "migrated":
            record['legal_form'] = result.new_value
            _append_provenance_note(record, result)

    # Write output YAML.  Records were updated in place, so dumping `data`
    # preserves the original top-level shape.
    if not dry_run:
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
        print(f"✓ Migrated {input_path} → {output_path}")

    return report


def _write_detailed_results(stream: TextIO, results: Iterable[MigrationResult]) -> None:
    """Write the per-record section of a migration report to *stream*."""
    stream.write("\n\nDetailed Results:\n")
    stream.write("=" * 80 + "\n")
    for result in results:
        stream.write(
            f"\nRecord: {result.record_id}\n"
            f"  Status: {result.status}\n"
            f"  Old value: {result.old_value}\n"
            f"  New value: {result.new_value}\n"
            f"  Country: {result.country}\n"
            f"  Confidence: {result.confidence:.2f}\n"
            f"  Notes: {result.notes}\n"
        )


# ============================================================================
# CLI
# ============================================================================

def main():
    """Parse command-line arguments and run a single-file or batch migration.

    Exits with status 1 when PyYAML or the ELF code list is missing, or when
    at least one file of a batch could not be processed.
    """
    parser = argparse.ArgumentParser(
        description="Migrate legal_form enums to ISO 20275 ELF codes",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Migrate single file (Dutch institutions)
  python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml --country NL

  # Migrate entire directory (auto-detect country from data)
  python migrate_legal_form_to_iso20275.py --input-dir data/ --output-dir migrated/

  # Dry run (preview changes without writing)
  python migrate_legal_form_to_iso20275.py --input data.yaml --output /dev/null --dry-run

  # Generate report only
  python migrate_legal_form_to_iso20275.py --input data.yaml --output migrated.yaml --report-only
"""
    )

    parser.add_argument('--input', type=Path, help='Input YAML file')
    parser.add_argument('--output', type=Path, help='Output YAML file')
    parser.add_argument('--input-dir', type=Path, help='Input directory (batch mode)')
    parser.add_argument('--output-dir', type=Path, help='Output directory (batch mode)')
    parser.add_argument('--country', type=str,
                        help='Default country code (ISO 3166-1, e.g., NL, FR, DE, GB, US)')
    parser.add_argument('--elf-codes', type=Path,
                        default=Path('data/ontology/2023-09-28-elf-code-list-v1.5.csv'),
                        help='Path to ISO 20275 ELF codes CSV file')
    parser.add_argument('--confidence-threshold', type=float, default=0.7,
                        help='Minimum confidence for automatic migration (0.0-1.0)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Preview changes without writing files')
    parser.add_argument('--report-only', action='store_true',
                        help='Generate report only, no file output')
    parser.add_argument('--report-path', type=Path, default=Path('migration_report.txt'),
                        help='Path to save migration report')

    args = parser.parse_args()

    # Validate arguments
    if not args.input and not args.input_dir:
        parser.error("Either --input or --input-dir required")
    if args.input and not args.output:
        parser.error("--output required when using --input")
    if args.input_dir and not args.output_dir:
        parser.error("--output-dir required when using --input-dir")
    if not 0.0 <= args.confidence_threshold <= 1.0:
        parser.error("--confidence-threshold must be between 0.0 and 1.0")

    if yaml is None:
        print("ERROR: PyYAML is required (pip install pyyaml)", file=sys.stderr)
        sys.exit(1)

    # Load ISO 20275 ELF codes
    if not args.elf_codes.exists():
        print(f"ERROR: ELF codes file not found: {args.elf_codes}", file=sys.stderr)
        print("Download from: https://www.gleif.org/en/about-lei/code-lists/"
              "iso-20275-entity-legal-forms-code-list", file=sys.stderr)
        sys.exit(1)

    print(f"Loading ISO 20275 ELF codes from {args.elf_codes}...")
    elf_codes = load_elf_codes(args.elf_codes)
    print(f"✓ Loaded {len(elf_codes)} ELF codes")

    # Both flags mean "do not write migrated data".
    skip_output = args.dry_run or args.report_only

    # Single file mode
    if args.input:
        print(f"\nMigrating {args.input}...")
        report = migrate_yaml_file(
            args.input,
            args.output,
            elf_codes,
            args.country,
            skip_output,
            confidence_threshold=args.confidence_threshold
        )

        print(report.summary())

        # Save detailed report
        with open(args.report_path, 'w', encoding='utf-8') as f:
            f.write(report.summary())
            _write_detailed_results(f, report.results)

        print(f"\nDetailed report saved to {args.report_path}")

    # Batch directory mode
    elif args.input_dir:
        print(f"\nMigrating directory {args.input_dir}...")

        # Sorted for a reproducible processing and report order.
        yaml_files = sorted(
            set(args.input_dir.glob('**/*.yaml')) | set(args.input_dir.glob('**/*.yml'))
        )
        print(f"Found {len(yaml_files)} YAML files")

        combined_report = MigrationReport()
        failed_files: List[Path] = []

        for input_file in yaml_files:
            relative_path = input_file.relative_to(args.input_dir)
            output_file = args.output_dir / relative_path

            print(f"\n  {relative_path}...")
            try:
                report = migrate_yaml_file(
                    input_file,
                    output_file,
                    elf_codes,
                    args.country,
                    skip_output,
                    confidence_threshold=args.confidence_threshold
                )
            except (yaml.YAMLError, OSError) as exc:
                # One unreadable file must not abort the rest of the batch.
                print(f"ERROR: Failed to migrate {input_file}: {exc}", file=sys.stderr)
                failed_files.append(input_file)
                continue

            # Merge into combined report
            for result in report.results:
                combined_report.add_result(result)

        print("\n" + "=" * 80)
        print(combined_report.summary())

        # Save combined report
        with open(args.report_path, 'w', encoding='utf-8') as f:
            f.write("BATCH MIGRATION REPORT\n")
            f.write("=" * 80 + "\n")
            f.write(f"Input directory: {args.input_dir}\n")
            f.write(f"Output directory: {args.output_dir}\n")
            f.write(f"Files processed: {len(yaml_files)}\n")
            f.write(f"Files failed: {len(failed_files)}\n")
            for failed_file in failed_files:
                f.write(f"  {failed_file}\n")
            f.write(combined_report.summary())
            _write_detailed_results(f, combined_report.results)

        print(f"\nCombined report saved to {args.report_path}")

        if failed_files:
            sys.exit(1)


if __name__ == '__main__':
    main()