NOTE: the line structure of this patch was restored from a whitespace-collapsed copy.
Line breaks and blank lines agree with the hunk headers; leading indentation and
intra-line spacing are reconstructed, so `git apply --ignore-whitespace` may be needed.

diff --git a/scripts/fix_ghcid_location_mismatches.py b/scripts/fix_ghcid_location_mismatches.py
index c7e5c245c0..e92009da08 100644
--- a/scripts/fix_ghcid_location_mismatches.py
+++ b/scripts/fix_ghcid_location_mismatches.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-Fix GHCID location mismatches for Type I (Intangible Heritage) custodian files.
+Fix GHCID location mismatches for heritage custodian files.
 
 This script:
 1. Identifies files where GHCID location component doesn't match actual location in locations[] array
@@ -10,8 +10,14 @@ This script:
 5. Renames files to match new GHCID
 
 Usage:
-    python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview changes
-    python scripts/fix_ghcid_location_mismatches.py # Apply changes
+    python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview Type I changes
+    python scripts/fix_ghcid_location_mismatches.py --type M --dry-run # Preview Museum changes
+    python scripts/fix_ghcid_location_mismatches.py --type A # Fix Archive files
+    python scripts/fix_ghcid_location_mismatches.py --type ALL --dry-run # Preview ALL types
+
+Supported types: A (Archive), G (Gallery), H (Holy Sites), I (Intangible), L (Library),
+                 M (Museum), N (NGO), O (Official), R (Research), S (Society),
+                 T (Taste/Smell), U (Unknown), X (Mixed), ALL
 """
 
 import argparse
@@ -354,16 +360,32 @@ def update_yaml_ghcid(data: dict, new_ghcid: str, old_ghcid: str, geonames_data:
     return data
 
 
-def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
-    """Find all Type I files with GHCID location mismatches."""
+def find_mismatched_files(custodian_dir: Path, db_path: str, inst_type: str = 'I') -> list:
+    """Find all files of given type with GHCID location mismatches.
+
+    Args:
+        custodian_dir: Path to custodian directory
+        db_path: Path to GeoNames database
+        inst_type: Institution type code (I, M, A, L, etc.) or 'ALL' for all types
+    """
     mismatches = []
 
-    for filepath in sorted(custodian_dir.glob('NL-*-I-*.yaml')):
+    # Build glob pattern based on institution type
+    if inst_type == 'ALL':
+        pattern = 'NL-*-*.yaml'
+    else:
+        pattern = f'NL-*-{inst_type}-*.yaml'
+
+    for filepath in sorted(custodian_dir.glob(pattern)):
         filename = filepath.stem
         current_ghcid = filename
 
+        # Skip PENDING files (no location data)
+        if 'PENDING' in current_ghcid:
+            continue
+
         # Parse current GHCID
-        country, region, city_code, inst_type, abbrev = parse_ghcid(current_ghcid)
+        country, region, city_code, file_inst_type, abbrev = parse_ghcid(current_ghcid)
         if not abbrev:
             continue
 
@@ -399,7 +421,7 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
             country,
             geonames_data['province_code'],
             expected_city_code,
-            inst_type,
+            file_inst_type,
             abbrev
         )
 
@@ -419,6 +441,10 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
     return mismatches
 
 
+# Valid institution type codes
+VALID_INST_TYPES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'ALL']
+
+
 def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) -> bool:
     """Check if the new GHCID would collide with an existing file."""
     new_filepath = custodian_dir / f"{new_ghcid}.yaml"
@@ -426,12 +452,19 @@ def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) ->
 
 
 def main():
-    parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for Type I custodian files')
+    parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for heritage custodian files')
     parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying')
+    parser.add_argument('--type', '-t', default='I', choices=VALID_INST_TYPES,
+                        help='Institution type code: A (Archive), G (Gallery), H (Holy Sites), '
+                             'I (Intangible, default), L (Library), M (Museum), N (NGO), O (Official), '
+                             'R (Research), S (Society), T (Taste/Smell), U (Unknown), X (Mixed), '
+                             'ALL (all types)')
     parser.add_argument('--custodian-dir', default='data/custodian', help='Path to custodian directory')
     parser.add_argument('--geonames-db', default='data/reference/geonames.db', help='Path to GeoNames database')
     args = parser.parse_args()
 
+    inst_type = args.type
+
     # Resolve paths
     script_dir = Path(__file__).parent.parent
     custodian_dir = script_dir / args.custodian_dir
@@ -446,16 +479,18 @@ def main():
         return 1
 
     print("=" * 80)
-    print("GHCID Location Mismatch Fixer for Type I (Intangible Heritage) Custodians")
+    type_name = 'ALL types' if inst_type == 'ALL' else f'Type {inst_type}'
+    print(f"GHCID Location Mismatch Fixer for {type_name} Heritage Custodians")
     print("=" * 80)
     print(f"Custodian directory: {custodian_dir}")
     print(f"GeoNames database: {db_path}")
+    print(f"Institution type: {inst_type}")
     print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
     print()
 
     # Find mismatches
     print("Scanning for location mismatches...")
-    mismatches = find_mismatched_files(custodian_dir, str(db_path))
+    mismatches = find_mismatched_files(custodian_dir, str(db_path), inst_type)
     print(f"Found {len(mismatches)} files with GHCID location mismatches")
     print()
 