refactor(scripts): generalize GHCID location fixer for all institution types

- Add --type/-t flag to specify institution type (A, G, H, I, L, M, N, O, R, S, T, U, X, ALL; the codes B, C, D, E, F, P are also accepted)
- Default is still Type I (Intangible Heritage) for backward compatibility
- Skip PENDING files that have no location data
- Update help text with all supported types
This commit is contained in:
kempersc 2026-01-09 11:54:28 +01:00
parent 4d5641b6c5
commit 933deb337c

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
Fix GHCID location mismatches for Type I (Intangible Heritage) custodian files.
Fix GHCID location mismatches for heritage custodian files.
This script:
1. Identifies files where GHCID location component doesn't match actual location in locations[] array
@ -10,8 +10,14 @@ This script:
5. Renames files to match new GHCID
Usage:
python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview changes
python scripts/fix_ghcid_location_mismatches.py # Apply changes
python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview Type I changes
python scripts/fix_ghcid_location_mismatches.py --type M --dry-run # Preview Museum changes
python scripts/fix_ghcid_location_mismatches.py --type A # Fix Archive files
python scripts/fix_ghcid_location_mismatches.py --type ALL --dry-run # Preview ALL types
Supported types: A (Archive), G (Gallery), H (Holy Sites), I (Intangible), L (Library),
M (Museum), N (NGO), O (Official), R (Research), S (Society),
T (Taste/Smell), U (Unknown), X (Mixed), ALL (every type).
The codes B, C, D, E, F and P are also accepted by --type.
"""
import argparse
@ -354,16 +360,32 @@ def update_yaml_ghcid(data: dict, new_ghcid: str, old_ghcid: str, geonames_data:
return data
def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
"""Find all Type I files with GHCID location mismatches."""
def find_mismatched_files(custodian_dir: Path, db_path: str, inst_type: str = 'I') -> list:
"""Find all files of given type with GHCID location mismatches.
Args:
custodian_dir: Path to custodian directory
db_path: Path to GeoNames database
inst_type: Institution type code (I, M, A, L, etc.) or 'ALL' for all types
"""
mismatches = []
for filepath in sorted(custodian_dir.glob('NL-*-I-*.yaml')):
# Build glob pattern based on institution type
if inst_type == 'ALL':
pattern = 'NL-*-*.yaml'
else:
pattern = f'NL-*-{inst_type}-*.yaml'
for filepath in sorted(custodian_dir.glob(pattern)):
filename = filepath.stem
current_ghcid = filename
# Skip PENDING files (no location data)
if 'PENDING' in current_ghcid:
continue
# Parse current GHCID
country, region, city_code, inst_type, abbrev = parse_ghcid(current_ghcid)
country, region, city_code, file_inst_type, abbrev = parse_ghcid(current_ghcid)
if not abbrev:
continue
@ -399,7 +421,7 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
country,
geonames_data['province_code'],
expected_city_code,
inst_type,
file_inst_type,
abbrev
)
@ -419,6 +441,10 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
return mismatches
# Valid institution type codes, used as the argparse `choices` for --type.
# 'ALL' is not a type code itself: it selects every type by widening the
# filename glob to 'NL-*-*.yaml'.
# NOTE(review): B, C, D, E, F and P are accepted here but are not described in
# the --type help text or the module docstring — confirm whether intentional.
VALID_INST_TYPES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'ALL']
def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) -> bool:
"""Check if the new GHCID would collide with an existing file."""
new_filepath = custodian_dir / f"{new_ghcid}.yaml"
@ -426,12 +452,19 @@ def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) ->
def main():
parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for Type I custodian files')
parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for heritage custodian files')
parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying')
parser.add_argument('--type', '-t', default='I', choices=VALID_INST_TYPES,
help='Institution type code: A (Archive), G (Gallery), H (Holy Sites), '
'I (Intangible, default), L (Library), M (Museum), N (NGO), O (Official), '
'R (Research), S (Society), T (Taste/Smell), U (Unknown), X (Mixed), '
'ALL (all types)')
parser.add_argument('--custodian-dir', default='data/custodian', help='Path to custodian directory')
parser.add_argument('--geonames-db', default='data/reference/geonames.db', help='Path to GeoNames database')
args = parser.parse_args()
inst_type = args.type
# Resolve paths
script_dir = Path(__file__).parent.parent
custodian_dir = script_dir / args.custodian_dir
@ -446,16 +479,18 @@ def main():
return 1
print("=" * 80)
print("GHCID Location Mismatch Fixer for Type I (Intangible Heritage) Custodians")
type_name = 'ALL types' if inst_type == 'ALL' else f'Type {inst_type}'
print(f"GHCID Location Mismatch Fixer for {type_name} Heritage Custodians")
print("=" * 80)
print(f"Custodian directory: {custodian_dir}")
print(f"GeoNames database: {db_path}")
print(f"Institution type: {inst_type}")
print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
print()
# Find mismatches
print("Scanning for location mismatches...")
mismatches = find_mismatched_files(custodian_dir, str(db_path))
mismatches = find_mismatched_files(custodian_dir, str(db_path), inst_type)
print(f"Found {len(mismatches)} files with GHCID location mismatches")
print()