refactor(scripts): generalize GHCID location fixer for all institution types
- Add --type/-t flag to specify institution type (A, G, H, I, L, M, N, O, R, S, T, U, X, ALL) - Default still Type I (Intangible Heritage) for backward compatibility - Skip PENDING files that have no location data - Update help text with all supported types
This commit is contained in:
parent
4d5641b6c5
commit
933deb337c
1 changed file with 46 additions and 11 deletions
|
|
@@ -1,6 +1,6 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fix GHCID location mismatches for Type I (Intangible Heritage) custodian files.
|
||||
Fix GHCID location mismatches for heritage custodian files.
|
||||
|
||||
This script:
|
||||
1. Identifies files where GHCID location component doesn't match actual location in locations[] array
|
||||
|
|
@@ -10,8 +10,14 @@ This script:
|
|||
5. Renames files to match new GHCID
|
||||
|
||||
Usage:
|
||||
python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview changes
|
||||
python scripts/fix_ghcid_location_mismatches.py # Apply changes
|
||||
python scripts/fix_ghcid_location_mismatches.py --dry-run # Preview Type I changes
|
||||
python scripts/fix_ghcid_location_mismatches.py --type M --dry-run # Preview Museum changes
|
||||
python scripts/fix_ghcid_location_mismatches.py --type A # Fix Archive files
|
||||
python scripts/fix_ghcid_location_mismatches.py --type ALL --dry-run # Preview ALL types
|
||||
|
||||
Supported types: A (Archive), G (Gallery), H (Holy Sites), I (Intangible), L (Library),
|
||||
M (Museum), N (NGO), O (Official), R (Research), S (Society),
|
||||
T (Taste/Smell), U (Unknown), X (Mixed), ALL
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@@ -354,16 +360,32 @@ def update_yaml_ghcid(data: dict, new_ghcid: str, old_ghcid: str, geonames_data:
|
|||
return data
|
||||
|
||||
|
||||
def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
|
||||
"""Find all Type I files with GHCID location mismatches."""
|
||||
def find_mismatched_files(custodian_dir: Path, db_path: str, inst_type: str = 'I') -> list:
|
||||
"""Find all files of given type with GHCID location mismatches.
|
||||
|
||||
Args:
|
||||
custodian_dir: Path to custodian directory
|
||||
db_path: Path to GeoNames database
|
||||
inst_type: Institution type code (I, M, A, L, etc.) or 'ALL' for all types
|
||||
"""
|
||||
mismatches = []
|
||||
|
||||
for filepath in sorted(custodian_dir.glob('NL-*-I-*.yaml')):
|
||||
# Build glob pattern based on institution type
|
||||
if inst_type == 'ALL':
|
||||
pattern = 'NL-*-*.yaml'
|
||||
else:
|
||||
pattern = f'NL-*-{inst_type}-*.yaml'
|
||||
|
||||
for filepath in sorted(custodian_dir.glob(pattern)):
|
||||
filename = filepath.stem
|
||||
current_ghcid = filename
|
||||
|
||||
# Skip PENDING files (no location data)
|
||||
if 'PENDING' in current_ghcid:
|
||||
continue
|
||||
|
||||
# Parse current GHCID
|
||||
country, region, city_code, inst_type, abbrev = parse_ghcid(current_ghcid)
|
||||
country, region, city_code, file_inst_type, abbrev = parse_ghcid(current_ghcid)
|
||||
|
||||
if not abbrev:
|
||||
continue
|
||||
|
|
@@ -399,7 +421,7 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
|
|||
country,
|
||||
geonames_data['province_code'],
|
||||
expected_city_code,
|
||||
inst_type,
|
||||
file_inst_type,
|
||||
abbrev
|
||||
)
|
||||
|
||||
|
|
@@ -419,6 +441,10 @@ def find_mismatched_files(custodian_dir: Path, db_path: str) -> list:
|
|||
return mismatches
|
||||
|
||||
|
||||
# Valid institution type codes
|
||||
# NOTE(review): 'B', 'C', 'D', 'E', 'F', and 'P' are accepted here but are not
# listed in the module docstring, the --type help text, or the commit message
# (which documents only A, G, H, I, L, M, N, O, R, S, T, U, X, ALL) — confirm
# whether these extra codes are real institution types or should be removed.
VALID_INST_TYPES = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'X', 'ALL']
|
||||
|
||||
|
||||
def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) -> bool:
|
||||
"""Check if the new GHCID would collide with an existing file."""
|
||||
new_filepath = custodian_dir / f"{new_ghcid}.yaml"
|
||||
|
|
@@ -426,12 +452,19 @@ def check_collision(custodian_dir: Path, new_ghcid: str, old_filepath: Path) ->
|
|||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for Type I custodian files')
|
||||
parser = argparse.ArgumentParser(description='Fix GHCID location mismatches for heritage custodian files')
|
||||
parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying')
|
||||
parser.add_argument('--type', '-t', default='I', choices=VALID_INST_TYPES,
|
||||
help='Institution type code: A (Archive), G (Gallery), H (Holy Sites), '
|
||||
'I (Intangible, default), L (Library), M (Museum), N (NGO), O (Official), '
|
||||
'R (Research), S (Society), T (Taste/Smell), U (Unknown), X (Mixed), '
|
||||
'ALL (all types)')
|
||||
parser.add_argument('--custodian-dir', default='data/custodian', help='Path to custodian directory')
|
||||
parser.add_argument('--geonames-db', default='data/reference/geonames.db', help='Path to GeoNames database')
|
||||
args = parser.parse_args()
|
||||
|
||||
inst_type = args.type
|
||||
|
||||
# Resolve paths
|
||||
script_dir = Path(__file__).parent.parent
|
||||
custodian_dir = script_dir / args.custodian_dir
|
||||
|
|
@@ -446,16 +479,18 @@ def main():
|
|||
return 1
|
||||
|
||||
print("=" * 80)
|
||||
print("GHCID Location Mismatch Fixer for Type I (Intangible Heritage) Custodians")
|
||||
type_name = 'ALL types' if inst_type == 'ALL' else f'Type {inst_type}'
|
||||
print(f"GHCID Location Mismatch Fixer for {type_name} Heritage Custodians")
|
||||
print("=" * 80)
|
||||
print(f"Custodian directory: {custodian_dir}")
|
||||
print(f"GeoNames database: {db_path}")
|
||||
print(f"Institution type: {inst_type}")
|
||||
print(f"Mode: {'DRY RUN (preview only)' if args.dry_run else 'LIVE (applying changes)'}")
|
||||
print()
|
||||
|
||||
# Find mismatches
|
||||
print("Scanning for location mismatches...")
|
||||
mismatches = find_mismatched_files(custodian_dir, str(db_path))
|
||||
mismatches = find_mismatched_files(custodian_dir, str(db_path), inst_type)
|
||||
|
||||
print(f"Found {len(mismatches)} files with GHCID location mismatches")
|
||||
print()
|
||||
|
|
|
|||
Loading…
Reference in a new issue