#!/usr/bin/env python3
"""
Reclassify NL-XX-XXX-PENDING files that are clearly from other countries.

This script:
1. Scans NL PENDING files for country-specific indicators
2. Renames files with correct country prefixes
3. Logs all changes

Usage:
    python scripts/reclassify_non_dutch_pending.py --dry-run
    python scripts/reclassify_non_dutch_pending.py
"""

import argparse
import re
import shutil
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple

CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Country detection patterns - order matters (more specific first).
# Each entry is (regex, country code); the first pattern that matches wins.
# NOTE(review): these are name heuristics matched case-insensitively, so short
# tokens and place names shared between countries can misfire -- review the
# --dry-run output before running for real.
COUNTRY_PATTERNS = [
    # Indonesia
    (r'\b(Indonesia|ANRI|Arsip Nasional|Indonesian|Jakarta|Yogyakarta|Bandung|Bali|Java)\b', 'ID'),
    # Germany
    (r'\b(German|Germany|Deutschland|Deutsches?|Berlin|München|Munich|Hamburg|Frankfurt|Marburg|Köln|Cologne|Heidelberg|DAI)\b', 'DE'),
    # France
    (r'\b(French|France|Français|Française|Paris|Lyon|Marseille|Rennes|Toulouse|Bordeaux|EFR|École française)\b', 'FR'),
    (r'\b(Musée|Château|Bibliothèque nationale|Archives de|Centre des monuments nationaux)\b', 'FR'),
    # Italy
    (r'\b(Italian|Italy|Italia|Italiano|Roma|Rome|Milano|Milan|Venezia|Venice|Firenze|Florence|Venaria Reale)\b', 'IT'),
    # UK
    (r'\b(British|Britain|UK|United Kingdom|England|London|Edinburgh|Scotland|Wales|Oxford|Cambridge|Historic Royal Palaces)\b', 'GB'),
    # USA
    (r'\b(American|America|USA|United States|Washington|New York|California|Smithsonian|Library of Congress|GIA)\b', 'US'),
    # Australia
    (r'\b(Australian|Australia|Sydney|Melbourne|Brisbane|Victoria|Queensland|Canberra)\b', 'AU'),
    # Spain
    (r'\b(Spanish|Spain|España|Español|Madrid|Barcelona|Sevilla|Valencia|Bilbao)\b', 'ES'),
    # Portugal
    (r'\b(Portuguese|Portugal|Lisboa|Lisbon|Porto|Portuguesa)\b', 'PT'),
    # Greece
    (r'\b(Greek|Greece|Athens|British School at Athens)\b', 'GR'),
    # Denmark
    (r'\b(Danish|Denmark|København|Copenhagen|Fonden)\b', 'DK'),
    # Belgium
    (r'\b(Belgian|Belgium|Belgique|België|Brussels|Bruxelles|Antwerp|Antwerpen|Gent|Ghent|ADVN)\b', 'BE'),
    # International orgs (not Dutch)
    (r'\b(DARIAH|ERIC|European Union|IUCN|International Society)\b', 'EU'),
    # Saudi Arabia
    (r'\b(Saudi|Arabia|Saudi Arabian|Riyadh|Jeddah|Arabian Oud)\b', 'SA'),
    # Japan
    (r'\b(Japanese|Japan|Tokyo|Kyoto|Osaka)\b', 'JP'),
    # China
    (r'\b(Chinese|China|Beijing|Shanghai)\b', 'CN'),
    # India
    (r'\b(Indian|India|Mumbai|Delhi|Bangalore|Kolkata)\b', 'IN'),
    # Kenya
    (r'\b(Kenyan|Kenya|Nairobi)\b', 'KE'),
    # Israel
    (r'\b(Israeli|Israel|Jerusalem|Tel Aviv)\b', 'IL'),
    # Norway
    (r'\b(Norwegian|Norway|Oslo|Bergen)\b', 'NO'),
]

# Known organizations with their countries.
# These are consulted before COUNTRY_PATTERNS, so an entry here overrides
# whatever the regex heuristics would have returned for the same name.
KNOWN_ORGS = {
    'Anne Frank Educational Center': 'DE',  # Frankfurt
    'Archives de Rennes': 'FR',
    'Arsip Nasional Republik Indonesia': 'ID',
    'Bildarchiv Foto Marburg': 'DE',
    'British School at Athens': 'GR',
    'British Trust for Ornithology': 'GB',
    'Centre des monuments nationaux': 'FR',
    'Centro Conservazione Restauro La Venaria Reale': 'IT',
    'Château de Chantilly': 'FR',
    'Deutsches Archäologisches Institut': 'DE',
    'École française de Rome': 'IT',  # Based in Rome
    'European Museum Academy': 'EU',
    'GIA Gemological Institute of America': 'US',
    'Historic Royal Palaces': 'GB',
    'IRHT Institut de recherche et d\'histoire des textes': 'FR',
    'Archaeological Research Services Ltd': 'GB',
    'Australian Museums and Galleries Association Victoria': 'AU',
    'Australian Society of Archivists': 'AU',
    'Art:1 New Museum': 'ID',  # Jakarta
    'Art Zoo Museum': 'ID',  # Jakarta
    'Asmat Museum of Culture and Progress': 'ID',
    'Arabian Oud': 'SA',
    'Bonhams': 'GB',
    'CIFOR': 'ID',  # Bogor, Indonesia
    'DARIAH-ERIC': 'DE',  # Based in Germany
    'Fisheries Resource Center of Indonesia': 'ID',
    'Common Wadden Sea Secretariat': 'DE',  # Wilhelmshaven
    'International Society of Arboriculture': 'US',
    'IUCN SSC Shark Specialist Group': 'CA',  # Based in Canada
    'Augustinus Fonden': 'DK',
}


def detect_country(emic_name: str) -> Optional[str]:
    """Detect country from organization name.

    Args:
        emic_name: The organization's name as recorded in the custodian file.

    Returns:
        The code mapped to the first matching KNOWN_ORGS entry
        (case-insensitive substring match), otherwise the code of the first
        matching COUNTRY_PATTERNS regex, otherwise ``None``. An empty or
        ``None`` name yields ``None``.
    """
    if not emic_name:
        return None

    # Check known orgs first; lower-case the name once, not once per entry.
    lowered = emic_name.lower()
    for org, country in KNOWN_ORGS.items():
        if org.lower() in lowered:
            return country

    # Check patterns in declaration order (first match wins).
    for pattern, country in COUNTRY_PATTERNS:
        if re.search(pattern, emic_name, re.IGNORECASE):
            return country

    return None


def _unique_destination(path: Path) -> Path:
    """Return *path* if it is free, else the first free ``<stem>-N<suffix>``.

    N starts at 2, so the first fallback is the familiar ``-2`` name; the
    search continues upward so an existing ``-2`` file is not chosen as the
    move target.
    """
    if not path.exists():
        return path
    counter = 2
    while True:
        candidate = path.with_name(f"{path.stem}-{counter}{path.suffix}")
        if not candidate.exists():
            return candidate
        counter += 1


def process_file(filepath: Path, dry_run: bool = True) -> Optional[Tuple[str, str]]:
    """Process a single PENDING file.

    Reads the YAML file, detects a country from
    ``custodian_name.emic_name`` and, when that country is not ``NL``,
    renames the file into CUSTODIAN_DIR with the new country prefix after
    recording the change inside the document.

    Args:
        filepath: Path of the ``NL-XX-XXX-PENDING-*.yaml`` file to inspect.
        dry_run: When true, only print what would happen; nothing is written
            or moved.

    Returns:
        ``('dry_run', country)`` in dry-run mode, ``(new_name, country)``
        after a real reclassification, or ``None`` when the file is skipped
        or an error occurred (errors are printed, not raised).
    """
    # PyYAML is imported here, at its only point of use, so that
    # detect_country() stays importable without the third-party dependency.
    import yaml

    # Deliberately broad: one unreadable or malformed file must not abort the
    # whole batch, so any failure is reported and the file is skipped.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file loads as None and a scalar/list document has no keys.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: top-level YAML value is not a mapping")
            return None

        # 'custodian_name' may be absent or explicitly null.
        custodian = data.get('custodian_name')
        emic_name = custodian.get('emic_name') if isinstance(custodian, dict) else None
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Detect country
        country = detect_country(emic_name)
        if not country or country == 'NL':
            return None

        # Generate new filename: replace NL- with new country code
        old_name = filepath.name
        new_name = old_name.replace('NL-XX-XXX-PENDING', f'{country}-XX-XXX-PENDING')

        # Avoid collision: never pick a destination that already exists.
        new_path = _unique_destination(CUSTODIAN_DIR / new_name)
        new_name = new_path.name

        if dry_run:
            print(f"[WOULD RECLASSIFY] {emic_name}")
            print(f" NL -> {country}")
            print(f" {old_name} -> {new_name}")
            return ('dry_run', country)

        # Update data ('or {}' also covers a key that is present but null)
        data['location'] = data.get('location') or {}
        data['location']['country'] = country

        # Add reclassification provenance
        data['ghcid_resolution'] = data.get('ghcid_resolution') or {}
        data['ghcid_resolution']['reclassified_from'] = 'NL'
        data['ghcid_resolution']['reclassified_to'] = country
        data['ghcid_resolution']['reclassified_at'] = datetime.now(timezone.utc).isoformat()

        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

        shutil.move(filepath, new_path)
        print(f"[RECLASSIFIED] {emic_name}: NL -> {country}")
        return (new_name, country)

    except Exception as e:
        print(f"[ERROR] {filepath.name}: {e}")
        return None


def main() -> None:
    """Scan CUSTODIAN_DIR for NL PENDING files and reclassify non-Dutch ones."""
    parser = argparse.ArgumentParser(
        description="Reclassify NL-XX-XXX-PENDING files that belong to other countries."
    )
    parser.add_argument('--dry-run', action='store_true',
                        help="print the planned changes without modifying any file")
    args = parser.parse_args()

    # Find NL PENDING files; sorted so the output order is reproducible.
    pending_files = sorted(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Scanning {len(pending_files)} NL PENDING files for non-Dutch organizations...\n")

    country_counts = Counter()
    for filepath in pending_files:
        result = process_file(filepath, dry_run=args.dry_run)
        if result:
            _, country = result
            country_counts[country] += 1

    reclassified = sum(country_counts.values())
    print(f"\n{'Would reclassify' if args.dry_run else 'Reclassified'}: {reclassified}")

    if country_counts:
        print("\nBy country:")
        # most_common() orders by descending count.
        for country, count in country_counts.most_common():
            print(f" {country}: {count}")


if __name__ == '__main__':
    main()