glam/scripts/reclassify_non_dutch_pending.py
2026-01-09 20:35:19 +01:00

205 lines
7.5 KiB
Python

#!/usr/bin/env python3
"""
Reclassify NL-XX-XXX-PENDING files that are clearly from other countries.
This script:
1. Scans NL PENDING files for country-specific indicators
2. Renames files with correct country prefixes
3. Logs all changes
Usage:
python scripts/reclassify_non_dutch_pending.py --dry-run
python scripts/reclassify_non_dutch_pending.py
"""
import re
import yaml
import shutil
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Tuple
# Directory that holds the custodian YAML files. main() globs it for
# "NL-XX-XXX-PENDING-*.yaml" and process_file() builds the renamed target
# path inside it.
# NOTE(review): absolute, machine-specific path - confirm before running on
# another checkout or host.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
# Country detection patterns - order matters (more specific first)
#
# Each entry is a (regex, country_code) pair. detect_country() walks the list
# top to bottom with re.search(..., re.IGNORECASE) and returns the code of the
# first pattern that matches, so an earlier entry always wins over a later one.
#
# Because matching is case-insensitive, the short upper-case tokens (DAI, EFR,
# UK, GIA, ERIC, ADVN, ...) also match the same letters in any case when they
# appear as a whole word.
# NOTE(review): several tokens are ordinary place names or words (Bergen,
# Victoria, Java, Porto, ...) - confirm they cannot occur in genuine Dutch
# custodian names before a non-dry run.
# NOTE(review): 'British School at Athens' in the Greece pattern cannot win
# through this list, because the earlier UK pattern already matches 'British';
# that organization is resolved through KNOWN_ORGS instead.
COUNTRY_PATTERNS = [
# Indonesia
(r'\b(Indonesia|ANRI|Arsip Nasional|Indonesian|Jakarta|Yogyakarta|Bandung|Bali|Java)\b', 'ID'),
# Germany
(r'\b(German|Germany|Deutschland|Deutsches?|Berlin|München|Munich|Hamburg|Frankfurt|Marburg|Köln|Cologne|Heidelberg|DAI)\b', 'DE'),
# France
(r'\b(French|France|Français|Française|Paris|Lyon|Marseille|Rennes|Toulouse|Bordeaux|EFR|École française)\b', 'FR'),
(r'\b(Musée|Château|Bibliothèque nationale|Archives de|Centre des monuments nationaux)\b', 'FR'),
# Italy
(r'\b(Italian|Italy|Italia|Italiano|Roma|Rome|Milano|Milan|Venezia|Venice|Firenze|Florence|Venaria Reale)\b', 'IT'),
# UK
(r'\b(British|Britain|UK|United Kingdom|England|London|Edinburgh|Scotland|Wales|Oxford|Cambridge|Historic Royal Palaces)\b', 'GB'),
# USA
(r'\b(American|America|USA|United States|Washington|New York|California|Smithsonian|Library of Congress|GIA)\b', 'US'),
# Australia
(r'\b(Australian|Australia|Sydney|Melbourne|Brisbane|Victoria|Queensland|Canberra)\b', 'AU'),
# Spain
(r'\b(Spanish|Spain|España|Español|Madrid|Barcelona|Sevilla|Valencia|Bilbao)\b', 'ES'),
# Portugal
(r'\b(Portuguese|Portugal|Lisboa|Lisbon|Porto|Portuguesa)\b', 'PT'),
# Greece
(r'\b(Greek|Greece|Athens|British School at Athens)\b', 'GR'),
# Denmark
(r'\b(Danish|Denmark|København|Copenhagen|Fonden)\b', 'DK'),
# Belgium
(r'\b(Belgian|Belgium|Belgique|België|Brussels|Bruxelles|Antwerp|Antwerpen|Gent|Ghent|ADVN)\b', 'BE'),
# International orgs (not Dutch)
(r'\b(DARIAH|ERIC|European Union|IUCN|International Society)\b', 'EU'),
# Saudi Arabia
(r'\b(Saudi|Arabia|Saudi Arabian|Riyadh|Jeddah|Arabian Oud)\b', 'SA'),
# Japan
(r'\b(Japanese|Japan|Tokyo|Kyoto|Osaka)\b', 'JP'),
# China
(r'\b(Chinese|China|Beijing|Shanghai)\b', 'CN'),
# India
(r'\b(Indian|India|Mumbai|Delhi|Bangalore|Kolkata)\b', 'IN'),
# Kenya
(r'\b(Kenyan|Kenya|Nairobi)\b', 'KE'),
# Israel
(r'\b(Israeli|Israel|Jerusalem|Tel Aviv)\b', 'IL'),
# Norway
(r'\b(Norwegian|Norway|Oslo|Bergen)\b', 'NO'),
]
# Known organizations with their countries
#
# Maps an organization name to the country code it should be filed under.
# detect_country() consults this table before COUNTRY_PATTERNS: it lower-cases
# both sides and returns the code of the first key (in dict order) that occurs
# as a substring of the name, so an entry here overrides whatever the regex
# patterns would have produced (e.g. 'British School at Athens' -> GR rather
# than GB, 'DARIAH-ERIC' -> DE rather than EU).
# NOTE(review): matching is by substring, so a short key such as 'Bonhams' or
# 'CIFOR' also fires when it is embedded in a longer name - confirm that is
# intended.
KNOWN_ORGS = {
'Anne Frank Educational Center': 'DE', # Frankfurt
'Archives de Rennes': 'FR',
'Arsip Nasional Republik Indonesia': 'ID',
'Bildarchiv Foto Marburg': 'DE',
'British School at Athens': 'GR',
'British Trust for Ornithology': 'GB',
'Centre des monuments nationaux': 'FR',
'Centro Conservazione Restauro La Venaria Reale': 'IT',
'Château de Chantilly': 'FR',
'Deutsches Archäologisches Institut': 'DE',
'École française de Rome': 'IT', # Based in Rome
'European Museum Academy': 'EU',
'GIA Gemological Institute of America': 'US',
'Historic Royal Palaces': 'GB',
'IRHT Institut de recherche et d\'histoire des textes': 'FR',
'Archaeological Research Services Ltd': 'GB',
'Australian Museums and Galleries Association Victoria': 'AU',
'Australian Society of Archivists': 'AU',
'Art:1 New Museum': 'ID', # Jakarta
'Art Zoo Museum': 'ID', # Jakarta
'Asmat Museum of Culture and Progress': 'ID',
'Arabian Oud': 'SA',
'Bonhams': 'GB',
'CIFOR': 'ID', # Bogor, Indonesia
'DARIAH-ERIC': 'DE', # Based in Germany
'Fisheries Resource Center of Indonesia': 'ID',
'Common Wadden Sea Secretariat': 'DE', # Wilhelmshaven
'International Society of Arboriculture': 'US',
'IUCN SSC Shark Specialist Group': 'CA', # Based in Canada
'Augustinus Fonden': 'DK',
}
def detect_country(emic_name: str) -> Optional[str]:
    """Guess the country code an organization name belongs to.

    The curated ``KNOWN_ORGS`` table is tried first (case-insensitive
    substring match, first key in table order wins). If no known
    organization is found, the ``COUNTRY_PATTERNS`` regexes are tried in
    list order, case-insensitively, and the first match decides.

    Args:
        emic_name: Organization name as recorded in the custodian file.

    Returns:
        The matching country code, or ``None`` when nothing matches.
    """
    needle = emic_name.lower()

    # Curated names take precedence over the generic keyword patterns.
    for known_name in KNOWN_ORGS:
        if known_name.lower() in needle:
            return KNOWN_ORGS[known_name]

    # Fall back to keyword patterns; list order decides between overlaps.
    for regex, code in COUNTRY_PATTERNS:
        if re.compile(regex, re.IGNORECASE).search(emic_name):
            return code

    return None
def process_file(filepath: Path, dry_run: bool = True) -> Optional[Tuple[str, str]]:
    """Reclassify one NL PENDING custodian file if its name points abroad.

    The file is parsed as YAML and ``custodian_name.emic_name`` is handed to
    ``detect_country``. When a non-NL code comes back, the record gets the new
    ``location.country`` plus ``ghcid_resolution`` provenance, is written back,
    and is then moved into ``CUSTODIAN_DIR`` under a name whose
    ``NL-XX-XXX-PENDING`` marker carries the detected country code.

    Args:
        filepath: Path of the PENDING YAML file to inspect.
        dry_run: When true, only print what would happen; nothing is written
            or moved.

    Returns:
        ``None`` when the file is skipped (no usable name, no foreign country
        detected) or an error occurred; ``('dry_run', country)`` in dry-run
        mode; otherwise ``(new_filename, country)``.

    Any exception raised while handling the file is caught, reported as an
    ``[ERROR]`` line and turned into a ``None`` result so that one bad file
    does not abort a batch run.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty file loads as None and a scalar/list document has no
        # custodian record; report it clearly instead of failing on .get().
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: expected a YAML mapping, got {type(data).__name__}")
            return None

        # `or {}` also covers a key that is present but null in the YAML.
        emic_name = (data.get('custodian_name') or {}).get('emic_name', '')
        if not emic_name:
            return None

        # Detect country
        country = detect_country(emic_name)
        if not country or country == 'NL':
            return None

        # Generate new filename: swap the NL marker for the detected code.
        old_name = filepath.name
        new_name = old_name.replace('NL-XX-XXX-PENDING', f'{country}-XX-XXX-PENDING')
        new_path = CUSTODIAN_DIR / new_name

        # Avoid collision: keep bumping the numeric suffix until the name is
        # free, so an existing "-2" file is never overwritten by the move.
        if new_path.exists():
            stem = new_path.stem
            attempt = 2
            new_path = CUSTODIAN_DIR / f"{stem}-{attempt}.yaml"
            while new_path.exists():
                attempt += 1
                new_path = CUSTODIAN_DIR / f"{stem}-{attempt}.yaml"
            new_name = new_path.name

        if dry_run:
            print(f"[WOULD RECLASSIFY] {emic_name}")
            print(f" NL -> {country}")
            print(f" {old_name} -> {new_name}")
            return ('dry_run', country)

        # Update data. A null section (e.g. a bare "location:" line) is
        # treated like a missing one; assigning back to an existing key keeps
        # its position in the dumped file.
        location = data.get('location') or {}
        location['country'] = country
        data['location'] = location

        # Add reclassification provenance
        resolution = data.get('ghcid_resolution') or {}
        resolution['reclassified_from'] = 'NL'
        resolution['reclassified_to'] = country
        resolution['reclassified_at'] = datetime.now(timezone.utc).isoformat()
        data['ghcid_resolution'] = resolution

        # Rewrite in place first, then rename, so the moved file already
        # carries the updated country and provenance.
        with open(filepath, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
        shutil.move(filepath, new_path)

        print(f"[RECLASSIFIED] {emic_name}: NL -> {country}")
        return (new_name, country)
    except Exception as e:
        # Per-file boundary of a batch job: report and continue with the next.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
def main():
    """Command-line entry point: scan NL PENDING files and reclassify them.

    Parses the optional ``--dry-run`` flag, runs ``process_file`` on every
    ``NL-XX-XXX-PENDING-*.yaml`` file found in ``CUSTODIAN_DIR`` and prints a
    total followed by a per-country breakdown ordered by descending count.
    """
    import argparse
    from collections import defaultdict

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    dry_run = cli.parse_args().dry_run

    # Find NL PENDING files
    candidates = list(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Scanning {len(candidates)} NL PENDING files for non-Dutch organizations...\n")

    per_country = defaultdict(int)
    handled = 0
    for candidate in candidates:
        outcome = process_file(candidate, dry_run=dry_run)
        if not outcome:
            # Skipped or failed file; nothing to tally.
            continue
        handled += 1
        per_country[outcome[1]] += 1

    heading = 'Would reclassify' if dry_run else 'Reclassified'
    print(f"\n{heading}: {handled}")

    if not per_country:
        return

    print("\nBy country:")
    # Stable sort: countries with equal counts keep first-seen order.
    ranking = sorted(per_country.items(), key=lambda pair: pair[1], reverse=True)
    for code, tally in ranking:
        print(f" {code}: {tally}")


if __name__ == '__main__':
    main()