#!/usr/bin/env python3
"""
Reclassify NL-XX-XXX-PENDING files that are clearly from other countries.

This script:
1. Scans NL PENDING files for country-specific indicators in the organization name
2. Renames matching files with the detected country prefix and records the
   reclassification in the file's YAML
3. Prints every change to stdout

Usage:
    python scripts/reclassify_non_dutch_pending.py --dry-run   # preview only, nothing is modified
    python scripts/reclassify_non_dutch_pending.py             # apply the changes
"""
|
|
|
|
import re
|
|
import yaml
|
|
import shutil
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional, Tuple
|
|
|
|
# Directory that main() scans for NL-XX-XXX-PENDING-*.yaml files and in which
# process_file() places the renamed files.
# NOTE(review): absolute, machine-specific path — confirm before running on another host.
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")
|
|
|
|
# (regex, two-letter code) rules used by detect_country() as a fallback after
# KNOWN_ORGS. They are tried top to bottom and the first pattern that matches
# wins, so the order of this list is significant. detect_country() applies
# them with re.IGNORECASE.
# NOTE(review): some alternatives are ambiguous as plain words (e.g. "ERIC",
# "Victoria", "Bergen", "Java", "Gent") — check a --dry-run for false
# positives before applying.
COUNTRY_PATTERNS = [
    # Indonesia
    (
        r'\b(Indonesia|ANRI|Arsip Nasional|Indonesian|Jakarta|Yogyakarta|Bandung|Bali|Java)\b',
        'ID',
    ),
    # Germany
    (
        r'\b(German|Germany|Deutschland|Deutsches?|Berlin|München|Munich|Hamburg|Frankfurt|Marburg|Köln|Cologne|Heidelberg|DAI)\b',
        'DE',
    ),
    # France (names and cities)
    (
        r'\b(French|France|Français|Française|Paris|Lyon|Marseille|Rennes|Toulouse|Bordeaux|EFR|École française)\b',
        'FR',
    ),
    # France (institution vocabulary)
    (
        r'\b(Musée|Château|Bibliothèque nationale|Archives de|Centre des monuments nationaux)\b',
        'FR',
    ),
    # Italy
    (
        r'\b(Italian|Italy|Italia|Italiano|Roma|Rome|Milano|Milan|Venezia|Venice|Firenze|Florence|Venaria Reale)\b',
        'IT',
    ),
    # UK
    (
        r'\b(British|Britain|UK|United Kingdom|England|London|Edinburgh|Scotland|Wales|Oxford|Cambridge|Historic Royal Palaces)\b',
        'GB',
    ),
    # USA
    (
        r'\b(American|America|USA|United States|Washington|New York|California|Smithsonian|Library of Congress|GIA)\b',
        'US',
    ),
    # Australia
    (
        r'\b(Australian|Australia|Sydney|Melbourne|Brisbane|Victoria|Queensland|Canberra)\b',
        'AU',
    ),
    # Spain
    (
        r'\b(Spanish|Spain|España|Español|Madrid|Barcelona|Sevilla|Valencia|Bilbao)\b',
        'ES',
    ),
    # Portugal
    (
        r'\b(Portuguese|Portugal|Lisboa|Lisbon|Porto|Portuguesa)\b',
        'PT',
    ),
    # Greece
    (
        r'\b(Greek|Greece|Athens|British School at Athens)\b',
        'GR',
    ),
    # Denmark
    (
        r'\b(Danish|Denmark|København|Copenhagen|Fonden)\b',
        'DK',
    ),
    # Belgium
    (
        r'\b(Belgian|Belgium|Belgique|België|Brussels|Bruxelles|Antwerp|Antwerpen|Gent|Ghent|ADVN)\b',
        'BE',
    ),
    # International orgs (not Dutch)
    (
        r'\b(DARIAH|ERIC|European Union|IUCN|International Society)\b',
        'EU',
    ),
    # Saudi Arabia
    (
        r'\b(Saudi|Arabia|Saudi Arabian|Riyadh|Jeddah|Arabian Oud)\b',
        'SA',
    ),
    # Japan
    (
        r'\b(Japanese|Japan|Tokyo|Kyoto|Osaka)\b',
        'JP',
    ),
    # China
    (
        r'\b(Chinese|China|Beijing|Shanghai)\b',
        'CN',
    ),
    # India
    (
        r'\b(Indian|India|Mumbai|Delhi|Bangalore|Kolkata)\b',
        'IN',
    ),
    # Kenya
    (
        r'\b(Kenyan|Kenya|Nairobi)\b',
        'KE',
    ),
    # Israel
    (
        r'\b(Israeli|Israel|Jerusalem|Tel Aviv)\b',
        'IL',
    ),
    # Norway
    (
        r'\b(Norwegian|Norway|Oslo|Bergen)\b',
        'NO',
    ),
]
|
|
|
|
# Exact organization names mapped to their country code. detect_country()
# consults this table before COUNTRY_PATTERNS, using a case-insensitive
# substring match and returning the first entry (in insertion order) that
# occurs in the name — so these entries override the regex heuristics.
KNOWN_ORGS = {
    "Anne Frank Educational Center": "DE",  # Frankfurt
    "Archives de Rennes": "FR",
    "Arsip Nasional Republik Indonesia": "ID",
    "Bildarchiv Foto Marburg": "DE",
    "British School at Athens": "GR",
    "British Trust for Ornithology": "GB",
    "Centre des monuments nationaux": "FR",
    "Centro Conservazione Restauro La Venaria Reale": "IT",
    "Château de Chantilly": "FR",
    "Deutsches Archäologisches Institut": "DE",
    "École française de Rome": "IT",  # Based in Rome
    "European Museum Academy": "EU",
    "GIA Gemological Institute of America": "US",
    "Historic Royal Palaces": "GB",
    "IRHT Institut de recherche et d'histoire des textes": "FR",
    "Archaeological Research Services Ltd": "GB",
    "Australian Museums and Galleries Association Victoria": "AU",
    "Australian Society of Archivists": "AU",
    "Art:1 New Museum": "ID",  # Jakarta
    "Art Zoo Museum": "ID",  # Jakarta
    "Asmat Museum of Culture and Progress": "ID",
    "Arabian Oud": "SA",
    "Bonhams": "GB",
    "CIFOR": "ID",  # Bogor, Indonesia
    "DARIAH-ERIC": "DE",  # Based in Germany
    "Fisheries Resource Center of Indonesia": "ID",
    "Common Wadden Sea Secretariat": "DE",  # Wilhelmshaven
    "International Society of Arboriculture": "US",
    "IUCN SSC Shark Specialist Group": "CA",  # Based in Canada
    "Augustinus Fonden": "DK",
}
|
|
|
|
|
|
def detect_country(emic_name: str) -> Optional[str]:
    """Guess the country code of an organization from its name.

    ``KNOWN_ORGS`` is consulted first with a case-insensitive substring
    match; if no known organization occurs in the name, the regexes in
    ``COUNTRY_PATTERNS`` are tried in list order, case-insensitively. In
    both stages the first hit wins.

    Args:
        emic_name: Organization name to classify.

    Returns:
        The matched country code, or ``None`` when nothing matches or when
        ``emic_name`` is empty or not a string.
    """
    # Names come from YAML, so tolerate missing / non-string values instead
    # of failing on ``.lower()``.
    if not emic_name or not isinstance(emic_name, str):
        return None

    # Lowercase the name once rather than once per KNOWN_ORGS entry.
    lowered_name = emic_name.lower()
    for org, country in KNOWN_ORGS.items():
        if org.lower() in lowered_name:
            return country

    # Fall back to the generic keyword patterns.
    for pattern, country in COUNTRY_PATTERNS:
        if re.search(pattern, emic_name, re.IGNORECASE):
            return country

    return None
|
|
|
|
|
|
def _next_free_path(candidate: Path) -> Path:
    """Return ``candidate`` if unused, else the first free ``<stem>-N.yaml`` sibling (N >= 2)."""
    if not candidate.exists():
        return candidate
    stem = candidate.stem
    counter = 2
    while True:
        alternative = candidate.with_name(f"{stem}-{counter}.yaml")
        if not alternative.exists():
            return alternative
        counter += 1


def process_file(filepath: Path, dry_run: bool = True) -> Optional[Tuple[str, str]]:
    """Reclassify a single NL PENDING file if its organization is not Dutch.

    The file's ``custodian_name.emic_name`` is passed to ``detect_country``.
    When a non-NL country is detected, the ``NL-XX-XXX-PENDING`` prefix of
    the file name is replaced with ``<country>-XX-XXX-PENDING``, the YAML
    gets ``location.country`` and ``ghcid_resolution.reclassified_*``
    entries, and the file is moved to the new name inside ``CUSTODIAN_DIR``.

    Args:
        filepath: Path of the YAML file to inspect.
        dry_run: When true, only print what would happen; nothing is
            written or moved.

    Returns:
        ``('dry_run', country)`` in dry-run mode, ``(new_name, country)``
        after a real reclassification, or ``None`` when the file is skipped
        (no name, no foreign country detected) or an error occurred.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)

        # An empty or non-mapping document cannot hold a custodian name;
        # report it clearly instead of failing on ``.get``.
        if not isinstance(data, dict):
            print(f"[ERROR] {filepath.name}: expected a YAML mapping, got {type(data).__name__}")
            return None

        # ``custodian_name`` may be absent or null in the YAML.
        name_block = data.get('custodian_name')
        emic_name = name_block.get('emic_name', '') if isinstance(name_block, dict) else ''
        if not emic_name or not isinstance(emic_name, str):
            return None

        # Detect country
        country = detect_country(emic_name)
        if not country or country == 'NL':
            return None

        # Generate new filename: swap only the leading NL prefix.
        old_name = filepath.name
        new_name = old_name.replace('NL-XX-XXX-PENDING', f'{country}-XX-XXX-PENDING', 1)

        # Avoid collisions: keep probing -2, -3, ... so that an existing
        # file is never overwritten by the move below.
        new_path = _next_free_path(CUSTODIAN_DIR / new_name)
        new_name = new_path.name

        if dry_run:
            print(f"[WOULD RECLASSIFY] {emic_name}")
            print(f" NL -> {country}")
            print(f" {old_name} -> {new_name}")
            return ('dry_run', country)

        # Update data (a null ``location`` is treated like a missing one).
        if data.get('location') is None:
            data['location'] = {}
        data['location']['country'] = country

        # Add reclassification provenance
        if data.get('ghcid_resolution') is None:
            data['ghcid_resolution'] = {}
        resolution = data['ghcid_resolution']
        resolution['reclassified_from'] = 'NL'
        resolution['reclassified_to'] = country
        resolution['reclassified_at'] = datetime.now(timezone.utc).isoformat()

        # Serialize before opening the file for writing: opening with 'w'
        # truncates immediately, so a dump failure would otherwise leave a
        # partially written file behind.
        serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(serialized)

        shutil.move(str(filepath), str(new_path))
        print(f"[RECLASSIFIED] {emic_name}: NL -> {country}")

        return (new_name, country)

    except Exception as e:
        # Deliberate per-file boundary: one bad file must not abort the batch.
        print(f"[ERROR] {filepath.name}: {e}")
        return None
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the optional ``--dry-run`` flag, runs ``process_file`` on every
    ``NL-XX-XXX-PENDING-*.yaml`` file in ``CUSTODIAN_DIR`` and prints the
    total number of (would-be) reclassifications plus a per-country tally,
    highest count first.
    """
    import argparse
    from collections import defaultdict

    cli = argparse.ArgumentParser()
    cli.add_argument('--dry-run', action='store_true')
    dry_run = cli.parse_args().dry_run

    # Collect the candidates up front; the directory is modified while we iterate.
    candidates = list(CUSTODIAN_DIR.glob("NL-XX-XXX-PENDING-*.yaml"))
    print(f"Scanning {len(candidates)} NL PENDING files for non-Dutch organizations...\n")

    per_country = defaultdict(int)
    total = 0
    for path in candidates:
        outcome = process_file(path, dry_run=dry_run)
        if not outcome:
            continue
        total += 1
        per_country[outcome[1]] += 1

    label = 'Would reclassify' if dry_run else 'Reclassified'
    print(f"\n{label}: {total}")

    if per_country:
        print("\nBy country:")
        # Stable descending sort: ties keep first-seen order.
        ranked = sorted(per_country.items(), key=lambda item: item[1], reverse=True)
        for code, count in ranked:
            print(f" {code}: {count}")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|