#!/usr/bin/env python3
"""
Categorize PENDING files by country based on name patterns and keywords.

This script analyzes NL-XX-XXX-PENDING-*.yaml files and categorizes them as:
- Dutch (NL) - Should get proper GHCIDs
- Non-Dutch (various countries) - Should be moved to country-specific folders or archived
- Non-heritage - Should be archived (not heritage custodians)

NOTE: the script only analyzes and reports. It does not move any file, so
the --dry-run flag is accepted but currently makes no difference.

Usage:
    python scripts/categorize_pending_files.py --dry-run    # Analyze only
    python scripts/categorize_pending_files.py              # Same (moving is not implemented)
    python scripts/categorize_pending_files.py --custodian-dir data/custodian
"""

import os
import re
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional

# PyYAML is imported inside the functions that read/write YAML (like argparse
# in main) so the pure classification helpers stay importable without it.

# Country indicators in organization names
COUNTRY_KEYWORDS = {
    'DE': ['GERMAN', 'DEUTSCHLAND', 'BERLIN', 'MUNICH', 'MÜNCHEN', 'HAMBURG',
           'KÖLN', 'FRANKFURT', 'DRESDEN', 'STUTTGART', 'DÜSSELDORF', 'LEIPZIG',
           'STIFTUNG', 'BUNDESREPUBLIK', 'KULTURSTIFTUNG', 'BADEN-WÜRTTEMBERG',
           'BAYERN', 'SACHSEN'],
    'BE': ['BELGIUM', 'BELGIAN', 'BRUSSELS', 'BRUXELLES', 'ANTWERP', 'GHENT',
           'GENT', 'BRUGES', 'LIÈGE', 'WALLONIA', 'FLANDERS', 'VLAANDEREN'],
    'FR': ['FRANCE', 'FRENCH', 'PARIS', 'LYON', 'MARSEILLE', 'TOULOUSE',
           'BORDEAUX', 'NATIONALE', 'PATRIMOINE', 'MUSÉE', 'ONDA', 'PYRENEES',
           'CÉVENNES'],
    'GB': ['BRITISH', 'ENGLAND', 'ENGLISH', 'LONDON', 'EDINBURGH', 'SCOTLAND',
           'WALES', 'OXFORD', 'CAMBRIDGE', 'ROYAL-ACADEMY', 'ROYAL-ARMOURIES',
           'NATIONAL-ARCHIVES-UK', 'ROYAL-BOTANIC', 'NATIONAL-TRUST',
           'ENGLISH-HERITAGE'],
    'US': ['AMERICAN', 'USA', 'UNITED-STATES', 'WASHINGTON', 'NEW-YORK',
           'CALIFORNIA', 'SMITHSONIAN', 'LIBRARY-OF-CONGRESS'],
    'AT': ['AUSTRIA', 'AUSTRIAN', 'WIEN', 'VIENNA', 'SALZBURG', 'ÖSTERREICH'],
    'CH': ['SWISS', 'SWITZERLAND', 'ZÜRICH', 'ZURICH', 'BASEL', 'GENEVA',
           'GENÈVE', 'BERN'],
    'IT': ['ITALY', 'ITALIAN', 'ROME', 'ROMA', 'MILAN', 'MILANO', 'FLORENCE',
           'FIRENZE', 'VENICE', 'VENEZIA', 'NAPOLI', 'NAPLES'],
    'ES': ['SPAIN', 'SPANISH', 'MADRID', 'BARCELONA', 'SEVILLA', 'SEVILLE',
           'VALENCIA'],
    'PT': ['PORTUGAL', 'PORTUGUESE', 'LISBON', 'LISBOA', 'PORTO'],
    'DK': ['DANISH', 'DENMARK', 'COPENHAGEN', 'KØBENHAVN', 'THORVALDSEN'],
    'SE': ['SWEDISH', 'SWEDEN', 'STOCKHOLM', 'GOTHENBURG', 'GÖTEBORG', 'MALMÖ'],
    'NO': ['NORWEGIAN', 'NORWAY', 'OSLO', 'BERGEN', 'TRONDHEIM'],
    'FI': ['FINNISH', 'FINLAND', 'HELSINKI'],
    'PL': ['POLISH', 'POLAND', 'WARSAW', 'WARSZAWA', 'KRAKOW', 'KRAKÓW'],
    'CZ': ['CZECH', 'CZECHIA', 'PRAGUE', 'PRAHA'],
    'AU': ['AUSTRALIA', 'AUSTRALIAN', 'SYDNEY', 'MELBOURNE', 'CANBERRA'],
    'CA': ['CANADA', 'CANADIAN', 'TORONTO', 'MONTREAL', 'VANCOUVER', 'OTTAWA'],
    'JP': ['JAPAN', 'JAPANESE', 'TOKYO', 'KYOTO', 'OSAKA'],
    'KR': ['KOREA', 'KOREAN', 'SEOUL'],
    'CN': ['CHINA', 'CHINESE', 'BEIJING', 'SHANGHAI'],
    'TW': ['TAIWAN', 'TAIWANESE', 'TAIPEI'],
    'ID': ['INDONESIA', 'INDONESIAN', 'JAKARTA', 'PERPUSTAKAAN-NASIONAL',
           'MUSEUM-NASIONAL', 'TAMAN-SAFARI', 'GAIA-INDONESIA',
           'MUSEUM-MUSIK-INDONESIA', 'ASMAT'],
    'SG': ['SINGAPORE', 'SINGAPOREAN'],
    'MY': ['MALAYSIA', 'MALAYSIAN', 'KUALA-LUMPUR'],
    'IL': ['ISRAEL', 'ISRAELI', 'JERUSALEM', 'TEL-AVIV'],
    'AE': ['UAE', 'DUBAI', 'ABU-DHABI', 'EMIRATES'],
    'SA': ['SAUDI', 'ARABIA', 'RIYADH'],
    'ZA': ['SOUTH-AFRICA', 'SOUTH-AFRICAN', 'CAPE-TOWN', 'JOHANNESBURG'],
    'IN': ['INDIA', 'INDIAN', 'DELHI', 'MUMBAI', 'BANGALORE'],
    'RU': ['RUSSIA', 'RUSSIAN', 'MOSCOW', 'MOSKVA', 'SAINT-PETERSBURG'],
    'IE': ['IRELAND', 'IRISH', 'DUBLIN'],
    'NZ': ['NEW-ZEALAND', 'ZEALAND', 'AUCKLAND', 'WELLINGTON'],
}

# Dutch cities and keywords
DUTCH_KEYWORDS = [
    'AMSTERDAM', 'ROTTERDAM', 'HAGUE', 'DEN-HAAG', 'UTRECHT', 'EINDHOVEN',
    'GRONINGEN', 'TILBURG', 'ALMERE', 'BREDA', 'NIJMEGEN', 'ENSCHEDE',
    'HAARLEM', 'ARNHEM', 'ZAANSTAD', 'AMERSFOORT', 'APELDOORN', 'MAASTRICHT',
    'LEIDEN', 'DORDRECHT', 'ZOETERMEER', 'ZWOLLE', 'DEVENTER', 'DELFT',
    'ALKMAAR', 'GOUDA', 'HILVERSUM', 'MIDDELBURG', 'LEEUWARDEN', 'ASSEN',
    'NEDERLANDS', 'NEDERLANDER', 'NETHERLANDS', 'HOLLAND', 'HOLLANDS',
    'RIJKS', 'STADS', 'GEMEENTE', 'PROVINCIAAL',
    'NOORD-HOLLAND', 'ZUID-HOLLAND', 'NOORD-BRABANT', 'LIMBURG', 'GELDERLAND',
    'OVERIJSSEL', 'DRENTHE', 'FRIESLAND', 'FLEVOLAND', 'ZEELAND',
    'ERFGOED', 'HEEMKUNDE', 'OUDHEIDKAMER', 'HISTORISCHE-VERENIGING',
    'HISTORISCH-CENTRUM',
]

# Non-heritage organization keywords
NON_HERITAGE_KEYWORDS = [
    'WILDLIFE', 'FARMING', 'AGRICULTURAL', 'NATURE-CONSERVATION',
    'ENVIRONMENTAL', 'CLIMATE', 'SUSTAINABILITY', 'RENEWABLE',
    'HOSPITAL', 'MEDICAL', 'HEALTHCARE', 'PHARMACEUTICAL',
    'BANK', 'FINANCIAL', 'INSURANCE', 'INVESTMENT',
    'SOFTWARE', 'TECHNOLOGY', 'STARTUP', 'DIGITAL-AGENCY',
    'MARKETING', 'ADVERTISING', 'PR-AGENCY', 'CONSULTING',
    'RESTAURANT', 'HOTEL', 'CATERING', 'HOSPITALITY',
    'SPORTS', 'FITNESS', 'GYM', 'FOOTBALL', 'SOCCER',
    'POLITICAL', 'ADVOCACY', 'LOBBY', 'STANDWITHUS',
]

# International organizations (keep with NL code but flag as international)
INTERNATIONAL_ORGS = [
    'ICOM', 'ICOMOS', 'IFLA', 'ICA-', 'UNESCO', 'EUROPEANA', 'EXARC',
    'INTERNATIONAL', 'WORLD', 'GLOBAL', 'EUROPEAN-UNION', 'EU-',
]


def _keyword_matches(keyword: str, name_upper: str) -> bool:
    """Return True if *keyword* occurs in the normalized (upper, hyphenated) name.

    Keywords ending in '-' (e.g. 'EU-', 'ICA-') are abbreviations that must
    start a hyphen-delimited token; a plain substring test would otherwise
    flag every 'MUSEUM-...' name via 'EU-' and every 'AMERICA-...' via 'ICA-'.
    All other keywords keep plain substring matching so that stems such as
    'RIJKS' or 'STADS' still match inside compound words.
    """
    if keyword.endswith('-'):
        return name_upper.startswith(keyword) or f'-{keyword}' in name_upper
    return keyword in name_upper


def detect_country(name: str) -> Tuple[Optional[str], str]:
    """
    Detect likely country from organization name.

    The name is upper-cased and spaces/underscores are turned into hyphens,
    then keyword lists are tried in priority order: Dutch, international,
    non-heritage, and finally the per-country lists (dict order). The first
    matching keyword wins.

    Args:
        name: Organization name or filename stem.

    Returns:
        Tuple of (country_code, reason). country_code is an ISO-style code
        from COUNTRY_KEYWORDS, 'NL', 'INTL', 'NON_HERITAGE', or None when
        nothing matched.
    """
    # NOTE(review): apart from trailing-hyphen keywords, matching is by
    # substring, so short keywords ('BANK', 'GENT', 'USA', 'GYM', ...) can
    # still hit inside longer words - confirm whether that is intended.
    name_upper = name.upper().replace(' ', '-').replace('_', '-')

    # Check for explicit Dutch indicators first
    for keyword in DUTCH_KEYWORDS:
        if _keyword_matches(keyword, name_upper):
            return ('NL', f'Dutch keyword: {keyword}')

    # Check for international organizations
    for keyword in INTERNATIONAL_ORGS:
        if _keyword_matches(keyword, name_upper):
            return ('INTL', f'International org: {keyword}')

    # Check for non-heritage
    for keyword in NON_HERITAGE_KEYWORDS:
        if _keyword_matches(keyword, name_upper):
            return ('NON_HERITAGE', f'Non-heritage keyword: {keyword}')

    # Check other countries
    for country_code, keywords in COUNTRY_KEYWORDS.items():
        for keyword in keywords:
            if _keyword_matches(keyword, name_upper):
                return (country_code, f'{country_code} keyword: {keyword}')

    return (None, 'No country detected')


def _as_mapping(value) -> Dict:
    """Return *value* if it is a dict, otherwise an empty dict (null-safe lookup)."""
    return value if isinstance(value, dict) else {}


def _error_record(filepath: Path, filename: str, message: str) -> Dict:
    """Build a complete analysis record for a file that could not be analyzed."""
    return {
        'filepath': str(filepath),
        'filename': filename,
        'emic_name': '',
        'institution_type': 'UNKNOWN',
        'staff_count': 0,
        'detected_country': None,
        'detection_reason': f'Error: {message}',
        'error': message,
    }


def analyze_pending_file(filepath: Path) -> Dict:
    """Analyze a single PENDING file.

    Loads the YAML document, reads ``custodian_name.emic_name`` and
    ``staff.staff_list``, and runs country detection on the emic name first
    and on the filename (without the PENDING prefix) as a fallback.

    Args:
        filepath: Path to a NL-XX-XXX-PENDING-*.yaml file.

    Returns:
        A dict with filepath, filename, emic_name, institution_type,
        staff_count, detected_country and detection_reason. When the file
        cannot be read/parsed or is not a YAML mapping, the same keys are
        returned with neutral values plus an 'error' message.
    """
    import yaml

    filename = filepath.stem.replace('NL-XX-XXX-PENDING-', '')

    # Deliberately broad: one unreadable file must not abort the whole batch.
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        return _error_record(filepath, filename, str(e))

    # safe_load returns None for an empty file and may return a list/scalar.
    if not isinstance(data, dict):
        return _error_record(
            filepath, filename,
            f'Top-level YAML content is not a mapping ({type(data).__name__})',
        )

    # Keys may be present but null, so never chain .get() on raw values.
    raw_name = _as_mapping(data.get('custodian_name')).get('emic_name')
    name = str(raw_name) if raw_name else ''

    # Get staff count
    staff_list = _as_mapping(data.get('staff')).get('staff_list')
    staff_count = len(staff_list) if isinstance(staff_list, (list, dict)) else 0

    # Detect country from name and filename
    country_from_name, reason_name = detect_country(name)
    country_from_file, reason_file = detect_country(filename)

    # Prefer name-based detection
    country = country_from_name or country_from_file
    reason = reason_name if country_from_name else reason_file

    return {
        'filepath': str(filepath),
        'filename': filename,
        'emic_name': name,
        'institution_type': data.get('institution_type', 'UNKNOWN'),
        'staff_count': staff_count,
        'detected_country': country,
        'detection_reason': reason,
    }


def main():
    """Analyze all PENDING files, print a summary and write a YAML report."""
    import argparse

    import yaml

    parser = argparse.ArgumentParser(description='Categorize PENDING files by country')
    parser.add_argument('--dry-run', action='store_true',
                        help='Analyze only, no file moves '
                             '(currently always the case: moving is not implemented)')
    parser.add_argument('--output', type=Path, default=Path('pending_file_analysis.yaml'),
                        help='Output file for analysis results')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the NL-XX-XXX-PENDING-*.yaml files')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    # Find all PENDING files
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    print(f"Found {len(pending_files)} PENDING files")

    # Analyze each file
    results = []
    country_counts = defaultdict(int)
    country_staff = defaultdict(int)

    for filepath in sorted(pending_files):
        analysis = analyze_pending_file(filepath)
        results.append(analysis)

        # Undetected (None) and failed files are both reported as UNKNOWN.
        country = analysis.get('detected_country') or 'UNKNOWN'
        country_counts[country] += 1
        country_staff[country] += analysis.get('staff_count', 0)

    # Print summary
    print("\n" + "=" * 80)
    print("COUNTRY DISTRIBUTION")
    print("=" * 80)
    print(f"{'Country':<15} {'Files':>8} {'Staff':>10}")
    print("-" * 35)

    for country in sorted(country_counts, key=country_counts.get, reverse=True):
        print(f"{country:<15} {country_counts[country]:>8} {country_staff[country]:>10}")

    print("-" * 35)
    print(f"{'TOTAL':<15} {len(results):>8} {sum(country_staff.values()):>10}")

    # Save detailed results
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            yaml.dump({
                'summary': {
                    'total_files': len(results),
                    'country_counts': dict(country_counts),
                    'country_staff_counts': dict(country_staff),
                },
                'files': results,
            }, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f"\nDetailed results saved to: {args.output}")

    # List files by category
    print("\n" + "=" * 80)
    print("SAMPLE FILES BY CATEGORY")
    print("=" * 80)

    for country in ['NL', 'DE', 'BE', 'GB', 'FR', 'US', 'ID', 'INTL', 'NON_HERITAGE', 'UNKNOWN']:
        # Map None -> 'UNKNOWN' exactly as the summary does; comparing the raw
        # value would never match the 'UNKNOWN' category.
        files = [r for r in results
                 if (r.get('detected_country') or 'UNKNOWN') == country]
        if files:
            print(f"\n{country} ({len(files)} files):")
            for f in files[:5]:
                # Fall back to the filename when no emic name is available.
                label = f.get('emic_name') or f.get('filename', '')
                print(f"  - {label[:50]} ({f.get('staff_count', 0)} staff)")
            if len(files) > 5:
                print(f"  ... and {len(files) - 5} more")


if __name__ == '__main__':
    main()