glam/scripts/categorize_pending_files.py
kempersc 1f723fd5d7 feat(data): merge staff data from 35 PENDING files into enriched custodians
Merged LinkedIn-extracted staff sections from PENDING files into their
corresponding proper GHCID custodian files. This consolidates data from
two extraction sources:
- Existing enriched files: Google Maps, Museum Register, YouTube, etc.
- PENDING files: LinkedIn staff data extraction

Files modified:
- 28 custodian files enriched with staff data
- 35 PENDING files deleted (merged into proper locations)
- Originals archived to archive/pending_duplicates_20250109/

Key institutions enriched:
- Rijksmuseum (NL-NH-AMS-M-RM)
- Stedelijk Museum Amsterdam (NL-NH-AMS-M-SMA)
- Amsterdam Museum (NL-NH-AMS-M-AM)
- Regionaal Archief Alkmaar (NL-NH-ALK-A-RAA)
- Maritiem Museum Rotterdam (NL-ZH-ROT-M-MMR)
- And 23 more museums/archives across NL

New scripts:
- scripts/merge_staff_data.py: Automated staff data merger
- scripts/categorize_pending_files.py: PENDING file analysis utility
2026-01-09 14:51:17 +01:00

237 lines
9.9 KiB
Python

#!/usr/bin/env python3
"""
Categorize PENDING files by country based on name patterns and keywords.
This script analyzes NL-XX-XXX-PENDING-*.yaml files and categorizes them as:
- Dutch (NL) - Should get proper GHCIDs
- Non-Dutch (various countries) - Should be moved to country-specific folders or archived
- Non-heritage - Should be archived (not heritage custodians)
Usage:
python scripts/categorize_pending_files.py --dry-run # Analyze only
python scripts/categorize_pending_files.py # Analyze and write the report (moving files is not implemented in this script)
"""
import os
import re
import yaml
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Tuple, Optional
# Country indicators in organization names.
# Maps a country code to upper-case, hyphen-separated keywords. detect_country()
# upper-cases the name, turns spaces/underscores into hyphens, and tests each
# keyword with a plain substring check, returning the first hit.
# Order matters: countries are scanned in insertion order and only after the
# Dutch, international and non-heritage lists found nothing, so an earlier
# entry wins when several keywords match.
# NOTE(review): because matching is by substring, short keywords also hit
# inside longer words -- e.g. 'USA' is contained in 'JERUSALEM' (and 'US'
# precedes 'IL'), 'GENT' in 'AGENT', 'ONDA' in 'FONDATION'. Confirm that this
# is acceptable for the data set.
COUNTRY_KEYWORDS = {
'DE': ['GERMAN', 'DEUTSCHLAND', 'BERLIN', 'MUNICH', 'MÜNCHEN', 'HAMBURG', 'KÖLN',
'FRANKFURT', 'DRESDEN', 'STUTTGART', 'DÜSSELDORF', 'LEIPZIG', 'STIFTUNG',
'BUNDESREPUBLIK', 'KULTURSTIFTUNG', 'BADEN-WÜRTTEMBERG', 'BAYERN', 'SACHSEN'],
'BE': ['BELGIUM', 'BELGIAN', 'BRUSSELS', 'BRUXELLES', 'ANTWERP', 'GHENT', 'GENT',
'BRUGES', 'LIÈGE', 'WALLONIA', 'FLANDERS', 'VLAANDEREN'],
'FR': ['FRANCE', 'FRENCH', 'PARIS', 'LYON', 'MARSEILLE', 'TOULOUSE', 'BORDEAUX',
'NATIONALE', 'PATRIMOINE', 'MUSÉE', 'ONDA', 'PYRENEES', 'CÉVENNES'],
'GB': ['BRITISH', 'ENGLAND', 'ENGLISH', 'LONDON', 'EDINBURGH', 'SCOTLAND', 'WALES',
'OXFORD', 'CAMBRIDGE', 'ROYAL-ACADEMY', 'ROYAL-ARMOURIES', 'NATIONAL-ARCHIVES-UK',
'ROYAL-BOTANIC', 'NATIONAL-TRUST', 'ENGLISH-HERITAGE'],
'US': ['AMERICAN', 'USA', 'UNITED-STATES', 'WASHINGTON', 'NEW-YORK', 'CALIFORNIA',
'SMITHSONIAN', 'LIBRARY-OF-CONGRESS'],
'AT': ['AUSTRIA', 'AUSTRIAN', 'WIEN', 'VIENNA', 'SALZBURG', 'ÖSTERREICH'],
'CH': ['SWISS', 'SWITZERLAND', 'ZÜRICH', 'ZURICH', 'BASEL', 'GENEVA', 'GENÈVE', 'BERN'],
'IT': ['ITALY', 'ITALIAN', 'ROME', 'ROMA', 'MILAN', 'MILANO', 'FLORENCE', 'FIRENZE',
'VENICE', 'VENEZIA', 'NAPOLI', 'NAPLES'],
'ES': ['SPAIN', 'SPANISH', 'MADRID', 'BARCELONA', 'SEVILLA', 'SEVILLE', 'VALENCIA'],
'PT': ['PORTUGAL', 'PORTUGUESE', 'LISBON', 'LISBOA', 'PORTO'],
'DK': ['DANISH', 'DENMARK', 'COPENHAGEN', 'KØBENHAVN', 'THORVALDSEN'],
'SE': ['SWEDISH', 'SWEDEN', 'STOCKHOLM', 'GOTHENBURG', 'GÖTEBORG', 'MALMÖ'],
'NO': ['NORWEGIAN', 'NORWAY', 'OSLO', 'BERGEN', 'TRONDHEIM'],
'FI': ['FINNISH', 'FINLAND', 'HELSINKI'],
'PL': ['POLISH', 'POLAND', 'WARSAW', 'WARSZAWA', 'KRAKOW', 'KRAKÓW'],
'CZ': ['CZECH', 'CZECHIA', 'PRAGUE', 'PRAHA'],
'AU': ['AUSTRALIA', 'AUSTRALIAN', 'SYDNEY', 'MELBOURNE', 'CANBERRA'],
'CA': ['CANADA', 'CANADIAN', 'TORONTO', 'MONTREAL', 'VANCOUVER', 'OTTAWA'],
'JP': ['JAPAN', 'JAPANESE', 'TOKYO', 'KYOTO', 'OSAKA'],
'KR': ['KOREA', 'KOREAN', 'SEOUL'],
'CN': ['CHINA', 'CHINESE', 'BEIJING', 'SHANGHAI'],
'TW': ['TAIWAN', 'TAIWANESE', 'TAIPEI'],
'ID': ['INDONESIA', 'INDONESIAN', 'JAKARTA', 'PERPUSTAKAAN-NASIONAL', 'MUSEUM-NASIONAL',
'TAMAN-SAFARI', 'GAIA-INDONESIA', 'MUSEUM-MUSIK-INDONESIA', 'ASMAT'],
'SG': ['SINGAPORE', 'SINGAPOREAN'],
'MY': ['MALAYSIA', 'MALAYSIAN', 'KUALA-LUMPUR'],
'IL': ['ISRAEL', 'ISRAELI', 'JERUSALEM', 'TEL-AVIV'],
'AE': ['UAE', 'DUBAI', 'ABU-DHABI', 'EMIRATES'],
'SA': ['SAUDI', 'ARABIA', 'RIYADH'],
'ZA': ['SOUTH-AFRICA', 'SOUTH-AFRICAN', 'CAPE-TOWN', 'JOHANNESBURG'],
'IN': ['INDIA', 'INDIAN', 'DELHI', 'MUMBAI', 'BANGALORE'],
'RU': ['RUSSIA', 'RUSSIAN', 'MOSCOW', 'MOSKVA', 'SAINT-PETERSBURG'],
'IE': ['IRELAND', 'IRISH', 'DUBLIN'],
'NZ': ['NEW-ZEALAND', 'ZEALAND', 'AUCKLAND', 'WELLINGTON'],
}
# Dutch cities and keywords.
# detect_country() scans this list before any other, so a name containing one
# of these substrings is reported as 'NL' even if it also contains a foreign,
# international or non-heritage keyword.
# Entries such as 'RIJKS' and 'STADS' rely on the substring match to hit
# longer compound words that contain them.
DUTCH_KEYWORDS = [
'AMSTERDAM', 'ROTTERDAM', 'HAGUE', 'DEN-HAAG', 'UTRECHT', 'EINDHOVEN',
'GRONINGEN', 'TILBURG', 'ALMERE', 'BREDA', 'NIJMEGEN', 'ENSCHEDE',
'HAARLEM', 'ARNHEM', 'ZAANSTAD', 'AMERSFOORT', 'APELDOORN', 'MAASTRICHT',
'LEIDEN', 'DORDRECHT', 'ZOETERMEER', 'ZWOLLE', 'DEVENTER', 'DELFT',
'ALKMAAR', 'GOUDA', 'HILVERSUM', 'MIDDELBURG', 'LEEUWARDEN', 'ASSEN',
'NEDERLANDS', 'NEDERLANDER', 'NETHERLANDS', 'HOLLAND', 'HOLLANDS',
'RIJKS', 'STADS', 'GEMEENTE', 'PROVINCIAAL', 'NOORD-HOLLAND', 'ZUID-HOLLAND',
'NOORD-BRABANT', 'LIMBURG', 'GELDERLAND', 'OVERIJSSEL', 'DRENTHE',
'FRIESLAND', 'FLEVOLAND', 'ZEELAND', 'ERFGOED', 'HEEMKUNDE', 'OUDHEIDKAMER',
'HISTORISCHE-VERENIGING', 'HISTORISCH-CENTRUM',
]
# Non-heritage organization keywords.
# A match makes detect_country() return the pseudo-code 'NON_HERITAGE'. This
# list is consulted after the Dutch and international lists and before the
# per-country keywords.
# NOTE(review): matching is by substring, so short entries such as 'GYM' or
# 'BANK' also match inside longer words -- confirm against real names.
NON_HERITAGE_KEYWORDS = [
'WILDLIFE', 'FARMING', 'AGRICULTURAL', 'NATURE-CONSERVATION',
'ENVIRONMENTAL', 'CLIMATE', 'SUSTAINABILITY', 'RENEWABLE',
'HOSPITAL', 'MEDICAL', 'HEALTHCARE', 'PHARMACEUTICAL',
'BANK', 'FINANCIAL', 'INSURANCE', 'INVESTMENT',
'SOFTWARE', 'TECHNOLOGY', 'STARTUP', 'DIGITAL-AGENCY',
'MARKETING', 'ADVERTISING', 'PR-AGENCY', 'CONSULTING',
'RESTAURANT', 'HOTEL', 'CATERING', 'HOSPITALITY',
'SPORTS', 'FITNESS', 'GYM', 'FOOTBALL', 'SOCCER',
'POLITICAL', 'ADVOCACY', 'LOBBY', 'STANDWITHUS',
]
# International organizations.
# A match makes detect_country() return the pseudo-code 'INTL'; this list is
# consulted right after the Dutch keywords.
# NOTE(review): the original comment said these are kept "with NL code but
# flagged as international"; nothing in this script assigns an NL code to
# them -- confirm where (or whether) that happens.
# The trailing hyphen on 'ICA-' and 'EU-' requires a separator after the
# acronym, but the substring match still hits e.g. 'AMERICA-...'.
INTERNATIONAL_ORGS = [
'ICOM', 'ICOMOS', 'IFLA', 'ICA-', 'UNESCO', 'EUROPEANA', 'EXARC',
'INTERNATIONAL', 'WORLD', 'GLOBAL', 'EUROPEAN-UNION', 'EU-',
]
def detect_country(name: Optional[str]) -> Tuple[Optional[str], str]:
    """
    Detect the likely country (or pseudo-category) from an organization name.

    The name is upper-cased and spaces/underscores are turned into hyphens,
    then each keyword list is tried with a plain substring test. The first
    keyword found decides the result; lists are tried in this order:
    Dutch keywords, international organizations, non-heritage keywords,
    then the per-country keywords in their dictionary order.

    Args:
        name: Organization name or filename fragment. ``None`` or an empty
            string is accepted and yields "no country detected".

    Returns:
        Tuple of (country_code, reason). ``country_code`` is a key of
        COUNTRY_KEYWORDS, ``'NL'``, ``'INTL'``, ``'NON_HERITAGE'`` or
        ``None`` when nothing matched; ``reason`` names the matching keyword.
    """
    if not name:
        # A missing name (e.g. a null emic_name in the YAML) cannot match
        # anything; bail out instead of failing on None.upper().
        return (None, 'No country detected')
    if not isinstance(name, str):
        # YAML scalars may load as numbers or dates; compare their text form.
        name = str(name)

    name_upper = name.upper().replace(' ', '-').replace('_', '-')

    # Fixed categories, highest priority first. Dutch indicators must stay
    # first so that they override every other keyword list.
    fixed_categories = (
        ('NL', 'Dutch keyword', DUTCH_KEYWORDS),
        ('INTL', 'International org', INTERNATIONAL_ORGS),
        ('NON_HERITAGE', 'Non-heritage keyword', NON_HERITAGE_KEYWORDS),
    )
    for code, label, keywords in fixed_categories:
        for keyword in keywords:
            if keyword in name_upper:
                return (code, f'{label}: {keyword}')

    # Check other countries (dictionary order decides ties)
    for country_code, keywords in COUNTRY_KEYWORDS.items():
        for keyword in keywords:
            if keyword in name_upper:
                return (country_code, f'{country_code} keyword: {keyword}')

    return (None, 'No country detected')
def analyze_pending_file(filepath: Path) -> Dict:
    """
    Analyze a single PENDING file.

    Loads the YAML document, reads the emic name and staff list, and runs
    detect_country() on both the emic name and the filename (with the
    'NL-XX-XXX-PENDING-' prefix removed). The name-based result is preferred;
    the filename-based result is the fallback.

    Args:
        filepath: Path to a PENDING YAML file.

    Returns:
        On success, a dict with the keys 'filepath', 'filename', 'emic_name',
        'institution_type', 'staff_count', 'detected_country' and
        'detection_reason'. When the file cannot be read or parsed, or its
        top level is not a mapping, a dict with 'error', 'filepath' and
        'filename' so the report identifies the failing file.
    """
    filename = filepath.stem.replace('NL-XX-XXX-PENDING-', '')

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the batch.
        return {'error': str(e), 'filepath': str(filepath), 'filename': filename}

    if not isinstance(data, dict):
        # Empty files load as None and list documents as list; neither has
        # the keys read below.
        return {
            'error': f'Expected a YAML mapping at top level, got {type(data).__name__}',
            'filepath': str(filepath),
            'filename': filename,
        }

    # Sections may be present but null in the YAML, so check their type
    # instead of relying on .get() defaults.
    custodian_name = data.get('custodian_name')
    name = custodian_name.get('emic_name') if isinstance(custodian_name, dict) else None
    name = '' if name is None else str(name)

    # Get staff count
    staff = data.get('staff')
    staff_list = staff.get('staff_list') if isinstance(staff, dict) else None
    staff_count = len(staff_list) if isinstance(staff_list, (list, dict)) else 0

    # Detect country from name and filename
    country_from_name, reason_name = detect_country(name)
    country_from_file, reason_file = detect_country(filename)

    # Prefer name-based detection
    country = country_from_name or country_from_file
    reason = reason_name if country_from_name else reason_file

    return {
        'filepath': str(filepath),
        'filename': filename,
        'emic_name': name,
        'institution_type': data.get('institution_type', 'UNKNOWN'),
        'staff_count': staff_count,
        'detected_country': country,
        'detection_reason': reason,
    }
def main():
    """
    Analyze every PENDING custodian file and report the detected country mix.

    Prints a per-country table (file count and staff count), writes the
    per-file results to the ``--output`` YAML file, and prints up to five
    sample files for a fixed list of categories.

    NOTE(review): ``--dry-run`` is parsed but never read; in both modes this
    function only analyzes and reports -- no files are moved.
    """
    import argparse

    parser = argparse.ArgumentParser(description='Categorize PENDING files by country')
    parser.add_argument('--dry-run', action='store_true', help='Analyze only, no file moves')
    parser.add_argument('--output', type=Path, default=Path('pending_file_analysis.yaml'),
                        help='Output file for analysis results')
    # The default keeps the previously hard-coded location; the option lets
    # the script run on other machines and checkouts.
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the NL-XX-XXX-PENDING-*.yaml files')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    # Find all PENDING files
    pending_files = sorted(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    print(f"Found {len(pending_files)} PENDING files")

    # Analyze each file. Records without a detected country (including files
    # that failed to load) are counted under 'UNKNOWN'.
    results = []
    country_counts = defaultdict(int)
    country_staff = defaultdict(int)
    by_country = defaultdict(list)
    for filepath in pending_files:
        analysis = analyze_pending_file(filepath)
        results.append(analysis)
        country = analysis.get('detected_country') or 'UNKNOWN'
        country_counts[country] += 1
        country_staff[country] += analysis.get('staff_count', 0)
        by_country[country].append(analysis)

    # Print summary
    print("\n" + "=" * 80)
    print("COUNTRY DISTRIBUTION")
    print("=" * 80)
    print(f"{'Country':<15} {'Files':>8} {'Staff':>10}")
    print("-" * 35)
    for country in sorted(country_counts, key=country_counts.get, reverse=True):
        print(f"{country:<15} {country_counts[country]:>8} {country_staff[country]:>10}")
    print("-" * 35)
    print(f"{'TOTAL':<15} {len(results):>8} {sum(country_staff.values()):>10}")

    # Save detailed results
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            yaml.dump({
                'summary': {
                    'total_files': len(results),
                    'country_counts': dict(country_counts),
                    'country_staff_counts': dict(country_staff),
                },
                'files': results,
            }, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f"\nDetailed results saved to: {args.output}")

    # List files by category
    print("\n" + "=" * 80)
    print("SAMPLE FILES BY CATEGORY")
    print("=" * 80)
    for country in ['NL', 'DE', 'BE', 'GB', 'FR', 'US', 'ID', 'INTL', 'NON_HERITAGE', 'UNKNOWN']:
        # Use the same grouping as the summary: comparing the raw
        # 'detected_country' (None when undetected) against 'UNKNOWN' never
        # matched, so that category was silently left out.
        files = by_country.get(country)
        if files:
            print(f"\n{country} ({len(files)} files):")
            for record in files[:5]:
                # Error records carry no emic_name/staff_count; fall back
                # rather than raising KeyError.
                label = record.get('emic_name') or record.get('filename') or '(no name)'
                print(f" - {str(label)[:50]} ({record.get('staff_count', 0)} staff)")
            if len(files) > 5:
                print(f" ... and {len(files) - 5} more")
# Run the analysis only when executed as a script, not when imported.
if __name__ == '__main__':
main()