Merged LinkedIn-extracted staff sections from PENDING files into their corresponding proper GHCID custodian files. This consolidates data from two extraction sources: - Existing enriched files: Google Maps, Museum Register, YouTube, etc. - PENDING files: LinkedIn staff data extraction Files modified: - 28 custodian files enriched with staff data - 35 PENDING files deleted (merged into proper locations) - Originals archived to archive/pending_duplicates_20250109/ Key institutions enriched: - Rijksmuseum (NL-NH-AMS-M-RM) - Stedelijk Museum Amsterdam (NL-NH-AMS-M-SMA) - Amsterdam Museum (NL-NH-AMS-M-AM) - Regionaal Archief Alkmaar (NL-NH-ALK-A-RAA) - Maritiem Museum Rotterdam (NL-ZH-ROT-M-MMR) - And 23 more museums/archives across NL New scripts: - scripts/merge_staff_data.py: Automated staff data merger - scripts/categorize_pending_files.py: PENDING file analysis utility
237 lines
9.9 KiB
Python
237 lines
9.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Categorize PENDING files by country based on name patterns and keywords.
|
|
|
|
This script analyzes NL-XX-XXX-PENDING-*.yaml files and categorizes them as:
|
|
- Dutch (NL) - Should get proper GHCIDs
|
|
- Non-Dutch (various countries) - Should be moved to country-specific folders or archived
|
|
- Non-heritage - Should be archived (not heritage custodians)
|
|
|
|
Usage:
|
|
python scripts/categorize_pending_files.py --dry-run # Analyze only
|
|
python scripts/categorize_pending_files.py # Analyze and write the report (moving files is not implemented in this script)
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple, Optional
|
|
|
|
# Upper-case, hyphen-separated markers (country names, demonyms, cities and a
# few institution names) that hint at a non-Dutch country when they occur in an
# organization name. Keys are two-letter country codes. Entry order matters:
# detect_country() walks this mapping in order and returns the first hit.
COUNTRY_KEYWORDS: Dict[str, List[str]] = {
    'DE': [
        'GERMAN', 'DEUTSCHLAND', 'BERLIN', 'MUNICH', 'MÜNCHEN', 'HAMBURG',
        'KÖLN', 'FRANKFURT', 'DRESDEN', 'STUTTGART', 'DÜSSELDORF', 'LEIPZIG',
        'STIFTUNG', 'BUNDESREPUBLIK', 'KULTURSTIFTUNG', 'BADEN-WÜRTTEMBERG',
        'BAYERN', 'SACHSEN',
    ],
    'BE': [
        'BELGIUM', 'BELGIAN', 'BRUSSELS', 'BRUXELLES', 'ANTWERP', 'GHENT',
        'GENT', 'BRUGES', 'LIÈGE', 'WALLONIA', 'FLANDERS', 'VLAANDEREN',
    ],
    'FR': [
        'FRANCE', 'FRENCH', 'PARIS', 'LYON', 'MARSEILLE', 'TOULOUSE',
        'BORDEAUX', 'NATIONALE', 'PATRIMOINE', 'MUSÉE', 'ONDA', 'PYRENEES',
        'CÉVENNES',
    ],
    'GB': [
        'BRITISH', 'ENGLAND', 'ENGLISH', 'LONDON', 'EDINBURGH', 'SCOTLAND',
        'WALES', 'OXFORD', 'CAMBRIDGE', 'ROYAL-ACADEMY', 'ROYAL-ARMOURIES',
        'NATIONAL-ARCHIVES-UK', 'ROYAL-BOTANIC', 'NATIONAL-TRUST',
        'ENGLISH-HERITAGE',
    ],
    'US': [
        'AMERICAN', 'USA', 'UNITED-STATES', 'WASHINGTON', 'NEW-YORK',
        'CALIFORNIA', 'SMITHSONIAN', 'LIBRARY-OF-CONGRESS',
    ],
    'AT': ['AUSTRIA', 'AUSTRIAN', 'WIEN', 'VIENNA', 'SALZBURG', 'ÖSTERREICH'],
    'CH': [
        'SWISS', 'SWITZERLAND', 'ZÜRICH', 'ZURICH', 'BASEL', 'GENEVA',
        'GENÈVE', 'BERN',
    ],
    'IT': [
        'ITALY', 'ITALIAN', 'ROME', 'ROMA', 'MILAN', 'MILANO', 'FLORENCE',
        'FIRENZE', 'VENICE', 'VENEZIA', 'NAPOLI', 'NAPLES',
    ],
    'ES': [
        'SPAIN', 'SPANISH', 'MADRID', 'BARCELONA', 'SEVILLA', 'SEVILLE',
        'VALENCIA',
    ],
    'PT': ['PORTUGAL', 'PORTUGUESE', 'LISBON', 'LISBOA', 'PORTO'],
    'DK': ['DANISH', 'DENMARK', 'COPENHAGEN', 'KØBENHAVN', 'THORVALDSEN'],
    'SE': ['SWEDISH', 'SWEDEN', 'STOCKHOLM', 'GOTHENBURG', 'GÖTEBORG', 'MALMÖ'],
    'NO': ['NORWEGIAN', 'NORWAY', 'OSLO', 'BERGEN', 'TRONDHEIM'],
    'FI': ['FINNISH', 'FINLAND', 'HELSINKI'],
    'PL': ['POLISH', 'POLAND', 'WARSAW', 'WARSZAWA', 'KRAKOW', 'KRAKÓW'],
    'CZ': ['CZECH', 'CZECHIA', 'PRAGUE', 'PRAHA'],
    'AU': ['AUSTRALIA', 'AUSTRALIAN', 'SYDNEY', 'MELBOURNE', 'CANBERRA'],
    'CA': ['CANADA', 'CANADIAN', 'TORONTO', 'MONTREAL', 'VANCOUVER', 'OTTAWA'],
    'JP': ['JAPAN', 'JAPANESE', 'TOKYO', 'KYOTO', 'OSAKA'],
    'KR': ['KOREA', 'KOREAN', 'SEOUL'],
    'CN': ['CHINA', 'CHINESE', 'BEIJING', 'SHANGHAI'],
    'TW': ['TAIWAN', 'TAIWANESE', 'TAIPEI'],
    'ID': [
        'INDONESIA', 'INDONESIAN', 'JAKARTA', 'PERPUSTAKAAN-NASIONAL',
        'MUSEUM-NASIONAL', 'TAMAN-SAFARI', 'GAIA-INDONESIA',
        'MUSEUM-MUSIK-INDONESIA', 'ASMAT',
    ],
    'SG': ['SINGAPORE', 'SINGAPOREAN'],
    'MY': ['MALAYSIA', 'MALAYSIAN', 'KUALA-LUMPUR'],
    'IL': ['ISRAEL', 'ISRAELI', 'JERUSALEM', 'TEL-AVIV'],
    'AE': ['UAE', 'DUBAI', 'ABU-DHABI', 'EMIRATES'],
    'SA': ['SAUDI', 'ARABIA', 'RIYADH'],
    'ZA': ['SOUTH-AFRICA', 'SOUTH-AFRICAN', 'CAPE-TOWN', 'JOHANNESBURG'],
    'IN': ['INDIA', 'INDIAN', 'DELHI', 'MUMBAI', 'BANGALORE'],
    'RU': ['RUSSIA', 'RUSSIAN', 'MOSCOW', 'MOSKVA', 'SAINT-PETERSBURG'],
    'IE': ['IRELAND', 'IRISH', 'DUBLIN'],
    'NZ': ['NEW-ZEALAND', 'ZEALAND', 'AUCKLAND', 'WELLINGTON'],
}
|
|
|
|
# Upper-case, hyphen-separated markers that identify an organization as Dutch.
# detect_country() checks this list before any other keyword list.
DUTCH_KEYWORDS: List[str] = [
    # Cities
    'AMSTERDAM', 'ROTTERDAM', 'HAGUE', 'DEN-HAAG', 'UTRECHT', 'EINDHOVEN',
    'GRONINGEN', 'TILBURG', 'ALMERE', 'BREDA', 'NIJMEGEN', 'ENSCHEDE',
    'HAARLEM', 'ARNHEM', 'ZAANSTAD', 'AMERSFOORT', 'APELDOORN', 'MAASTRICHT',
    'LEIDEN', 'DORDRECHT', 'ZOETERMEER', 'ZWOLLE', 'DEVENTER', 'DELFT',
    'ALKMAAR', 'GOUDA', 'HILVERSUM', 'MIDDELBURG', 'LEEUWARDEN', 'ASSEN',
    # Country names and demonyms
    'NEDERLANDS', 'NEDERLANDER', 'NETHERLANDS', 'HOLLAND', 'HOLLANDS',
    # Administrative prefixes and provinces
    'RIJKS', 'STADS', 'GEMEENTE', 'PROVINCIAAL',
    'NOORD-HOLLAND', 'ZUID-HOLLAND', 'NOORD-BRABANT', 'LIMBURG',
    'GELDERLAND', 'OVERIJSSEL', 'DRENTHE', 'FRIESLAND', 'FLEVOLAND',
    'ZEELAND',
    # Dutch heritage vocabulary
    'ERFGOED', 'HEEMKUNDE', 'OUDHEIDKAMER',
    'HISTORISCHE-VERENIGING', 'HISTORISCH-CENTRUM',
]
|
|
|
|
# Upper-case, hyphen-separated markers for organizations that are not heritage
# custodians. detect_country() reports a match as 'NON_HERITAGE'.
NON_HERITAGE_KEYWORDS: List[str] = [
    # Nature and agriculture
    'WILDLIFE', 'FARMING', 'AGRICULTURAL', 'NATURE-CONSERVATION',
    # Environment
    'ENVIRONMENTAL', 'CLIMATE', 'SUSTAINABILITY', 'RENEWABLE',
    # Health
    'HOSPITAL', 'MEDICAL', 'HEALTHCARE', 'PHARMACEUTICAL',
    # Finance
    'BANK', 'FINANCIAL', 'INSURANCE', 'INVESTMENT',
    # Tech
    'SOFTWARE', 'TECHNOLOGY', 'STARTUP', 'DIGITAL-AGENCY',
    # Marketing and consulting
    'MARKETING', 'ADVERTISING', 'PR-AGENCY', 'CONSULTING',
    # Hospitality
    'RESTAURANT', 'HOTEL', 'CATERING', 'HOSPITALITY',
    # Sports
    'SPORTS', 'FITNESS', 'GYM', 'FOOTBALL', 'SOCCER',
    # Politics
    'POLITICAL', 'ADVOCACY', 'LOBBY', 'STANDWITHUS',
]
|
|
|
|
# Markers for international bodies. Per the original note these keep their NL
# code but are flagged as international; detect_country() reports them as
# 'INTL'. The trailing hyphen on 'ICA-' and 'EU-' is part of the marker.
INTERNATIONAL_ORGS: List[str] = [
    # Named organizations and networks
    'ICOM', 'ICOMOS', 'IFLA', 'ICA-', 'UNESCO', 'EUROPEANA', 'EXARC',
    # Generic scope words
    'INTERNATIONAL', 'WORLD', 'GLOBAL', 'EUROPEAN-UNION', 'EU-',
]
|
|
|
|
|
|
def detect_country(name: str) -> Tuple[Optional[str], str]:
    """
    Detect the likely country (or category) of an organization from its name.

    The name is upper-cased and spaces/underscores are replaced by hyphens so
    that it can be compared with the hyphenated keyword lists. Keywords are
    matched as plain substrings and the first hit wins, checked in this order:
    DUTCH_KEYWORDS, INTERNATIONAL_ORGS, NON_HERITAGE_KEYWORDS, and finally
    COUNTRY_KEYWORDS in its dictionary order.

    Args:
        name: Organization name or filename slug. ``None``, non-string and
            empty values are treated as "nothing detected" instead of raising.

    Returns:
        Tuple of (country_code, reason). ``country_code`` is 'NL', 'INTL',
        'NON_HERITAGE', a key of COUNTRY_KEYWORDS, or None when nothing
        matched; ``reason`` names the keyword that matched.
    """
    # A YAML `emic_name: null` reaches this function as None; an empty name
    # cannot contain any keyword either, so both short-circuit here.
    if not isinstance(name, str) or not name:
        return (None, 'No country detected')

    name_upper = name.upper().replace(' ', '-').replace('_', '-')

    # NOTE(review): substring matching produces false positives. For example
    # 'JERUSALEM' contains 'USA', and 'US' is listed before 'IL', so such a
    # name is reported as US. Left unchanged here because compound names such
    # as 'RIJKSMUSEUM' appear to rely on substring hits -- confirm before
    # tightening to token boundaries.
    fixed_categories = (
        ('NL', 'Dutch keyword', DUTCH_KEYWORDS),
        ('INTL', 'International org', INTERNATIONAL_ORGS),
        ('NON_HERITAGE', 'Non-heritage keyword', NON_HERITAGE_KEYWORDS),
    )
    for code, label, keywords in fixed_categories:
        for keyword in keywords:
            if keyword in name_upper:
                return (code, f'{label}: {keyword}')

    # Foreign countries are consulted last, in the mapping's own order.
    for country_code, keywords in COUNTRY_KEYWORDS.items():
        for keyword in keywords:
            if keyword in name_upper:
                return (country_code, f'{country_code} keyword: {keyword}')

    return (None, 'No country detected')
|
|
|
|
|
|
def analyze_pending_file(filepath: Path) -> Dict:
    """
    Analyze a single PENDING file.

    Loads the YAML document, reads the custodian's emic name and staff list,
    and runs detect_country() on both the name and the filename slug (the
    file stem without the 'NL-XX-XXX-PENDING-' prefix). The name-based result
    is preferred; the filename is only used when the name yields nothing.

    Args:
        filepath: Path of the PENDING YAML file.

    Returns:
        On success, a dict with the keys 'filepath', 'filename', 'emic_name',
        'institution_type', 'staff_count', 'detected_country' and
        'detection_reason'. If the file cannot be read or parsed, or its top
        level is not a mapping, a dict with 'filepath' and 'error' instead.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        return {'filepath': str(filepath), 'error': str(e)}

    # safe_load() returns None for an empty document and may return a list or
    # scalar; none of those support the .get() lookups below.
    if not isinstance(data, dict):
        return {
            'filepath': str(filepath),
            'error': f'Expected a YAML mapping at top level, got {type(data).__name__}',
        }

    # Keys that are present but null (e.g. `custodian_name:`) come back as
    # None, so a .get() default alone does not protect the nested lookups.
    custodian_name = data.get('custodian_name')
    raw_name = custodian_name.get('emic_name') if isinstance(custodian_name, dict) else None
    name = '' if raw_name is None else str(raw_name)

    filename = filepath.stem.replace('NL-XX-XXX-PENDING-', '')

    # Get staff count
    staff = data.get('staff')
    staff_list = staff.get('staff_list') if isinstance(staff, dict) else None
    staff_count = len(staff_list) if isinstance(staff_list, (list, dict)) else 0

    # Detect country from name and filename
    country_from_name, reason_name = detect_country(name)
    country_from_file, reason_file = detect_country(filename)

    # Prefer name-based detection
    country = country_from_name or country_from_file
    reason = reason_name if country_from_name else reason_file

    return {
        'filepath': str(filepath),
        'filename': filename,
        'emic_name': name,
        'institution_type': data.get('institution_type', 'UNKNOWN'),
        'staff_count': staff_count,
        'detected_country': country,
        'detection_reason': reason,
    }
|
|
|
|
|
|
def main():
    """
    Analyze every PENDING file, print summaries and write a YAML report.

    Globs ``NL-XX-XXX-PENDING-*.yaml`` in the custodian directory, runs
    analyze_pending_file() on each file, prints a per-category table of file
    and staff counts, writes the detailed results to ``--output``, and prints
    up to five sample files for a fixed list of categories.

    Command-line options:
        --dry-run        Accepted, but this function never moves files, so
                         the flag currently has no effect.
        --output         Path of the YAML report
                         (default: pending_file_analysis.yaml).
        --custodian-dir  Directory to scan (defaults to the previously
                         hard-coded location).
    """
    import argparse

    parser = argparse.ArgumentParser(description='Categorize PENDING files by country')
    parser.add_argument('--dry-run', action='store_true', help='Analyze only, no file moves')
    parser.add_argument('--output', type=Path, default=Path('pending_file_analysis.yaml'),
                        help='Output file for analysis results')
    parser.add_argument('--custodian-dir', type=Path,
                        default=Path('/Users/kempersc/apps/glam/data/custodian'),
                        help='Directory containing the NL-XX-XXX-PENDING-*.yaml files')
    args = parser.parse_args()

    custodian_dir = args.custodian_dir

    # Find all PENDING files
    pending_files = list(custodian_dir.glob('NL-XX-XXX-PENDING-*.yaml'))
    print(f"Found {len(pending_files)} PENDING files")

    # Analyze each file
    results = []
    country_counts = defaultdict(int)
    country_staff = defaultdict(int)

    for filepath in sorted(pending_files):
        analysis = analyze_pending_file(filepath)
        results.append(analysis)

        # Undetected files and error entries have no country; both are
        # counted under 'UNKNOWN'.
        country = analysis.get('detected_country') or 'UNKNOWN'
        country_counts[country] += 1
        country_staff[country] += analysis.get('staff_count', 0)

    # Print summary
    print("\n" + "=" * 80)
    print("COUNTRY DISTRIBUTION")
    print("=" * 80)
    print(f"{'Country':<15} {'Files':>8} {'Staff':>10}")
    print("-" * 35)

    # Largest categories first; sorted() is stable, so ties keep insertion order.
    for country in sorted(country_counts, key=country_counts.get, reverse=True):
        print(f"{country:<15} {country_counts[country]:>8} {country_staff[country]:>10}")

    print("-" * 35)
    print(f"{'TOTAL':<15} {len(results):>8} {sum(country_staff.values()):>10}")

    # Save detailed results
    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            yaml.dump({
                'summary': {
                    'total_files': len(results),
                    'country_counts': dict(country_counts),
                    'country_staff_counts': dict(country_staff),
                },
                'files': results,
            }, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
        print(f"\nDetailed results saved to: {args.output}")

    # List files by category
    print("\n" + "=" * 80)
    print("SAMPLE FILES BY CATEGORY")
    print("=" * 80)

    for country in ['NL', 'DE', 'BE', 'GB', 'FR', 'US', 'ID', 'INTL', 'NON_HERITAGE', 'UNKNOWN']:
        # Use the same None -> 'UNKNOWN' bucketing as the summary table;
        # comparing the raw value would leave the UNKNOWN section empty.
        files = [r for r in results if (r.get('detected_country') or 'UNKNOWN') == country]
        if files:
            print(f"\n{country} ({len(files)} files):")
            for entry in files[:5]:
                # Error entries carry no 'emic_name'; fall back to whatever
                # identifies the file.
                label = (entry.get('emic_name') or entry.get('filename')
                         or entry.get('filepath') or '<unreadable file>')
                print(f" - {label[:50]} ({entry.get('staff_count', 0)} staff)")
            if len(files) > 5:
                print(f" ... and {len(files) - 5} more")


# Run the analysis only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|