#!/usr/bin/env python3
"""
Manual enrichment for Luxembourg institutions (Phase 1 - Final Country)

Enriches Court of Justice of the European Union with:
- Wikidata Q-number (Q4951)
- VIAF identifier (124913422)
- Enhanced description
- Additional alternative names
"""

import shutil
from datetime import datetime, timezone
from pathlib import Path

UNIFIED_DATASET = Path("data/instances/all/globalglam-20251111.yaml")
BACKUP_PATH = Path("data/instances/all/globalglam-20251111.yaml.backup")

# Enrichment data for Court of Justice of the European Union.
CJEU_ENRICHMENT = {
    'id': 'EUR-CURIA0001',
    'wikidata': 'Q4951',
    'viaf': '124913422',
    'viaf_alt': '140116137',  # Alternative VIAF cluster
    'alternative_names': [
        'CJEU',
        'CJUE',
        'CURIA',
        'Court of Justice of the European Communities',
        'CJEC',
        'Gerichtshof der Europäischen Union',
        "Cour de justice de l'Union européenne",
    ],
    'description': (
        'The Court of Justice of the European Union (CJEU) is the highest '
        'judicial authority in the European Union, consisting of the Court '
        'of Justice and the General Court. Founded in 1952 as the Court of '
        'Justice of the European Communities. The CJEU ensures the uniform '
        'interpretation and application of EU law across all member states. '
        "The Court's library holds over 340,000 bibliographic records, "
        'including more than 80,000 concerning European Union law, making it '
        'one of the most complete law libraries in the world regarding EU law. '
        'Archives held at Historical Archives of the European Union (HAEU) in Florence, Italy.'
    ),
}


def load_yaml(filepath: Path) -> list:
    """Load the institution list from a YAML file.

    Args:
        filepath: Path of the YAML document to read.

    Returns:
        The top-level list of the document; an empty list when the file
        contains no document at all.

    Raises:
        ValueError: If the top-level YAML node is not a list.
    """
    # PyYAML is imported lazily so the pure enrichment logic in this module
    # can be imported (and tested) without the third-party dependency.
    import yaml

    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # safe_load returns None for an empty file; callers expect a list.
    if data is None:
        return []
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a top-level list in {filepath}, got {type(data).__name__}"
        )
    return data


def save_yaml(data: list, filepath: Path):
    """Write the institution list to a YAML file.

    Unicode is written unescaped and key order is preserved.

    Args:
        data: The list of institution records to serialise.
        filepath: Destination path; overwritten if it exists.
    """
    import yaml  # lazy for the same reason as in load_yaml

    with open(filepath, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, width=120)


def enrich_luxembourg_institutions(data: list) -> tuple[list, int]:
    """
    Enrich Luxembourg institutions with Wikidata and VIAF identifiers.

    Every record whose ``id`` matches the CJEU record is updated in place:
    the description is replaced, the alternative names are extended, the
    Wikidata and VIAF identifiers are appended when no identifier of that
    scheme is present yet, and the provenance block is stamped.

    Args:
        data: List of institution records (dicts); mutated in place.

    Returns:
        Tuple of (enriched_data, count_enriched)
    """
    enriched_count = 0
    target_id = CJEU_ENRICHMENT['id']
    wikidata_id = CJEU_ENRICHMENT['wikidata']
    viaf_id = CJEU_ENRICHMENT['viaf']
    viaf_alt_id = CJEU_ENRICHMENT['viaf_alt']

    # One clock reading so the note's date and `last_enriched` always agree.
    now = datetime.now(timezone.utc)

    for institution in data:
        if not isinstance(institution, dict) or institution.get('id') != target_id:
            continue

        print(f"Enriching: {institution.get('name', target_id)}")

        # Update description
        institution['description'] = CJEU_ENRICHMENT['description']

        # Extend (rather than overwrite) alternative names: names already on
        # the record are kept, new ones are appended in order, no duplicates.
        # Building a fresh list also avoids sharing one list object between
        # the constant and the record (which YAML would dump as an alias).
        merged_names = list(institution.get('alternative_names') or [])
        for alt_name in CJEU_ENRICHMENT['alternative_names']:
            if alt_name not in merged_names:
                merged_names.append(alt_name)
        institution['alternative_names'] = merged_names

        # The record may lack an identifier list (missing key or null value).
        identifiers = institution.get('identifiers')
        if identifiers is None:
            identifiers = institution['identifiers'] = []
        existing_schemes = {
            entry.get('identifier_scheme')
            for entry in identifiers
            if isinstance(entry, dict)
        }

        # Add Wikidata identifier
        if 'Wikidata' not in existing_schemes:
            identifiers.append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': wikidata_id,
                'identifier_url': f"https://www.wikidata.org/wiki/{wikidata_id}",
            })
            print(f" + Added Wikidata: {wikidata_id}")

        # Add VIAF identifiers (both clusters)
        if 'VIAF' not in existing_schemes:
            # Primary VIAF cluster
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_id,
                'identifier_url': f"https://viaf.org/viaf/{viaf_id}",
            })
            print(f" + Added VIAF: {viaf_id}")

            # Alternative VIAF cluster (for merged records)
            identifiers.append({
                'identifier_scheme': 'VIAF',
                'identifier_value': viaf_alt_id,
                'identifier_url': f"https://viaf.org/viaf/{viaf_alt_id}",
                'notes': 'Alternative VIAF cluster for earlier institutional form',
            })
            print(f" + Added VIAF (alt): {viaf_alt_id}")

        # Update provenance
        if institution.get('provenance') is None:
            institution['provenance'] = {}

        # `wikidata_id` already carries its "Q" prefix; do not add another.
        enrichment_note = (
            f"Wikidata {wikidata_id} and VIAF {viaf_id} "
            f"added via manual research (wikidata.org verification). Enhanced description "
            f"includes library holdings (340k+ records) and archival information (HAEU Florence). "
            f"Phase 1 final country enrichment completed {now.strftime('%Y-%m-%d')}."
        )
        institution['provenance']['enrichment_notes'] = enrichment_note
        institution['provenance']['last_enriched'] = now.isoformat()

        enriched_count += 1
        print(" ✓ Enrichment complete")

    return data, enriched_count


def main():
    """Main enrichment workflow: back up, load, enrich, save, summarise."""
    print("=" * 70)
    print("Luxembourg Institution Enrichment - Phase 1 Final Country")
    print("=" * 70)
    print()

    # Fail early with a clear message instead of a traceback from open().
    if not UNIFIED_DATASET.exists():
        raise SystemExit(f"Dataset not found: {UNIFIED_DATASET}")

    # Backup unified dataset
    print(f"Creating backup: {BACKUP_PATH}")
    shutil.copy(UNIFIED_DATASET, BACKUP_PATH)
    print("✓ Backup created")

    # Load data
    print(f"\nLoading: {UNIFIED_DATASET}")
    data = load_yaml(UNIFIED_DATASET)
    print(f"✓ Loaded {len(data):,} institutions")

    # Enrich Luxembourg institutions
    print("\n" + "-" * 70)
    print("Enriching Luxembourg Institutions")
    print("-" * 70)
    enriched_data, enriched_count = enrich_luxembourg_institutions(data)
    if enriched_count == 0:
        print(f"WARNING: no institution with id {CJEU_ENRICHMENT['id']} found")

    # Save enriched data
    print("\n" + "-" * 70)
    print(f"Saving enriched dataset: {UNIFIED_DATASET}")
    save_yaml(enriched_data, UNIFIED_DATASET)
    print(f"✓ Saved {len(enriched_data):,} institutions")

    # Summary
    print("\n" + "=" * 70)
    print("ENRICHMENT COMPLETE")
    print("=" * 70)
    print(f"Luxembourg institutions enriched: {enriched_count}")
    print(f"Total institutions in dataset: {len(enriched_data):,}")
    print()
    print("Phase 1 Complete - All 5 countries enriched:")
    print(" ✓ Georgia (GE) - 14 institutions")
    print(" ✓ Great Britain (GB) - 4 institutions")
    print(" ✓ Belgium (BE) - 7 institutions")
    print(" ✓ United States (US) - 7 institutions")
    print(" ✓ Luxembourg (LU) - 1 institution")
    print()
    print("Total Phase 1 enriched: 33 institutions")
    print("=" * 70)


if __name__ == "__main__":
    main()