#!/usr/bin/env python3
"""
Regenerate GHCIDs for Historical Institutions using GeoNames-based lookup.

This script:
1. Loads historical institutions validation YAML
2. Uses coordinates to look up the modern location (country, region, city)
   from GeoNames
3. Generates proper GHCIDs using the COUNTRY-REGION-CITY-TYPE-NAME format
4. Updates the YAML with corrected GHCIDs matching the production
   implementation

Key principle: use coordinates to find MODERN (2025) location identifiers,
not coordinate hashes.

GHCID Collision Resolution and Temporal Priority
------------------------------------------------
Collisions are handled according to temporal priority rules:

**First Batch Collision** (multiple institutions discovered simultaneously):
- ALL colliding institutions receive native-language name suffixes in
  snake_case.
- Example: if two museums in Amsterdam both generate NL-NH-AMS-M-SM, both
  receive name suffixes: NL-NH-AMS-M-SM-stedelijk_museum_amsterdam and
  NL-NH-AMS-M-SM-science_museum_amsterdam.
- Rationale: fair treatment when discovered at the same time.

**Historical Addition** (new institution collides with a published GHCID):
- ONLY the newly discovered institution receives a name suffix; the
  previously published GHCID remains unchanged (PID stability).
- Example: if NL-NH-AMS-M-HM was published 2025-11-01 and a new institution
  with the same GHCID is found 2025-11-15, only the new one becomes
  NL-NH-AMS-M-HM-historical_museum_amsterdam.
- Rationale: "Cool URIs don't change" - preserve existing citations.

Implementation notes:
- Track publication_date in provenance metadata.
- Detect collisions by comparing GHCID strings before name-suffix addition.
- Decision logic: if an existing GHCID has publication_date < current
  extraction, preserve it; only add a name suffix to the new institution.
- Log collision events in a collision registry (future enhancement).

References:
- docs/PERSISTENT_IDENTIFIERS.md - "Historical Collision Resolution"
- docs/plan/global_glam/07-ghcid-collision-resolution.md - Temporal dimension
- docs/GHCID_PID_SCHEME.md - Timeline examples

Usage:
    python scripts/regenerate_historical_ghcids.py
"""

import json
import shutil
import sqlite3
import sys
import unicodedata
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import (
    GHCIDGenerator,
    GHCIDComponents,
    InstitutionType,
    extract_abbreviation_from_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB


def _strip_accents(text: str) -> str:
    """Return *text* with combining marks (accents) removed via NFD
    decomposition. Shared by region-name and city-name normalization."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


class HistoricalGHCIDGenerator:
    """
    Generate GHCIDs for historical institutions using modern geographic
    references.

    This generator creates BASE GHCIDs (without name suffixes) by:
    1. Reverse geocoding coordinates to the modern location (city, region)
    2. Looking up city abbreviations from the GeoNames database
    3. Mapping region names to ISO 3166-2 codes
    4. Generating institution abbreviations from names

    COLLISION HANDLING
    ------------------
    This class generates deterministic base GHCIDs. Collision detection and
    name-suffix assignment happen in a separate collision resolution stage:

    1. **First Batch Collision** (institutions processed together, same
       publication_date): BOTH/ALL receive native-language name suffixes,
       e.g. NL-NH-AMS-M-SM-stedelijk_museum_amsterdam and
       NL-NH-AMS-M-SM-science_museum_amsterdam.
    2. **Historical Addition** (publication_date is AFTER the existing
       record's timestamp): ONLY the new institution receives a name suffix;
       the existing GHCID is preserved to maintain citation stability.

    PID Stability Principle
    -----------------------
    Once a GHCID is published (exported to RDF, JSON-LD, CSV) it may be cited
    in papers, external databases, linked-data triples, and finding aids.
    Per the W3C "Cool URIs Don't Change" principle, published GHCIDs MUST NOT
    be modified retroactively; historical additions get name suffixes instead.

    References:
    - docs/PERSISTENT_IDENTIFIERS.md (Historical Collision Resolution section)
    - docs/plan/global_glam/07-ghcid-collision-resolution.md (Temporal dimension)
    - docs/GHCID_PID_SCHEME.md (Timeline examples)
    """

    # ISO 3166-2 mapping file per supported ISO 3166-1 alpha-2 country code.
    _REGION_MAPPING_FILES = {
        "NL": "iso_3166_2_nl.json",
        "IT": "iso_3166_2_it.json",
        "BE": "iso_3166_2_be.json",
        "RU": "iso_3166_2_ru.json",
        "DK": "iso_3166_2_dk.json",
        "AR": "iso_3166_2_ar.json",
    }

    # Single-letter institution type code -> InstitutionType member.
    # Built once at class creation instead of per call.
    _TYPE_MAP = {
        'G': InstitutionType.GALLERY,
        'L': InstitutionType.LIBRARY,
        'A': InstitutionType.ARCHIVE,
        'M': InstitutionType.MUSEUM,
        'O': InstitutionType.OFFICIAL_INSTITUTION,
        'R': InstitutionType.RESEARCH_CENTER,
        'C': InstitutionType.CORPORATION,
        'U': InstitutionType.UNKNOWN,
        'B': InstitutionType.BOTANICAL_ZOO,
        'E': InstitutionType.EDUCATION_PROVIDER,
        'P': InstitutionType.PERSONAL_COLLECTION,
        'S': InstitutionType.COLLECTING_SOCIETY,
    }

    def __init__(self, geonames_db_path: Path):
        """
        Initialize with GeoNames database.

        Args:
            geonames_db_path: Path object to geonames.db SQLite database
        """
        self.geonames_db = GeoNamesDB(geonames_db_path)
        self.ghcid_gen = GHCIDGenerator()

        # Load ISO 3166-2 mappings for region codes.
        # BUGFIX: the original loaded the RU/DK/AR mappings twice.
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"
        self.region_mappings: Dict[str, Dict[str, str]] = {
            country: self._load_region_mapping(filename)
            for country, filename in self._REGION_MAPPING_FILES.items()
        }
        # Backward-compatible per-country attribute aliases.
        self.nl_mapping = self.region_mappings["NL"]
        self.it_mapping = self.region_mappings["IT"]
        self.be_mapping = self.region_mappings["BE"]
        self.ru_mapping = self.region_mappings["RU"]
        self.dk_mapping = self.region_mappings["DK"]
        self.ar_mapping = self.region_mappings["AR"]

        # Statistics
        self.stats = {
            'total': 0,
            'success': 0,
            'city_found': 0,
            'city_fallback': 0,
            'region_found': 0,
            'region_fallback': 0,
        }

    def _load_region_mapping(self, filename: str) -> Dict[str, str]:
        """Load an ISO 3166-2 region code mapping file.

        Returns a dict of NORMALIZED (uppercase, accent-stripped) region
        name -> region code; empty dict when the file is missing.
        """
        filepath = self.reference_dir / filename
        if not filepath.exists():
            # BUGFIX: the original printed a literal "(unknown)" here
            # instead of interpolating the missing filename.
            print(f"Warning: {filename} not found, region lookups may fail")
            return {}

        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Extract the name->code table; the JSON structure differs per file.
        if 'provinces' in data:
            # NL/AR format: {"provinces": {"Drenthe": "DR", ...}}
            province_data = data['provinces']
        elif 'regions' in data:
            # IT/DK format: {"regions": {"Lombardy": "25", ...}}
            province_data = data['regions']
        elif 'federal_subjects' in data:
            # RU format: {"federal_subjects": {"Kaliningrad Oblast": "39", ...}}
            province_data = data['federal_subjects']
        else:
            # Simple format: {"DR": "Drenthe", ...} - reverse to name -> code.
            province_data = {v: k for k, v in data.items()
                             if isinstance(v, str) and len(k) == 2}

        # Normalize names (uppercase, strip accents) for robust lookup.
        return {
            _strip_accents(name.upper()).strip(): code
            for name, code in province_data.items()
        }

    def _get_region_code(self, country: str, region_name: Optional[str]) -> str:
        """
        Get the ISO 3166-2 region code for a country/region.

        Args:
            country: ISO 3166-1 alpha-2 country code
            region_name: Region/province name (optional)

        Returns:
            2-letter region code or "00" if not found
        """
        if not region_name:
            self.stats['region_fallback'] += 1
            return "00"

        normalized = _strip_accents(region_name.upper()).strip()

        mapping = self.region_mappings.get(country)
        if mapping and normalized in mapping:
            self.stats['region_found'] += 1
            return mapping[normalized]

        self.stats['region_fallback'] += 1
        return "00"

    def _get_city_code_fallback(self, city_name: str) -> str:
        """
        Derive a 3-letter city code from a city name.

        Same logic as the Latin America script.

        Args:
            city_name: City name

        Returns:
            3-letter uppercase code (padded with 'X' when too short)
        """
        words = _strip_accents(city_name).split()

        # Defensive: an empty/blank name would otherwise raise IndexError.
        if not words:
            return "XXX"

        if len(words) == 1:
            # Single word: take first 3 letters.
            code = words[0][:3].upper()
        elif words[0].lower() in ['la', 'el', 'los', 'las', 'o', 'a',
                                  'de', 'den', 'het']:
            # City with article: first letter of article + first 2 of next
            # word. (len(words) >= 2 is guaranteed here.)
            code = (words[0][0] + words[1][:2]).upper()
        else:
            # Multi-word: first letter of each word (up to 3).
            code = ''.join(w[0] for w in words[:3]).upper()

        # Pad/truncate to exactly 3 letters.
        return code.ljust(3, 'X')[:3]

    def _reverse_geocode_with_geonames(
        self, latitude: float, longitude: float, country: str
    ) -> Optional[Dict[str, str]]:
        """
        Reverse geocode coordinates to modern location info from GeoNames.

        Args:
            latitude: Latitude coordinate
            longitude: Longitude coordinate
            country: ISO 3166-1 alpha-2 country code

        Returns:
            Dict with 'city', 'region', 'region_code', 'city_code',
            'geonames_id', 'distance_deg' - or None when no city is found.
        """
        # Nearest city by squared degree distance. NOTE: this ignores the
        # latitude-dependent longitude scale, so it is only approximate -
        # adequate for nearest-city selection within one country.
        query = """
            SELECT name, admin1_code, admin1_name, latitude, longitude, geonames_id,
                   ((latitude - ?) * (latitude - ?) +
                    (longitude - ?) * (longitude - ?)) as distance
            FROM cities
            WHERE country_code = ?
            ORDER BY distance
            LIMIT 1
        """
        with closing(sqlite3.connect(self.geonames_db.db_path)) as conn:
            row = conn.execute(
                query, (latitude, latitude, longitude, longitude, country)
            ).fetchone()

        if row is None:
            return None

        city_name, admin1, admin1_name, _lat, _lon, geonameid, distance = row

        # Prefer the curated GeoNames abbreviation; otherwise derive a
        # 3-letter code from the city name.
        city_info = self.geonames_db.lookup_city(city_name, country)
        if city_info:
            city_code = city_info.get_abbreviation()
        else:
            city_code = self._get_city_code_fallback(city_name)

        return {
            'city': city_name,
            'region': admin1_name,   # name used for ISO 3166-2 lookup
            'region_code': admin1,   # raw GeoNames admin1 code
            'city_code': city_code,
            'geonames_id': geonameid,
            'distance_deg': distance ** 0.5,  # approximate distance (degrees)
        }

    def generate_ghcid_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
        """
        Generate a GHCID for a historical institution using modern
        geographic lookup.

        **GHCID Collision Handling**: this method generates a base GHCID
        (without a name suffix). Collision detection and name-suffix
        assignment happen later, when comparing against previously published
        records:

        1. If the base GHCID is not yet published, use it as-is.
        2. If it collides: same publication_date -> First Batch, ALL get
           name suffixes; later date -> Historical Addition, ONLY the new
           institution gets a suffix and the existing GHCID is preserved.
        3. Name suffix = native-language institution name in snake_case,
           e.g. "Stedelijk Museum Amsterdam" -> "stedelijk_museum_amsterdam".

        Example scenarios:

        **Scenario 1: First Batch Collision** - a 2025-11-01 batch discovers
        "Stedelijk Museum Amsterdam" and "Science Museum Amsterdam"; both
        get suffixed GHCIDs (NL-NH-AMS-M-SM-stedelijk_museum_amsterdam /
        NL-NH-AMS-M-SM-science_museum_amsterdam).

        **Scenario 2: Historical Addition** - NL-NH-AMS-M-HM published
        2025-11-01; a colliding institution found 2025-11-15 becomes
        NL-NH-AMS-M-HM-historical_museum_amsterdam while the original GHCID
        is unchanged.

        Args:
            record: Institution record from YAML

        Returns:
            GHCIDComponents or None if generation fails
        """
        self.stats['total'] += 1

        # Extract required fields
        name = record.get('name', '')
        institution_type_str = record.get('institution_type', 'U')

        # Get location data
        locations = record.get('locations', [])
        if not locations:
            print(f"Warning: No location data for {name}")
            return None

        location = locations[0]  # Use primary location
        city = location.get('city', '')
        country = location.get('country', 'XX')
        latitude = location.get('latitude')
        longitude = location.get('longitude')

        # BUGFIX: use explicit None checks; the original `not latitude`
        # rejected valid 0.0 coordinates (equator / prime meridian).
        if latitude is None or longitude is None:
            print(f"Warning: No coordinates for {name}")
            return None

        # Reverse geocode to get modern location info
        geo_info = self._reverse_geocode_with_geonames(latitude, longitude, country)

        if geo_info:
            city_code = geo_info['city_code']
            region = geo_info['region']
            print(f" ✓ {name}: Found {geo_info['city']} (code: {city_code}, region: {region})")
            self.stats['city_found'] += 1
        else:
            # Fallback: use the city name from the record
            city_info = self.geonames_db.lookup_city(city, country)
            if city_info:
                city_code = city_info.get_abbreviation()
                region = None  # Will use fallback "00"
                print(f" ○ {name}: Using city name lookup ({city} → {city_code})")
                self.stats['city_found'] += 1
            else:
                city_code = self._get_city_code_fallback(city)
                region = None
                print(f" ○ {name}: Using fallback ({city} → {city_code})")
                self.stats['city_fallback'] += 1

        # Get region code
        region_code = self._get_region_code(country, region)

        # Map institution type: single letters via the lookup table,
        # full names via the enum itself.
        if len(institution_type_str) == 1:
            inst_type = self._TYPE_MAP.get(institution_type_str,
                                           InstitutionType.UNKNOWN)
        else:
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN

        # Generate abbreviation from name
        abbreviation = extract_abbreviation_from_name(name)

        # Generate GHCID
        components = GHCIDComponents(
            country_code=country,
            region_code=region_code,
            city_locode=city_code,
            institution_type=inst_type,
            abbreviation=abbreviation
        )

        self.stats['success'] += 1
        return components

    def print_statistics(self):
        """Print generation statistics."""
        print("\n" + "="*70)
        print("HISTORICAL GHCID REGENERATION STATISTICS")
        print("="*70)
        print(f"Total institutions processed: {self.stats['total']}")
        print(f"GHCIDs successfully generated: {self.stats['success']}")
        print(f"City codes from GeoNames: {self.stats['city_found']}")
        print(f"City codes from fallback: {self.stats['city_fallback']}")
        print(f"Region codes found: {self.stats['region_found']}")
        print(f"Region codes fallback (00): {self.stats['region_fallback']}")
        print()


def main():
    """Main execution: back up, regenerate GHCIDs in place, rewrite YAML."""
    # Paths
    project_root = Path(__file__).parent.parent
    yaml_path = project_root / "data" / "instances" / "historical_institutions_validation.yaml"
    geonames_db_path = project_root / "data" / "reference" / "geonames.db"

    print("="*70)
    print("REGENERATE HISTORICAL INSTITUTION GHCIDs")
    print("="*70)
    print(f"Input: {yaml_path}")
    print(f"GeoNames DB: {geonames_db_path}")
    print()

    # Create a timestamped backup before modifying anything.
    backup_dir = project_root / "data" / "instances" / "archive"
    backup_dir.mkdir(exist_ok=True)
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    backup_path = backup_dir / f"historical_institutions_pre_regenerate_{timestamp}.yaml"
    shutil.copy(yaml_path, backup_path)
    print(f"✓ Backup created: {backup_path}")
    print()

    # Load YAML
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} historical institutions")
    print()

    # Initialize generator
    generator = HistoricalGHCIDGenerator(geonames_db_path)

    # Process each institution
    print("Generating GHCIDs using modern geographic lookup...")
    print()

    # COLLISION DETECTION NOTE:
    # This script regenerates GHCIDs for historical validation data. In
    # production, collision detection compares against previously published
    # GHCIDs with temporal priority:
    #   1. Group institutions by base GHCID (no name suffix).
    #   2. Per collision group, compare publication_date
    #      (provenance.extraction_date): same date -> First Batch (all get
    #      name suffixes); dates differ -> Historical Addition (only new
    #      ones get name suffixes).
    #   3. Never modify GHCIDs with an earlier publication_date.
    # See docs/plan/global_glam/07-ghcid-collision-resolution.md.
    for record in institutions:
        name = record.get('name', 'Unknown')

        # Generate GHCID
        components = generator.generate_ghcid_for_institution(record)
        if not components:
            continue

        # Generate all identifier formats.
        # NAME SUFFIX COLLISION HANDLING: currently generating the base
        # GHCID without a name suffix; the production collision detector
        # adds snake_case native-language suffixes per the temporal rules
        # documented above.
        ghcid_str = components.to_string()
        ghcid_uuid = components.to_uuid()
        ghcid_uuid_sha256 = components.to_uuid_sha256()
        ghcid_numeric = components.to_numeric()
        record_id = GHCIDComponents.generate_uuid_v7()

        # Update record
        record['ghcid_current'] = ghcid_str
        record['ghcid_original'] = ghcid_str  # Same as current for regeneration
        record['ghcid_uuid'] = str(ghcid_uuid)
        record['ghcid_uuid_sha256'] = str(ghcid_uuid_sha256)
        record['ghcid_numeric'] = ghcid_numeric
        record['record_id'] = str(record_id)

        # Update GHCID history (first entry).
        # ghcid_history tracks GHCID changes over time (GHCIDHistoryEntry
        # schema). When a collision is resolved by adding a name suffix, a
        # new entry is created with valid_from set, the previous entry gets
        # valid_to, and the reason documents the collision, e.g.:
        #   - ghcid: NL-NH-AMS-M-HM-historical_museum_amsterdam
        #     valid_from: "2025-11-15T10:00:00Z"
        #     valid_to: null
        #     reason: "Name suffix added to resolve collision with existing NL-NH-AMS-M-HM"
        #   - ghcid: NL-NH-AMS-M-HM
        #     valid_from: "2025-11-15T09:00:00Z"
        #     valid_to: "2025-11-15T10:00:00Z"
        #     reason: "Historical identifier based on modern geographic location"
        if 'ghcid_history' in record and record['ghcid_history']:
            record['ghcid_history'][0]['ghcid'] = ghcid_str
            record['ghcid_history'][0]['ghcid_numeric'] = ghcid_numeric
            record['ghcid_history'][0]['reason'] = (
                "Historical identifier based on modern geographic location "
                "from GeoNames reverse geocoding"
            )

        # Update identifiers list
        if 'identifiers' not in record:
            record['identifiers'] = []

        # Remove old GHCID identifiers
        record['identifiers'] = [
            i for i in record['identifiers']
            if i.get('identifier_scheme') not in
            ['GHCID', 'GHCID_NUMERIC', 'GHCID_UUID', 'GHCID_UUID_SHA256', 'RECORD_ID']
        ]

        # Add new GHCID identifiers
        record['identifiers'].extend([
            {
                'identifier_scheme': 'GHCID',
                'identifier_value': ghcid_str
            },
            {
                'identifier_scheme': 'GHCID_NUMERIC',
                'identifier_value': str(ghcid_numeric)
            },
            {
                'identifier_scheme': 'GHCID_UUID',
                'identifier_value': str(ghcid_uuid),
                'identifier_url': f'urn:uuid:{ghcid_uuid}'
            },
            {
                'identifier_scheme': 'GHCID_UUID_SHA256',
                'identifier_value': str(ghcid_uuid_sha256),
                'identifier_url': f'urn:uuid:{ghcid_uuid_sha256}'
            },
            {
                'identifier_scheme': 'RECORD_ID',
                'identifier_value': str(record_id),
                'identifier_url': f'urn:uuid:{record_id}'
            }
        ])

    # Print statistics
    generator.print_statistics()

    # NEXT STEPS: COLLISION DETECTION AND RESOLUTION
    # ================================================
    # This script generates base GHCIDs. For production deployment:
    #   1. Collision detector: group institutions by base GHCID; identify
    #      sets of 2+ with the same base GHCID.
    #   2. Temporal priority resolver: compare publication_date
    #      (provenance.extraction_date) within each set - same timestamp ->
    #      First Batch (ALL get name suffixes); different -> Historical
    #      Addition (only NEW get name suffixes).
    #   3. Name suffix generation: native-language name to snake_case,
    #      appended to the base GHCID.
    #   4. GHCID history: new GHCIDHistoryEntry with valid_from; set
    #      valid_to on the previous entry; document the reason.
    #   5. Collision registry report: CSV (base_ghcid, collision_type,
    #      resolution_strategy, affected_institutions, publication_dates,
    #      timestamp) in data/reports/ghcid_collision_registry.csv.
    #   6. Validation: all GHCIDs unique; no published GHCIDs modified
    #      (PID stability test); history entries temporally ordered.
    # See docs/plan/global_glam/07-ghcid-collision-resolution.md and
    # docs/GHCID_PID_SCHEME.md.

    # Write updated YAML
    print(f"Writing updated YAML to: {yaml_path}")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        # Write header
        f.write("---\n")
        f.write("# Historical Heritage Institutions - GHCID Validation Examples\n")
        f.write("# Generated from Wikidata SPARQL query\n")
        f.write("# GHCIDs regenerated using modern GeoNames-based location lookup\n")
        f.write(f"# Last updated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write("#\n")
        f.write("# GHCID Format: COUNTRY-REGION-CITY-TYPE-NAME\n")
        f.write("# - City codes from GeoNames reverse geocoding (coordinates → city)\n")
        f.write("# - Region codes from ISO 3166-2 mappings\n")
        f.write("# - Same format as production implementation\n")
        f.write("\n")

        # Write institutions
        yaml.dump(institutions, f, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)

    print("✓ Done!")
    print()
    print("="*70)
    print("VALIDATION COMPLETE")
    print("="*70)


if __name__ == "__main__":
    main()