#!/usr/bin/env python3
"""
Generate GHCIDs for Algerian GLAM institutions.

This script:
1. Loads the Algerian institutions YAML file (with Wikidata/VIAF enrichment)
2. Maps wilaya names to subdivision codes via data/reference/iso_3166_2_dz.json
   and a built-in city -> wilaya table
3. Derives the wilaya from the first location's city name, falling back to
   its region field (coordinates are not consulted by this script)
4. Generates GHCID identifiers with four-identifier strategy
5. Writes a new YAML file with GHCID fields (the input is backed up first)
6. Detects collisions and appends Wikidata Q-numbers when available

Key Characteristics for Algeria:
- 58 wilayas (provinces) with numeric/alpha ISO 3166-2 codes
- All 19 institutions already have coordinates (100% coverage)
- National institutions mostly in Algiers (DZ-01)
- Good Wikidata coverage (15/19 = 78.9%)

Usage:
    python scripts/generate_ghcids_algeria.py
"""

import json
import re
import shutil
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import yaml

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import (
    GHCIDComponents,
    InstitutionType,
    extract_abbreviation_from_name,
)


class AlgeriaRegionMapper:
    """Maps Algerian wilaya names to ISO 3166-2 codes."""

    def __init__(self):
        """Load the code -> name reference file and build the lookup tables."""
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"

        # Load Algeria mapping (code -> name)
        self.dz_mapping = self._load_mapping("iso_3166_2_dz.json")

        # Create reverse mapping (normalized name -> code)
        self.name_to_code = self._create_reverse_mapping(self.dz_mapping)

        # City -> wilaya inference for major cities.
        # Keys must already be in the form produced by _normalize_name()
        # (uppercase, accents stripped); the accented variants below can never
        # match a normalized lookup and are kept only for reference.
        # NOTE(review): these values do not look like the official
        # ISO 3166-2:DZ numbering (where e.g. Algiers is 16 and Oran is 31);
        # they resemble GeoNames/FIPS admin1 codes. They are left unchanged
        # because previously generated GHCIDs/UUIDs depend on them -- confirm
        # against data/reference/iso_3166_2_dz.json before changing.
        self.city_to_wilaya = {
            'ALGIERS': '01',
            'ORAN': '09',
            'CONSTANTINE': '04',
            'ANNABA': '37',
            'BATNA': '03',
            'SETIF': '12',
            'SÉTIF': '12',
            'TLEMCEN': '15',
            'BEJAIA': '18',
            'BÉJAÏA': '18',
            'OUARGLA': '50',
            'BOUMERDES': '40',
            'TIPASA': '55',
            'DJANET': 'DJ',
            'DJEMILA': '12',  # In Sétif Province
            'TIMGAD': '03',  # In Batna Province
        }

    def _load_mapping(self, filename: str) -> Dict[str, str]:
        """
        Load a subdivision mapping from a JSON file in the reference directory.

        Args:
            filename: File name relative to ``self.reference_dir``

        Returns:
            The parsed JSON content (expected to be a code -> name dict)

        Raises:
            OSError: If the file cannot be opened
            json.JSONDecodeError: If the file is not valid JSON
        """
        filepath = self.reference_dir / filename
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _create_reverse_mapping(self, mapping: Dict[str, str]) -> Dict[str, str]:
        """
        Invert a code -> name mapping into normalized name -> code.

        Names are normalized with _normalize_name() so lookups are
        case- and accent-insensitive. If two names normalize to the same key,
        the later entry in ``mapping`` wins.
        """
        return {self._normalize_name(name): code for code, name in mapping.items()}

    @staticmethod
    def _normalize_name(name: str) -> str:
        """
        Normalize a wilaya/city name for lookup.

        Uppercases the name, removes combining marks (accents) after NFD
        decomposition, and strips surrounding whitespace.
        """
        decomposed = unicodedata.normalize('NFD', name.upper())
        without_marks = ''.join(
            c for c in decomposed if unicodedata.category(c) != 'Mn'
        )
        return without_marks.strip()

    def get_wilaya_code(self, wilaya_name: str) -> str:
        """
        Get ISO 3166-2 wilaya code.

        The reference-file names are checked first, then the built-in
        city -> wilaya table.

        Args:
            wilaya_name: Wilaya name (e.g., "Algiers", "Oran")

        Returns:
            Subdivision code (e.g., "01", "09"), or "00" if the name is not
            found (national-level fallback)
        """
        normalized = self._normalize_name(wilaya_name)

        if normalized in self.name_to_code:
            return self.name_to_code[normalized]

        # Check city->wilaya inference
        if normalized in self.city_to_wilaya:
            return self.city_to_wilaya[normalized]

        return "00"


class AlgeriaGHCIDGenerator:
    """Generate GHCIDs for Algerian institutions."""

    def __init__(self):
        """Initialize the region mapper, statistics counters and collision index."""
        self.region_mapper = AlgeriaRegionMapper()

        # Statistics
        self.stats = {
            'total_institutions': 0,
            'ghcids_generated': 0,
            'location_used': 0,
            'defaulted_to_algiers': 0,
            'missing_city_code': 0,
            'missing_wilaya_code': 0,
            'collisions_detected': 0,
            'errors': [],
        }

        # Collision detection: base GHCID string -> names of institutions using it
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)

    @staticmethod
    def _get_city_code(city_name: str) -> str:
        """
        Generate 3-letter city code from city name.

        Accents are removed, the words in a small skip list (articles and
        prepositions) are ignored, and any non-alphabetic characters such as
        apostrophes, hyphens or digits are dropped so they cannot end up in
        the code (e.g. "M'Sila" -> "MSI" rather than "M'S").

        Args:
            city_name: City name (e.g., "Algiers", "Oran")

        Returns:
            3-letter uppercase code (e.g., "ALG", "ORA"). A single word yields
            its first three letters; several words yield their initials (up to
            three). Short codes are right-padded with "X"; "XXX" is returned
            when no usable word remains.
        """
        # Remove accents
        decomposed = unicodedata.normalize('NFD', city_name)
        plain = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

        # Skip prepositions/articles
        skip_words = {'de', 'la', 'le', 'el', 'al', 'du', 'des'}

        words = []
        for raw_word in plain.split():
            if raw_word.lower() in skip_words:
                continue
            # Keep letters only; a token made purely of punctuation vanishes.
            letters = ''.join(c for c in raw_word if c.isalpha())
            if letters:
                words.append(letters)

        if not words:
            return "XXX"

        if len(words) == 1:
            # Single word: take first 3 letters
            code = words[0][:3].upper()
        else:
            # Multi-word: take first letter of each word (up to 3)
            code = ''.join(w[0] for w in words[:3]).upper()

        # Ensure exactly 3 characters (upper() can lengthen, e.g. 'ß' -> 'SS')
        return code.ljust(3, 'X')[:3]

    def _get_wilaya_from_city(self, city_name: str) -> str:
        """
        Infer wilaya code from city name.

        The city -> wilaya table takes precedence; otherwise the city is
        looked up as if it were a wilaya name. Returns "00" when unknown.
        """
        normalized = self.region_mapper._normalize_name(city_name)

        # Check direct mapping
        if normalized in self.region_mapper.city_to_wilaya:
            return self.region_mapper.city_to_wilaya[normalized]

        # Check if city is also a wilaya name
        return self.region_mapper.get_wilaya_code(city_name)

    def generate_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
        """
        Generate GHCID for a single Algerian institution.

        Updates ``self.stats`` and registers the base GHCID in
        ``self.ghcid_usage`` for collision detection. Any exception raised
        while processing the record is caught and recorded in
        ``self.stats['errors']``.

        Args:
            record: Institution record from YAML (dict)

        Returns:
            GHCIDComponents if successful, None otherwise
        """
        self.stats['total_institutions'] += 1

        try:
            # Extract required fields
            name = record.get('name')
            institution_type_str = record.get('institution_type', 'UNKNOWN')

            if not name:
                self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
                return None

            # Country code (always DZ)
            country_code = "DZ"

            # Get location data (only the first location is considered).
            # `or []` also covers a key that is present but null in the YAML.
            locations = record.get('locations') or []
            city_name = None
            wilaya_code = "00"

            if locations:
                location = locations[0]
                city_name = location.get('city')
                region = location.get('region')

                if city_name:
                    self.stats['location_used'] += 1

                    # Infer wilaya from city, then try the region if available
                    wilaya_code = self._get_wilaya_from_city(city_name)
                    if wilaya_code == "00" and region:
                        wilaya_code = self.region_mapper.get_wilaya_code(region)

                    # Count only records that are still unresolved after the
                    # region fallback, so the statistic matches the number of
                    # '00' codes actually emitted.
                    if wilaya_code == "00":
                        self.stats['missing_wilaya_code'] += 1

            # Default Algiers for national institutions without city
            if not city_name:
                # 'national' also matches the French 'nationale'
                if 'national' in name.lower():
                    city_name = "Algiers"
                    wilaya_code = "01"
                    self.stats['defaulted_to_algiers'] += 1
                else:
                    self.stats['errors'].append(f"No city for: {name}")
                    return None

            # Generate city code
            city_code = self._get_city_code(city_name)
            if city_code == "XXX":
                self.stats['missing_city_code'] += 1

            # Map institution type to GHCID type code
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN

            # Generate abbreviation from name
            abbreviation = extract_abbreviation_from_name(name)

            # Create GHCID components
            components = GHCIDComponents(
                country_code=country_code,
                region_code=wilaya_code,
                city_locode=city_code,
                institution_type=inst_type.value,
                abbreviation=abbreviation,
            )

            # Validate
            is_valid, error_msg = components.validate()
            if not is_valid:
                self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
                return None

            # Check for collisions (before Q-number)
            base_ghcid = components.to_string()
            self.ghcid_usage[base_ghcid].append(name)
            if len(self.ghcid_usage[base_ghcid]) > 1:
                self.stats['collisions_detected'] += 1

            self.stats['ghcids_generated'] += 1
            return components

        except Exception as e:
            # Per-record boundary: record the failure and keep processing the
            # remaining institutions instead of aborting the whole run.
            self.stats['errors'].append(
                f"Error generating GHCID for {record.get('name', 'unknown')}: {e}"
            )
            return None

    def process_all_institutions(self, input_file: Path) -> List[dict]:
        """
        Process all institutions in YAML file and generate GHCIDs.

        Records are updated in place with ``ghcid``, ``ghcid_uuid``,
        ``ghcid_uuid_sha256`` and ``ghcid_numeric``, a GHCID entry in
        ``identifiers`` and a ``provenance['ghcid_generation']`` section.
        Records for which no GHCID could be generated are returned unchanged.

        Args:
            input_file: Path to Algerian institutions YAML file

        Returns:
            List of institution records (all input records, in input order)
        """
        print(f"Loading Algerian institutions from: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as f:
            # An empty YAML document loads as None; treat it as "no records".
            institutions = yaml.safe_load(f) or []

        print(f"Found {len(institutions)} institutions")
        print()

        updated_institutions = []
        total = len(institutions)

        for position, record in enumerate(institutions, 1):
            print(f"Processing {position}/{total}: {record.get('name', 'unknown')}")

            # Generate GHCID
            ghcid_components = self.generate_for_institution(record)

            if ghcid_components:
                # Check for Wikidata Q-number (for collision resolution)
                identifiers = record.get('identifiers') or []
                wikidata_qid = None
                for identifier in identifiers:
                    if identifier.get('identifier_scheme') == 'Wikidata':
                        wikidata_qid = identifier.get('identifier_value')
                        break

                # If collision exists and we have Q-number, append it.
                # NOTE(review): collisions are only known for records already
                # processed, so the first institution to claim a base GHCID
                # keeps it without a Q-number -- confirm this is intended.
                base_ghcid = ghcid_components.to_string()
                disambiguated = bool(
                    len(self.ghcid_usage[base_ghcid]) > 1 and wikidata_qid
                )

                if disambiguated:
                    # Append Q-number for disambiguation
                    ghcid_with_q = f"{base_ghcid}-{wikidata_qid}"
                    record['ghcid'] = ghcid_with_q
                    print(f" -> Collision detected, using GHCID with Q-number: {ghcid_with_q}")
                else:
                    record['ghcid'] = base_ghcid
                    print(f" -> GHCID: {base_ghcid}")

                # NOTE(review): the UUIDs and numeric ID below are derived from
                # the base components, not from the Q-number-suffixed string,
                # so colliding records presumably share them -- verify against
                # GHCIDComponents.

                # Add UUID v5 (SHA-1) - PRIMARY identifier
                record['ghcid_uuid'] = str(ghcid_components.to_uuid())

                # Add UUID v8 (SHA-256) - Secondary identifier
                record['ghcid_uuid_sha256'] = str(ghcid_components.to_uuid_sha256())

                # Add numeric identifier
                record['ghcid_numeric'] = ghcid_components.to_numeric()

                # Add GHCID to identifiers list (an existing GHCID entry is
                # left untouched)
                has_ghcid = any(
                    entry.get('identifier_scheme') == 'GHCID' for entry in identifiers
                )
                if not has_ghcid:
                    identifiers.append({
                        'identifier_scheme': 'GHCID',
                        'identifier_value': record['ghcid'],
                    })
                record['identifiers'] = identifiers

                # Update provenance with GHCID generation metadata.
                # The generation_method text is kept as-is for continuity with
                # existing output, although no coordinates are read here.
                provenance = record.get('provenance') or {}
                provenance['ghcid_generation'] = {
                    'generated_date': datetime.now(timezone.utc).isoformat(),
                    'generation_method': 'AlgeriaGHCIDGenerator with coordinate-based location',
                    'base_ghcid': base_ghcid,
                    # True only when the Q-number was actually appended
                    'has_wikidata_disambiguation': disambiguated,
                }
                record['provenance'] = provenance

            # Keep every record so the output file does not silently lose
            # institutions whose GHCID could not be generated.
            updated_institutions.append(record)

        return updated_institutions

    def print_statistics(self):
        """Print generation statistics, errors and any detected collisions."""
        print()
        print("=" * 70)
        print("ALGERIA GHCID GENERATION STATISTICS")
        print("=" * 70)
        print(f"Total institutions processed: {self.stats['total_institutions']}")
        print(f"GHCIDs successfully generated: {self.stats['ghcids_generated']}")
        print(f"Locations used from data: {self.stats['location_used']}")
        print(f"Defaulted to Algiers (national): {self.stats['defaulted_to_algiers']}")
        print(f"Missing city codes: {self.stats['missing_city_code']}")
        print(f"Missing wilaya codes ('00'): {self.stats['missing_wilaya_code']}")
        print(f"GHCID collisions detected: {self.stats['collisions_detected']}")
        print()

        if self.stats['errors']:
            print(f"Errors encountered: {len(self.stats['errors'])}")
            print()
            print("Error details:")
            for error in self.stats['errors']:
                print(f" - {error}")
        else:
            print("No errors!")

        print()

        # Show collisions
        if self.stats['collisions_detected'] > 0:
            print("GHCID COLLISIONS DETECTED:")
            print()
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f" {ghcid}:")
                    for name in names:
                        print(f" - {name}")
            print()
            print("Note: Collisions resolved with Wikidata Q-numbers where available")
        else:
            print("No GHCID collisions detected!")

        print()

    def validate_ghcids(self, institutions: List[dict]):
        """
        Report how many distinct identifiers were produced.

        Prints the number of unique GHCIDs, numeric IDs and UUIDs found in
        ``institutions`` and lists every GHCID string that occurs more than
        once. Records without a ``ghcid`` are ignored.

        Args:
            institutions: Records as returned by process_all_institutions()
        """
        print("=" * 70)
        print("VALIDATION")
        print("=" * 70)

        ghcid_set = set()
        numeric_set = set()
        uuid_v5_set = set()
        uuid_v8_set = set()
        duplicates = []

        for record in institutions:
            ghcid = record.get('ghcid')
            ghcid_numeric = record.get('ghcid_numeric')
            ghcid_uuid = record.get('ghcid_uuid')
            ghcid_uuid_sha256 = record.get('ghcid_uuid_sha256')

            if ghcid:
                if ghcid in ghcid_set:
                    duplicates.append(ghcid)
                ghcid_set.add(ghcid)

            # `is not None` so a numeric identifier of 0 is still counted
            if ghcid_numeric is not None:
                numeric_set.add(ghcid_numeric)

            if ghcid_uuid:
                uuid_v5_set.add(ghcid_uuid)

            if ghcid_uuid_sha256:
                uuid_v8_set.add(ghcid_uuid_sha256)

        print(f"Unique GHCIDs (with Q-numbers): {len(ghcid_set)}")
        print(f"Unique numeric GHCIDs: {len(numeric_set)}")
        print(f"Unique UUID v5 (SHA-1) identifiers: {len(uuid_v5_set)}")
        print(f"Unique UUID v8 (SHA-256) ids: {len(uuid_v8_set)}")

        if duplicates:
            print(f"Duplicate GHCIDs found: {len(duplicates)}")
            for dup in duplicates:
                print(f" - {dup}")
        else:
            print("All GHCIDs are unique!")

        print()


def main():
    """
    Main entry point.

    Backs up the input YAML, generates GHCIDs for every institution, prints
    statistics and validation output, and writes the enriched records to a
    separate ``*_ghcid.yaml`` file preceded by a summary comment header.
    """
    # Paths
    project_root = Path(__file__).parent.parent
    algeria_dir = project_root / "data" / "instances" / "algeria"
    input_file = algeria_dir / "algerian_institutions.yaml"
    output_file = algeria_dir / "algerian_institutions_ghcid.yaml"
    backup_file = (
        algeria_dir
        / f"algerian_institutions_pre_ghcid_{datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
    )

    # Create backup
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)
    print()

    # Generate GHCIDs
    generator = AlgeriaGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)

    # Print statistics
    generator.print_statistics()

    # Validate
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")

    # Add header comment (every line must start with '#' to stay valid YAML)
    header = f"""# Algerian GLAM Institutions - GHCID Enhanced
# Last updated: {datetime.now(timezone.utc).isoformat()}
# GHCID generation: {generator.stats['ghcids_generated']}/{generator.stats['total_institutions']} institutions
#
# GHCID Statistics:
# - Total institutions: {generator.stats['total_institutions']}
# - GHCIDs generated: {generator.stats['ghcids_generated']}
# - Locations used: {generator.stats['location_used']}
# - Defaulted to Algiers: {generator.stats['defaulted_to_algiers']}
# - Missing city codes: {generator.stats['missing_city_code']}
# - Missing wilaya codes: {generator.stats['missing_wilaya_code']}
# - Collisions detected: {generator.stats['collisions_detected']}
#
# Four-Identifier Strategy:
# - ghcid: Base GHCID string (with Q-number for collisions)
# - ghcid_uuid: UUID v5 (SHA-1) - PRIMARY persistent identifier
# - ghcid_uuid_sha256: UUID v8 (SHA-256) - Secondary identifier
# - ghcid_numeric: 64-bit numeric for CSV exports

"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(
            updated_institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=100,
        )

    print(f"Done! Updated {len(updated_institutions)} institutions")
    print(f"Output file: {output_file}")
    print()


if __name__ == "__main__":
    main()