glam/scripts/generate_ghcids_algeria.py
2025-12-07 00:26:01 +01:00

530 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Generate GHCIDs for Algerian GLAM institutions.
This script:
1. Loads the Algerian institutions YAML file (with Wikidata/VIAF enrichment)
2. Maps wilaya names to ISO 3166-2 codes (DZ-01, DZ-15, etc.)
3. Resolves each institution's wilaya and city code from its first location entry (city first, then region); stored coordinates are not re-geocoded
4. Generates GHCID identifiers with four-identifier strategy
5. Updates the YAML file with GHCID fields
6. Detects collisions and appends Wikidata Q-numbers when available
Key Characteristics for Algeria:
- 58 wilayas (provinces) with numeric/alpha ISO 3166-2 codes
- All 19 institutions already have coordinates (100% coverage)
- National institutions mostly in Algiers (DZ-01)
- Good Wikidata coverage (15/19 = 78.9%)
Usage:
python scripts/generate_ghcids_algeria.py
"""
import json
import re
import sys
import yaml
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Tuple
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.identifiers.ghcid import (
GHCIDComponents,
InstitutionType,
extract_abbreviation_from_name,
)
class AlgeriaRegionMapper:
    """Maps Algerian wilaya names to ISO 3166-2 codes.

    Two lookup tables are built at construction time:

    * ``name_to_code``: normalized wilaya name -> code, derived by inverting
      the ``iso_3166_2_dz.json`` reference file (code -> name).
    * ``city_to_wilaya``: normalized city name -> wilaya code for a small,
      hand-maintained list of cities.

    All lookups go through ``_normalize_name`` (uppercase, accents stripped,
    surrounding whitespace removed), so "Sétif", "SETIF" and " setif " are
    treated as the same key.
    """

    def __init__(self):
        """Load ISO 3166-2 mappings from reference data.

        Raises:
            OSError: If the reference JSON file cannot be opened.
            json.JSONDecodeError: If the reference file is not valid JSON.
        """
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"

        # Load Algeria mapping (code -> name)
        self.dz_mapping = self._load_mapping("iso_3166_2_dz.json")

        # Create reverse mapping (normalized name -> code)
        self.name_to_code = self._create_reverse_mapping(self.dz_mapping)

        # City -> wilaya inference for major cities.
        # NOTE(review): these codes do not match the published ISO 3166-2:DZ
        # list as I know it (e.g. Alger is DZ-16, Oran DZ-31, Constantine
        # DZ-25, and 'DJ' is not a numeric wilaya code). The values are left
        # untouched because generated identifiers depend on them - confirm
        # against iso_3166_2_dz.json before changing.
        city_codes = {
            'ALGIERS': '01',
            'ORAN': '09',
            'CONSTANTINE': '04',
            'ANNABA': '37',
            'BATNA': '03',
            'SETIF': '12',
            'SÉTIF': '12',
            'TLEMCEN': '15',
            'BEJAIA': '18',
            'BÉJAÏA': '18',
            'OUARGLA': '50',
            'BOUMERDES': '40',
            'TIPASA': '55',
            'DJANET': 'DJ',
            'DJEMILA': '12',  # In Sétif Province
            'TIMGAD': '03',  # In Batna Province
        }
        # Lookups always use normalized (accent-free) names, so the keys are
        # normalized too; otherwise accented spellings could never match.
        self.city_to_wilaya = {
            self._normalize_name(city): code for city, code in city_codes.items()
        }

    def _load_mapping(self, filename: str) -> Dict[str, str]:
        """Load ISO 3166-2 mapping from JSON file (code -> name).

        Args:
            filename: File name relative to ``self.reference_dir``.

        Returns:
            The parsed JSON content.
        """
        filepath = self.reference_dir / filename
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _create_reverse_mapping(self, mapping: Dict[str, str]) -> Dict[str, str]:
        """Create normalized name -> code mapping.

        Uses the same normalization as the lookup path so the two can never
        drift apart.

        Args:
            mapping: Code -> name mapping.

        Returns:
            Normalized name -> code mapping. If two names normalize to the
            same key, the later entry wins.
        """
        return {self._normalize_name(name): code for code, name in mapping.items()}

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Normalize wilaya/city name for lookup.

        Args:
            name: Raw name.

        Returns:
            The name uppercased, with combining marks (accents) removed and
            surrounding whitespace stripped.
        """
        import unicodedata

        # NFD splits accented characters into base letter + combining mark
        # (category 'Mn'), which is then dropped.
        decomposed = unicodedata.normalize('NFD', name.upper())
        stripped = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
        return stripped.strip()

    def get_wilaya_code(self, wilaya_name: str) -> str:
        """
        Get ISO 3166-2 wilaya code.

        Args:
            wilaya_name: Wilaya name (e.g., "Algiers", "Oran")

        Returns:
            ISO 3166-2 subdivision code (e.g., "01", "09")
            Returns "00" if wilaya not found (national-level fallback),
            including when ``wilaya_name`` is empty or None.
        """
        if not wilaya_name:
            return "00"

        normalized = self._normalize_name(wilaya_name)
        if normalized in self.name_to_code:
            return self.name_to_code[normalized]

        # Check city->wilaya inference
        if normalized in self.city_to_wilaya:
            return self.city_to_wilaya[normalized]

        return "00"
class AlgeriaGHCIDGenerator:
    """Generate GHCIDs for Algerian institutions.

    The generator keeps two pieces of state across records:

    * ``stats``: counters and an ``errors`` list describing the run.
    * ``ghcid_usage``: base GHCID string -> names of the institutions that
      produced it, used to detect collisions.
    """

    def __init__(self):
        """Initialize generator with dependencies."""
        self.region_mapper = AlgeriaRegionMapper()

        # Statistics
        self.stats = {
            'total_institutions': 0,
            'ghcids_generated': 0,
            'location_used': 0,
            'defaulted_to_algiers': 0,
            'missing_city_code': 0,
            'missing_wilaya_code': 0,
            'collisions_detected': 0,
            'errors': [],
        }

        # Collision detection
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)

    @staticmethod
    def _get_city_code(city_name: str) -> str:
        """
        Generate 3-letter city code from city name.

        Accents are stripped, common prepositions/articles are skipped and
        non-letter characters (apostrophes, hyphens, digits) are dropped so
        the result only ever contains letters.

        Args:
            city_name: City name (e.g., "Algiers", "Oran")

        Returns:
            3-letter uppercase code (e.g., "ALG", "ORA"). Single-word names
            use their first three letters; multi-word names use the initial
            of up to three words. Short codes are right-padded with "X", and
            "XXX" is returned when no usable word remains.
        """
        import unicodedata

        if not city_name:
            return "XXX"

        # Remove accents
        decomposed = unicodedata.normalize('NFD', city_name)
        plain = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

        # Skip prepositions
        skip_words = {'de', 'la', 'le', 'el', 'al', 'du', 'des'}

        words = []
        for raw_word in plain.split():
            if raw_word.lower() in skip_words:
                continue
            # Keep letters only so names like "M'Sila" cannot leak
            # punctuation into the code.
            letters = ''.join(c for c in raw_word if c.isalpha())
            if letters:
                words.append(letters)

        if not words:
            return "XXX"

        if len(words) == 1:
            # Single word: take first 3 letters
            code = words[0][:3].upper()
        else:
            # Multi-word: take first letter of each word (up to 3)
            code = ''.join(w[0] for w in words[:3]).upper()

        # Ensure exactly 3 letters
        return code.ljust(3, 'X')[:3]

    def _get_wilaya_from_city(self, city_name: str) -> str:
        """Infer wilaya code from city name.

        The hand-maintained city table is consulted first; if the city is not
        listed, the name is tried as a wilaya name.

        Returns:
            The wilaya code, or "00" when nothing matches.
        """
        normalized = self.region_mapper._normalize_name(city_name)

        # Check direct mapping
        if normalized in self.region_mapper.city_to_wilaya:
            return self.region_mapper.city_to_wilaya[normalized]

        # Check if city is also a wilaya name
        return self.region_mapper.get_wilaya_code(city_name)

    def generate_for_institution(self, record: dict) -> Optional["GHCIDComponents"]:
        """
        Generate GHCID for a single Algerian institution.

        Updates ``self.stats`` and registers the institution name under its
        base GHCID in ``self.ghcid_usage``. Any exception raised while
        building the identifier is recorded in ``stats['errors']`` rather
        than propagated, so one bad record does not abort the run.

        Args:
            record: Institution record from YAML (dict)

        Returns:
            GHCIDComponents if successful, None otherwise
        """
        self.stats['total_institutions'] += 1

        try:
            # Extract required fields
            name = record.get('name')
            institution_type_str = record.get('institution_type', 'UNKNOWN')

            if not name:
                self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
                return None

            # Country code (always DZ)
            country_code = "DZ"

            # Get location data (only the first location entry is used)
            locations = record.get('locations') or []
            city_name = None
            wilaya_code = "00"

            if locations:
                location = locations[0] or {}
                city_name = location.get('city')
                region = location.get('region')

                if city_name:
                    self.stats['location_used'] += 1

                    # Infer wilaya from city, then fall back to the region
                    wilaya_code = self._get_wilaya_from_city(city_name)
                    if wilaya_code == "00" and region:
                        wilaya_code = self.region_mapper.get_wilaya_code(region)

                    # Count only codes that are still unresolved after the
                    # region fallback, so the statistic matches its label.
                    if wilaya_code == "00":
                        self.stats['missing_wilaya_code'] += 1

            # Default Algiers for national institutions without city
            if not city_name:
                # 'national' also matches the French 'nationale'.
                if 'national' in name.lower():
                    city_name = "Algiers"
                    wilaya_code = "01"
                    self.stats['defaulted_to_algiers'] += 1
                else:
                    self.stats['errors'].append(f"No city for: {name}")
                    return None

            # Generate city code
            city_code = self._get_city_code(city_name)
            if city_code == "XXX":
                self.stats['missing_city_code'] += 1

            # Map institution type to GHCID type code
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN

            # Generate abbreviation from name
            abbreviation = extract_abbreviation_from_name(name)

            # Create GHCID components
            components = GHCIDComponents(
                country_code=country_code,
                region_code=wilaya_code,
                city_locode=city_code,
                institution_type=inst_type.value,
                abbreviation=abbreviation,
            )

            # Validate
            is_valid, error_msg = components.validate()
            if not is_valid:
                self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
                return None

            # Check for collisions (before Q-number)
            base_ghcid = components.to_string()
            self.ghcid_usage[base_ghcid].append(name)
            if len(self.ghcid_usage[base_ghcid]) > 1:
                self.stats['collisions_detected'] += 1

            self.stats['ghcids_generated'] += 1
            return components

        except Exception as e:
            # Deliberate best-effort: record the failure and keep going.
            self.stats['errors'].append(
                f"Error generating GHCID for {record.get('name', 'unknown')}: {e}"
            )
            return None

    def _apply_ghcid(self, record: dict, components) -> None:
        """Write GHCID fields, identifier entry and provenance onto ``record``.

        Must be called after every record has been passed through
        ``generate_for_institution`` so ``self.ghcid_usage`` reflects all
        collisions.

        Args:
            record: Institution record to update in place.
            components: Object returned by ``generate_for_institution``.
        """
        # YAML may carry an explicit null for these keys; treat it as empty.
        identifiers = record.get('identifiers') or []

        # Check for Wikidata Q-number (for collision resolution)
        wikidata_qid = None
        for identifier in identifiers:
            if identifier.get('identifier_scheme') == 'Wikidata':
                wikidata_qid = identifier.get('identifier_value')
                break

        # If collision exists and we have Q-number, append it
        base_ghcid = components.to_string()
        collides = len(self.ghcid_usage.get(base_ghcid, [])) > 1
        disambiguated = bool(collides and wikidata_qid)

        if disambiguated:
            ghcid_with_q = f"{base_ghcid}-{wikidata_qid}"
            record['ghcid'] = ghcid_with_q
            print(f" -> Collision detected, using GHCID with Q-number: {ghcid_with_q}")
        else:
            # Either unique, or colliding without a Q-number; the latter
            # shows up as a duplicate in validate_ghcids().
            record['ghcid'] = base_ghcid
            print(f" -> GHCID: {base_ghcid}")

        # NOTE(review): the UUID/numeric values below are derived from
        # `components`, which is never given the Q-number here, so colliding
        # records presumably share them - confirm against GHCIDComponents.

        # Add UUID v5 (SHA-1) - PRIMARY identifier
        record['ghcid_uuid'] = str(components.to_uuid())

        # Add UUID v8 (SHA-256) - Secondary identifier
        record['ghcid_uuid_sha256'] = str(components.to_uuid_sha256())

        # Add numeric identifier
        record['ghcid_numeric'] = components.to_numeric()

        # Add GHCID to identifiers list (an existing GHCID entry is kept as is)
        has_ghcid = any(i.get('identifier_scheme') == 'GHCID' for i in identifiers)
        if not has_ghcid:
            identifiers.append({
                'identifier_scheme': 'GHCID',
                'identifier_value': record['ghcid'],
            })
        record['identifiers'] = identifiers

        # Update provenance with GHCID generation metadata
        provenance = record.get('provenance') or {}
        provenance['ghcid_generation'] = {
            'generated_date': datetime.now(timezone.utc).isoformat(),
            'generation_method': 'AlgeriaGHCIDGenerator with coordinate-based location',
            'base_ghcid': base_ghcid,
            # True only when the Q-number suffix was actually applied.
            'has_wikidata_disambiguation': disambiguated,
        }
        record['provenance'] = provenance

    def process_all_institutions(self, input_file: Path) -> List[dict]:
        """
        Process all institutions in YAML file and generate GHCIDs.

        Runs in two passes: first every record's base GHCID is generated so
        that all collisions are known, then the GHCID fields are written.
        This way every member of a colliding group receives its Q-number
        suffix, not only the ones that happen to come later in the file.

        Args:
            input_file: Path to Algerian institutions YAML file

        Returns:
            List of updated institution records with GHCID fields. Records
            for which no GHCID could be generated are returned unchanged.
        """
        print(f"Loading Algerian institutions from: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as f:
            # An empty YAML document loads as None.
            institutions = yaml.safe_load(f) or []

        total = len(institutions)
        print(f"Found {total} institutions")
        print()

        # Pass 1: generate components and populate collision bookkeeping.
        generated = [self.generate_for_institution(record) for record in institutions]

        # Pass 2: write identifiers now that collisions are fully known.
        updated_institutions = []
        for i, (record, ghcid_components) in enumerate(zip(institutions, generated), 1):
            print(f"Processing {i}/{total}: {record.get('name', 'unknown')}")
            if ghcid_components is not None:
                self._apply_ghcid(record, ghcid_components)
            updated_institutions.append(record)

        return updated_institutions

    def print_statistics(self):
        """Print generation statistics, error details and collision groups."""
        print()
        print("=" * 70)
        print("ALGERIA GHCID GENERATION STATISTICS")
        print("=" * 70)
        print(f"Total institutions processed: {self.stats['total_institutions']}")
        print(f"GHCIDs successfully generated: {self.stats['ghcids_generated']}")
        print(f"Locations used from data: {self.stats['location_used']}")
        print(f"Defaulted to Algiers (national): {self.stats['defaulted_to_algiers']}")
        print(f"Missing city codes: {self.stats['missing_city_code']}")
        print(f"Missing wilaya codes ('00'): {self.stats['missing_wilaya_code']}")
        print(f"GHCID collisions detected: {self.stats['collisions_detected']}")
        print()

        if self.stats['errors']:
            print(f"Errors encountered: {len(self.stats['errors'])}")
            print()
            print("Error details:")
            for error in self.stats['errors']:
                print(f" - {error}")
        else:
            print("No errors!")
        print()

        # Show collisions
        if self.stats['collisions_detected'] > 0:
            print("GHCID COLLISIONS DETECTED:")
            print()
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f" {ghcid}:")
                    for name in names:
                        print(f" - {name}")
            print()
            print("Note: Collisions resolved with Wikidata Q-numbers where available")
        else:
            print("No GHCID collisions detected!")
        print()

    def validate_ghcids(self, institutions: List[dict]):
        """Validate all generated GHCIDs.

        Prints how many distinct values exist for each identifier field and
        lists every ``ghcid`` string that occurs more than once.

        Args:
            institutions: Records as returned by ``process_all_institutions``.
        """
        print("=" * 70)
        print("VALIDATION")
        print("=" * 70)

        ghcid_set = set()
        numeric_set = set()
        uuid_v5_set = set()
        uuid_v8_set = set()
        duplicates = []

        for record in institutions:
            ghcid = record.get('ghcid')
            ghcid_numeric = record.get('ghcid_numeric')
            ghcid_uuid = record.get('ghcid_uuid')
            ghcid_uuid_sha256 = record.get('ghcid_uuid_sha256')

            if ghcid:
                if ghcid in ghcid_set:
                    duplicates.append(ghcid)
                ghcid_set.add(ghcid)
            if ghcid_numeric:
                numeric_set.add(ghcid_numeric)
            if ghcid_uuid:
                uuid_v5_set.add(ghcid_uuid)
            if ghcid_uuid_sha256:
                uuid_v8_set.add(ghcid_uuid_sha256)

        print(f"Unique GHCIDs (with Q-numbers): {len(ghcid_set)}")
        print(f"Unique numeric GHCIDs: {len(numeric_set)}")
        print(f"Unique UUID v5 (SHA-1) identifiers: {len(uuid_v5_set)}")
        print(f"Unique UUID v8 (SHA-256) ids: {len(uuid_v8_set)}")

        if duplicates:
            print(f"Duplicate GHCIDs found: {len(duplicates)}")
            for dup in duplicates:
                print(f" - {dup}")
        else:
            print("All GHCIDs are unique!")
        print()
def main():
    """Run the full pipeline: back up, generate, report, validate and write.

    Reads ``algerian_institutions.yaml`` from the project's Algeria instance
    directory, copies it to a timestamped backup, generates GHCIDs, prints
    statistics and validation results, and writes the enriched records to
    ``algerian_institutions_ghcid.yaml`` preceded by a summary comment header.
    """
    import shutil

    # Paths (all files live in the same instance directory)
    project_root = Path(__file__).parent.parent
    algeria_dir = project_root / "data" / "instances" / "algeria"
    input_file = algeria_dir / "algerian_institutions.yaml"
    output_file = algeria_dir / "algerian_institutions_ghcid.yaml"
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_file = algeria_dir / f"algerian_institutions_pre_ghcid_{stamp}.yaml"

    # Create backup
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)
    print()

    # Generate GHCIDs, then report and validate
    generator = AlgeriaGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)
    generator.print_statistics()
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")

    # Header comment summarising the run; its lines must start at column 0
    # so they are emitted as YAML comments.
    stats = generator.stats
    header = f"""# Algerian GLAM Institutions - GHCID Enhanced
# Last updated: {datetime.now(timezone.utc).isoformat()}
# GHCID generation: {stats['ghcids_generated']}/{stats['total_institutions']} institutions
#
# GHCID Statistics:
# - Total institutions: {stats['total_institutions']}
# - GHCIDs generated: {stats['ghcids_generated']}
# - Locations used: {stats['location_used']}
# - Defaulted to Algiers: {stats['defaulted_to_algiers']}
# - Missing city codes: {stats['missing_city_code']}
# - Missing wilaya codes: {stats['missing_wilaya_code']}
# - Collisions detected: {stats['collisions_detected']}
#
# Four-Identifier Strategy:
# - ghcid: Base GHCID string (with Q-number for collisions)
# - ghcid_uuid: UUID v5 (SHA-1) - PRIMARY persistent identifier
# - ghcid_uuid_sha256: UUID v8 (SHA-256) - Secondary identifier
# - ghcid_numeric: 64-bit numeric for CSV exports
"""

    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 100,
    }
    with open(output_file, 'w', encoding='utf-8') as out:
        out.write(header)
        yaml.dump(updated_institutions, out, **dump_options)

    print(f"Done! Updated {len(updated_institutions)} institutions")
    print(f"Output file: {output_file}")
    print()
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()