530 lines
19 KiB
Python
530 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Generate GHCIDs for Algerian GLAM institutions.

This script:
1. Loads the Algerian institutions YAML file (with Wikidata/VIAF enrichment)
2. Maps wilaya names to ISO 3166-2 codes (DZ-01, DZ-15, etc.)
3. Resolves each institution's wilaya from the city (falling back to the
   region) of its first listed location; stored coordinates are not read here
4. Generates GHCID identifiers with four-identifier strategy
5. Backs up the input file and writes the enriched records, including the
   GHCID fields, to a separate algerian_institutions_ghcid.yaml file
6. Detects collisions and appends Wikidata Q-numbers when available

Key Characteristics for Algeria:
- 58 wilayas (provinces) with numeric/alpha ISO 3166-2 codes
- All 19 institutions already have coordinates (100% coverage)
- National institutions mostly in Algiers (DZ-01)
- Good Wikidata coverage (15/19 = 78.9%)

Usage:
    python scripts/generate_ghcids_algeria.py
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import yaml
|
|
from collections import defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Tuple
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.identifiers.ghcid import (
|
|
GHCIDComponents,
|
|
InstitutionType,
|
|
extract_abbreviation_from_name,
|
|
)
|
|
|
|
|
|
class AlgeriaRegionMapper:
    """Map Algerian wilaya (province) and city names to subdivision codes.

    Two lookup tables are kept, both keyed by *normalised* names (upper-case,
    combining accents removed, whitespace collapsed):

    * ``name_to_code``   - built by inverting the ``code -> name`` JSON
      reference file ``iso_3166_2_dz.json``.
    * ``city_to_wilaya`` - a small hand-maintained table for major cities and
      heritage sites whose name differs from their wilaya's name.
    """

    # Hand-maintained city -> wilaya code table. Keys are normalised in
    # __init__, so accented spellings listed here behave like their
    # unaccented twins instead of being unreachable.
    #
    # NOTE(review): these numbers look like FIPS 10-4 region codes
    # (Alger=01, Oran=09, Constantine=04) rather than ISO 3166-2:DZ
    # (Alger=16, Oran=31, Constantine=25), and 'DJ' is not numeric at all.
    # Confirm they agree with the codes used in iso_3166_2_dz.json before
    # relying on them; values are deliberately left unchanged here because
    # they feed persistent identifiers.
    _CITY_TO_WILAYA = {
        'ALGIERS': '01',
        'ORAN': '09',
        'CONSTANTINE': '04',
        'ANNABA': '37',
        'BATNA': '03',
        'SETIF': '12',
        'SÉTIF': '12',
        'TLEMCEN': '15',
        'BEJAIA': '18',
        'BÉJAÏA': '18',
        'OUARGLA': '50',
        'BOUMERDES': '40',
        'TIPASA': '55',
        'DJANET': 'DJ',
        'DJEMILA': '12',  # In Sétif Province
        'TIMGAD': '03',  # In Batna Province
    }

    def __init__(self):
        """Load the reference mapping from disk and build both lookup tables."""
        self.reference_dir = Path(__file__).parent.parent / "data" / "reference"

        # Reference data as stored on disk: code -> wilaya name.
        self.dz_mapping = self._load_mapping("iso_3166_2_dz.json")

        # Inverted view used for lookups: normalised wilaya name -> code.
        self.name_to_code = self._create_reverse_mapping(self.dz_mapping)

        # City -> wilaya inference, keyed exactly like the lookups are.
        self.city_to_wilaya = {
            self._normalize_name(city): code
            for city, code in self._CITY_TO_WILAYA.items()
        }

    def _load_mapping(self, filename: str) -> Dict[str, str]:
        """Read a ``code -> name`` JSON mapping from the reference directory.

        Args:
            filename: File name relative to ``self.reference_dir``.

        Returns:
            The parsed JSON document.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        filepath = self.reference_dir / filename
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)

    def _create_reverse_mapping(self, mapping: Dict[str, str]) -> Dict[str, str]:
        """Invert ``code -> name`` into ``normalised name -> code``.

        Uses the same normaliser as the lookups so the stored keys and the
        queried keys cannot drift apart. If two names normalise to the same
        key, the entry that appears later in ``mapping`` wins.
        """
        return {self._normalize_name(name): code for code, name in mapping.items()}

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Return the lookup key for a wilaya or city name.

        The name is upper-cased, decomposed (NFD) so that combining accents
        can be dropped, and whitespace is trimmed and collapsed to single
        spaces. Empty or ``None`` input yields an empty string.
        """
        import unicodedata

        if not name:
            return ""

        decomposed = unicodedata.normalize('NFD', str(name).upper())
        without_accents = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )
        return ' '.join(without_accents.split())

    def get_wilaya_code(self, wilaya_name: str) -> str:
        """Look up the subdivision code for a wilaya (or known city) name.

        Args:
            wilaya_name: Wilaya name (e.g., "Algiers", "Oran"). Case, accents
                and surrounding whitespace are ignored.

        Returns:
            The code from the reference file if the name is a wilaya name,
            otherwise the code from the city table, otherwise "00"
            (national-level fallback). Empty or ``None`` input also gives "00".
        """
        normalized = self._normalize_name(wilaya_name)
        if not normalized:
            return "00"

        # The reference file takes precedence over the city table.
        if normalized in self.name_to_code:
            return self.name_to_code[normalized]

        # Fall back to city -> wilaya inference.
        if normalized in self.city_to_wilaya:
            return self.city_to_wilaya[normalized]

        return "00"
|
|
|
|
|
|
class AlgeriaGHCIDGenerator:
    """Generate GHCIDs for Algerian institutions.

    Typical use is ``process_all_institutions`` followed by
    ``print_statistics`` and ``validate_ghcids``. Counters and error messages
    accumulate in ``self.stats``; ``self.ghcid_usage`` maps every base GHCID
    string to the names of the institutions that produced it, which is how
    collisions are detected.
    """

    # Articles/prepositions ignored when abbreviating a city name.
    _CITY_SKIP_WORDS = frozenset({'de', 'la', 'le', 'el', 'al', 'du', 'des'})

    def __init__(self):
        """Create the region mapper and zero all statistics."""
        self.region_mapper = AlgeriaRegionMapper()

        # Statistics
        self.stats = {
            'total_institutions': 0,
            'ghcids_generated': 0,
            'location_used': 0,
            'defaulted_to_algiers': 0,
            'missing_city_code': 0,
            'missing_wilaya_code': 0,
            'collisions_detected': 0,
            'errors': [],
        }

        # Collision detection: base GHCID -> names of institutions using it.
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)

    @staticmethod
    def _get_city_code(city_name: str) -> str:
        """Derive a 3-letter upper-case code from a city name.

        Accents are removed, the name is split on whitespace and hyphens,
        non-letter characters (apostrophes, digits, ...) are dropped and
        articles/prepositions such as "el" or "de" are skipped. A single
        remaining word contributes its first three letters; several words
        contribute their initials (at most three). Short results are padded
        with "X".

        Args:
            city_name: City name (e.g., "Algiers", "Oran").

        Returns:
            3-letter uppercase code (e.g., "ALG", "ORA"), or "XXX" when no
            usable letters remain.
        """
        import unicodedata

        decomposed = unicodedata.normalize('NFD', city_name or '')
        without_accents = ''.join(
            ch for ch in decomposed if unicodedata.category(ch) != 'Mn'
        )

        words = []
        for token in re.split(r"[\s\-]+", without_accents):
            # Keep ASCII letters only so the code never contains punctuation.
            letters = re.sub(r"[^A-Za-z]", "", token)
            if letters and letters.lower() not in AlgeriaGHCIDGenerator._CITY_SKIP_WORDS:
                words.append(letters)

        if not words:
            return "XXX"

        if len(words) == 1:
            code = words[0][:3]
        else:
            code = ''.join(word[0] for word in words[:3])

        # Both branches yield at most three letters; pad the short ones.
        return code.upper().ljust(3, 'X')

    def _get_wilaya_from_city(self, city_name: str) -> str:
        """Infer the wilaya code from a city name.

        The hand-maintained city table is consulted first; otherwise the city
        name is tried as a wilaya name. Returns "00" when neither matches.
        """
        normalized = self.region_mapper._normalize_name(city_name)

        # Direct city -> wilaya mapping takes precedence.
        if normalized in self.region_mapper.city_to_wilaya:
            return self.region_mapper.city_to_wilaya[normalized]

        # Many cities share their wilaya's name.
        return self.region_mapper.get_wilaya_code(city_name)

    def generate_for_institution(self, record: dict) -> Optional[GHCIDComponents]:
        """Generate the GHCID components for a single institution record.

        Only the first entry of ``record['locations']`` is consulted. When it
        has no city, institutions whose name contains "national" are placed
        in Algiers; any other record is rejected. Every failure is recorded
        in ``self.stats['errors']`` rather than raised.

        Args:
            record: Institution record from YAML (dict).

        Returns:
            GHCIDComponents if successful, None otherwise.
        """
        self.stats['total_institutions'] += 1

        try:
            name = record.get('name')
            institution_type_str = record.get('institution_type', 'UNKNOWN')

            if not name:
                self.stats['errors'].append(f"Missing name for record: {record.get('id')}")
                return None

            # Country code (always DZ)
            country_code = "DZ"

            # `or` guards against keys that are present but null in the YAML.
            locations = record.get('locations') or []
            first_location = (locations[0] if locations else None) or {}
            city_name = first_location.get('city')
            region = first_location.get('region')
            wilaya_code = "00"

            if city_name:
                self.stats['location_used'] += 1
                wilaya_code = self._get_wilaya_from_city(city_name)
                if wilaya_code == "00" and region:
                    wilaya_code = self.region_mapper.get_wilaya_code(region)
                # Count as missing only once every fallback has been tried.
                if wilaya_code == "00":
                    self.stats['missing_wilaya_code'] += 1
            elif 'national' in name.lower():
                # Also covers the French "nationale".
                city_name = "Algiers"
                wilaya_code = "01"
                self.stats['defaulted_to_algiers'] += 1
            else:
                self.stats['errors'].append(f"No city for: {name}")
                return None

            city_code = self._get_city_code(city_name)
            if city_code == "XXX":
                self.stats['missing_city_code'] += 1

            # Unknown type names fall back to InstitutionType.UNKNOWN.
            try:
                inst_type = InstitutionType[institution_type_str]
            except KeyError:
                inst_type = InstitutionType.UNKNOWN

            abbreviation = extract_abbreviation_from_name(name)

            components = GHCIDComponents(
                country_code=country_code,
                region_code=wilaya_code,
                city_locode=city_code,
                institution_type=inst_type.value,
                abbreviation=abbreviation,
            )

            is_valid, error_msg = components.validate()
            if not is_valid:
                self.stats['errors'].append(f"Invalid GHCID for {name}: {error_msg}")
                return None

            # Track usage of the base GHCID (before any Q-number suffix).
            base_ghcid = components.to_string()
            self.ghcid_usage[base_ghcid].append(name)
            if len(self.ghcid_usage[base_ghcid]) > 1:
                self.stats['collisions_detected'] += 1

            self.stats['ghcids_generated'] += 1
            return components

        except Exception as e:
            # Best-effort batch job: record the failure and keep going.
            self.stats['errors'].append(f"Error generating GHCID for {record.get('name', 'unknown')}: {e}")
            return None

    def _attach_identifiers(self, record: dict, components) -> None:
        """Write the GHCID fields, identifier entry and provenance into *record*.

        Must be called only after every record has been through
        ``generate_for_institution`` so that ``self.ghcid_usage`` knows about
        all collisions.
        """
        identifiers = record.get('identifiers') or []

        # First Wikidata identifier, if any (used for collision resolution).
        wikidata_qid = next(
            (
                entry.get('identifier_value')
                for entry in identifiers
                if entry.get('identifier_scheme') == 'Wikidata'
            ),
            None,
        )

        base_ghcid = components.to_string()
        is_collision = len(self.ghcid_usage.get(base_ghcid, ())) > 1
        use_qid = bool(is_collision and wikidata_qid)

        if use_qid:
            ghcid_with_q = f"{base_ghcid}-{wikidata_qid}"
            record['ghcid'] = ghcid_with_q
            print(f" -> Collision detected, using GHCID with Q-number: {ghcid_with_q}")
        else:
            record['ghcid'] = base_ghcid
            print(f" -> GHCID: {base_ghcid}")

        # NOTE(review): the three derived identifiers below come from
        # `components`, which as far as this file shows does not carry the
        # Q-number suffix - colliding institutions would then share them.
        # Confirm against GHCIDComponents.
        record['ghcid_uuid'] = str(components.to_uuid())  # UUID v5 (SHA-1) - PRIMARY
        record['ghcid_uuid_sha256'] = str(components.to_uuid_sha256())  # UUID v8 (SHA-256)
        record['ghcid_numeric'] = components.to_numeric()

        # Add GHCID to the identifiers list unless one is already present.
        if not any(entry.get('identifier_scheme') == 'GHCID' for entry in identifiers):
            identifiers.append({
                'identifier_scheme': 'GHCID',
                'identifier_value': record['ghcid'],
            })
        record['identifiers'] = identifiers

        provenance = record.get('provenance') or {}
        provenance['ghcid_generation'] = {
            'generated_date': datetime.now(timezone.utc).isoformat(),
            'generation_method': 'AlgeriaGHCIDGenerator with coordinate-based location',
            'base_ghcid': base_ghcid,
            # True only when the Q-number was actually appended.
            'has_wikidata_disambiguation': use_qid,
        }
        record['provenance'] = provenance

    def process_all_institutions(self, input_file: Path) -> List[dict]:
        """Process all institutions in a YAML file and generate GHCIDs.

        Works in two passes: components are generated for every record first,
        so the collision table is complete before any GHCID string is
        assigned. Every member of a colliding group that has a Wikidata
        Q-number therefore gets the suffix, including the first one.

        Args:
            input_file: Path to Algerian institutions YAML file.

        Returns:
            All records from the file, in order; those for which a GHCID
            could be generated are updated in place with the GHCID fields.
        """
        print(f"Loading Algerian institutions from: {input_file}")

        with open(input_file, 'r', encoding='utf-8') as f:
            # safe_load gives None for an empty document.
            institutions = yaml.safe_load(f) or []

        total = len(institutions)
        print(f"Found {total} institutions")
        print()

        # Pass 1: build components and the full collision table.
        generated = [
            (record, self.generate_for_institution(record))
            for record in institutions
        ]

        # Pass 2: assign identifiers now that all collisions are known.
        updated_institutions = []
        for position, (record, components) in enumerate(generated, 1):
            print(f"Processing {position}/{total}: {record.get('name', 'unknown')}")
            if components is not None:
                self._attach_identifiers(record, components)
            updated_institutions.append(record)

        return updated_institutions

    def print_statistics(self):
        """Print the counters, errors and collisions gathered so far."""
        print()
        print("=" * 70)
        print("ALGERIA GHCID GENERATION STATISTICS")
        print("=" * 70)

        counters = (
            ("Total institutions processed:", 'total_institutions'),
            ("GHCIDs successfully generated:", 'ghcids_generated'),
            ("Locations used from data:", 'location_used'),
            ("Defaulted to Algiers (national):", 'defaulted_to_algiers'),
            ("Missing city codes:", 'missing_city_code'),
            ("Missing wilaya codes ('00'):", 'missing_wilaya_code'),
            ("GHCID collisions detected:", 'collisions_detected'),
        )
        for label, key in counters:
            print(f"{label} {self.stats[key]}")
        print()

        errors = self.stats['errors']
        if errors:
            print(f"Errors encountered: {len(errors)}")
            print()
            print("Error details:")
            for error in errors:
                print(f" - {error}")
        else:
            print("No errors!")

        print()

        # Show collisions
        if self.stats['collisions_detected'] > 0:
            print("GHCID COLLISIONS DETECTED:")
            print()
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f" {ghcid}:")
                    for name in names:
                        print(f" - {name}")
                    print()
            print("Note: Collisions resolved with Wikidata Q-numbers where available")
        else:
            print("No GHCID collisions detected!")

        print()

    def validate_ghcids(self, institutions: List[dict]):
        """Report how many distinct identifiers exist and list duplicate GHCIDs.

        Args:
            institutions: Records as returned by ``process_all_institutions``.
                Records without GHCID fields are ignored.
        """
        print("=" * 70)
        print("VALIDATION")
        print("=" * 70)

        ghcid_set = set()
        numeric_set = set()
        uuid_v5_set = set()
        uuid_v8_set = set()
        duplicates = []

        for record in institutions:
            ghcid = record.get('ghcid')
            if ghcid:
                if ghcid in ghcid_set:
                    duplicates.append(ghcid)
                ghcid_set.add(ghcid)

            ghcid_numeric = record.get('ghcid_numeric')
            if ghcid_numeric is not None:
                numeric_set.add(ghcid_numeric)

            ghcid_uuid = record.get('ghcid_uuid')
            if ghcid_uuid:
                uuid_v5_set.add(ghcid_uuid)

            ghcid_uuid_sha256 = record.get('ghcid_uuid_sha256')
            if ghcid_uuid_sha256:
                uuid_v8_set.add(ghcid_uuid_sha256)

        print(f"Unique GHCIDs (with Q-numbers): {len(ghcid_set)}")
        print(f"Unique numeric GHCIDs: {len(numeric_set)}")
        print(f"Unique UUID v5 (SHA-1) identifiers: {len(uuid_v5_set)}")
        print(f"Unique UUID v8 (SHA-256) ids: {len(uuid_v8_set)}")

        if duplicates:
            print(f"Duplicate GHCIDs found: {len(duplicates)}")
            for dup in duplicates:
                print(f" - {dup}")
        else:
            print("All GHCIDs are unique!")

        print()
|
|
|
|
|
|
def main():
    """Back up the input YAML, generate GHCIDs, report, and write the result.

    All paths are resolved relative to the project root (the parent of this
    script's directory). The input file itself is not modified: it is copied
    to a timestamped backup and the enriched records go to a separate
    ``algerian_institutions_ghcid.yaml`` file, preceded by a comment header
    summarising the run.
    """
    import shutil

    # Paths
    project_root = Path(__file__).parent.parent
    algeria_dir = project_root / "data" / "instances" / "algeria"
    input_file = algeria_dir / "algerian_institutions.yaml"
    output_file = algeria_dir / "algerian_institutions_ghcid.yaml"
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    backup_file = algeria_dir / f"algerian_institutions_pre_ghcid_{stamp}.yaml"

    # Create backup
    print(f"Creating backup: {backup_file}")
    shutil.copy(input_file, backup_file)
    print()

    # Generate, report, validate
    generator = AlgeriaGHCIDGenerator()
    updated_institutions = generator.process_all_institutions(input_file)
    generator.print_statistics()
    generator.validate_ghcids(updated_institutions)

    # Write updated YAML
    print("=" * 70)
    print(f"Writing updated YAML to: {output_file}")

    # Comment header placed ahead of the YAML document; the two trailing
    # empty entries leave one blank line between header and data.
    stats = generator.stats
    header_lines = [
        "# Algerian GLAM Institutions - GHCID Enhanced",
        f"# Last updated: {datetime.now(timezone.utc).isoformat()}",
        f"# GHCID generation: {stats['ghcids_generated']}/{stats['total_institutions']} institutions",
        "#",
        "# GHCID Statistics:",
        f"# - Total institutions: {stats['total_institutions']}",
        f"# - GHCIDs generated: {stats['ghcids_generated']}",
        f"# - Locations used: {stats['location_used']}",
        f"# - Defaulted to Algiers: {stats['defaulted_to_algiers']}",
        f"# - Missing city codes: {stats['missing_city_code']}",
        f"# - Missing wilaya codes: {stats['missing_wilaya_code']}",
        f"# - Collisions detected: {stats['collisions_detected']}",
        "#",
        "# Four-Identifier Strategy:",
        "# - ghcid: Base GHCID string (with Q-number for collisions)",
        "# - ghcid_uuid: UUID v5 (SHA-1) - PRIMARY persistent identifier",
        "# - ghcid_uuid_sha256: UUID v8 (SHA-256) - Secondary identifier",
        "# - ghcid_numeric: 64-bit numeric for CSV exports",
        "",
        "",
    ]
    header = "\n".join(header_lines)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(
            updated_institutions,
            f,
            default_flow_style=False,
            allow_unicode=True,
            sort_keys=False,
            width=100,
        )

    print(f"Done! Updated {len(updated_institutions)} institutions")
    print(f"Output file: {output_file}")
    print()


if __name__ == "__main__":
    main()
|