#!/usr/bin/env python3
"""
Comprehensive Enhancement Pipeline for Tunisian Heritage Institutions

This script applies the full enhancement workflow to the validated Tunisia dataset:

1. GHCID Generation    - Generate persistent identifiers using the TN country code
2. Geocoding           - Add latitude/longitude coordinates via the Nominatim API
3. Wikidata Enrichment - Add Q-numbers and additional identifiers via SPARQL
4. Multi-Format Export - Generate RDF, JSON-LD, and CSV for different use cases

Features:
- Deterministic UUID generation (UUID v5 and v8)
- Collision detection and resolution
- Persistent caching for API results
- Rate limiting (Nominatim: 1 req/sec)
- Checkpoint saving for resume capability
- Comprehensive logging and statistics

Usage:
    python scripts/enhance_tunisia_dataset.py [--skip-ghcid] [--skip-geocoding] [--skip-wikidata]

Options:
    --skip-ghcid      Skip GHCID generation (if already done)
    --skip-geocoding  Skip geocoding (if already done)
    --skip-wikidata   Skip Wikidata enrichment (if already done)
    --dry-run         Show what would be done without modifying data
    --verbose         Show detailed progress information
"""

import argparse
import json
import sqlite3
import sys
import time
import unicodedata
import uuid
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

import requests
import yaml
from SPARQLWrapper import JSON as SPARQL_JSON
from SPARQLWrapper import SPARQLWrapper

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.identifiers.ghcid import (
    GHCID_NAMESPACE,
    GHCIDComponents,
    GHCIDGenerator,
    InstitutionType,
    extract_abbreviation_from_name,
)
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB


# =============================================================================
# GHCID GENERATION
# =============================================================================

class TunisiaGHCIDGenerator:
    """Generate GHCIDs for Tunisian heritage institutions."""

    # Tunisian governorate codes (ISO 3166-2:TN).
    # Keys are normalized the same way normalize_name() normalizes its input:
    # UPPERCASE, accents stripped, spaces and punctuation removed - so
    # multi-word names must be written without spaces here (e.g., "Ben Arous"
    # normalizes to "BENAROUS").
    GOVERNORATE_CODES = {
        "TUNIS": "TUN",
        "ARIANA": "ARI",
        "BENAROUS": "BEN",
        "MANOUBA": "MAN",
        "NABEUL": "NAB",
        "ZAGHOUAN": "ZAG",
        "BIZERTE": "BIZ",
        "BEJA": "BEJ",
        "JENDOUBA": "JEN",
        "KEF": "KEF",
        "SILIANA": "SIL",
        "KAIROUAN": "KAI",
        "KASSERINE": "KAS",
        "SIDIBOUZID": "SID",
        "SOUSSE": "SOU",
        "MONASTIR": "MON",
        "MAHDIA": "MAH",
        "SFAX": "SFA",
        "GAFSA": "GAF",
        "TOZEUR": "TOZ",
        "KEBILI": "KEB",
        "GABES": "GAB",
        "MEDENINE": "MED",
        "TATAOUINE": "TAT",
    }

    def __init__(self):
        self.ghcid_gen = GHCIDGenerator()
        self.geonames_db = GeoNamesDB()
        self.stats = {
            'total': 0,
            'generated': 0,
            'missing_city_code': 0,
            'collisions': 0,
            'errors': []
        }
        self.ghcid_usage: Dict[str, List[str]] = defaultdict(list)

    @staticmethod
    def normalize_name(name: str) -> str:
        """Normalize a name: uppercase, strip accents, drop spaces and punctuation."""
        normalized = unicodedata.normalize('NFD', name.upper())
        # Remove accents (combining marks)
        no_accents = ''.join(c for c in normalized if unicodedata.category(c) != 'Mn')
        # Remove spaces and punctuation, keep only alphanumeric characters
        return ''.join(c for c in no_accents if c.isalnum())
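
    # Illustrative sketch (comments only, not executed): what normalize_name
    # produces for typical Tunisian place names, including the space-free
    # forms the lookup tables in this class rely on:
    #
    #     TunisiaGHCIDGenerator.normalize_name("Béja")          # -> "BEJA"
    #     TunisiaGHCIDGenerator.normalize_name("Gabès")         # -> "GABES"
    #     TunisiaGHCIDGenerator.normalize_name("Sidi Bou Saïd") # -> "SIDIBOUSAID"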

    def get_governorate_code(self, city: str, region: Optional[str]) -> str:
        """
        Get governorate code from city or region name.

        Args:
            city: City name (e.g., "Tunis", "Sfax", "Kairouan")
            region: Region/governorate name (optional)

        Returns:
            3-letter governorate code (e.g., "TUN", "SFA", "KAI")
        """
        # Try region first (more specific)
        if region:
            normalized = self.normalize_name(region)
            if normalized in self.GOVERNORATE_CODES:
                return self.GOVERNORATE_CODES[normalized]

        # Try city name
        normalized = self.normalize_name(city)
        if normalized in self.GOVERNORATE_CODES:
            return self.GOVERNORATE_CODES[normalized]

        # Common city-to-governorate mappings (keys are space-free to match
        # normalize_name output)
        city_mappings = {
            "LAMARSA": "TUN",
            "CARTHAGE": "TUN",
            "SIDIBOUSAID": "TUN",
            "LAGOULETTE": "TUN",
            "HAMMAMET": "NAB",
            "NABEUL": "NAB",
            "KORBA": "NAB",
        }
        if normalized in city_mappings:
            return city_mappings[normalized]

        # Fall back to the national-level code
        return "TN0"

    def get_city_code(self, city: str, country: str = "TN") -> Optional[str]:
        """Get a 3-letter city code from GeoNames."""
        if not city:
            return None

        # Try a GeoNames lookup first
        result = self.geonames_db.lookup_city(city, country)
        if result:
            # Use the built-in abbreviation method
            return result.get_abbreviation()

        # Fall back to the first 3 letters of the normalized city name
        normalized = self.normalize_name(city)
        return normalized[:3] if normalized else None
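
    # Illustrative lookups (comments only; the get_city_code result assumes
    # the GeoNames database has no entry for the city, so the first-three-
    # letters fallback is taken):
    #
    #     gen = TunisiaGHCIDGenerator()
    #     gen.get_governorate_code("Hammamet", None)   # -> "NAB" (city mapping)
    #     gen.get_governorate_code("Somewhere", None)  # -> "TN0" (national fallback)
    #     gen.get_city_code("Hammamet")                # -> "HAM" (fallback path)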

    def generate_ghcid(self, institution: Dict[str, Any]) -> Optional[Dict[str, str]]:
        """
        Generate a GHCID for a Tunisian institution.

        Returns:
            Dict with ghcid, ghcid_uuid, ghcid_uuid_sha256, ghcid_numeric
        """
        self.stats['total'] += 1

        # Extract location
        locations = institution.get('locations', [])
        if not locations:
            self.stats['errors'].append(f"{institution['name']}: No location")
            return None

        location = locations[0]
        city = location.get('city', '')
        region = location.get('region', '')
        country = location.get('country', 'TN')

        if country != 'TN':
            self.stats['errors'].append(f"{institution['name']}: Not Tunisia ({country})")
            return None

        # Get codes
        gov_code = self.get_governorate_code(city, region)
        city_code = self.get_city_code(city, country)

        if not city_code:
            self.stats['missing_city_code'] += 1
            # Use the first 3 letters of the city name as a fallback
            city_code = self.normalize_name(city)[:3] if city else "UNK"

        # Institution type
        inst_type_str = institution.get('institution_type', 'MIXED')
        try:
            inst_type = InstitutionType[inst_type_str]
        except KeyError:
            inst_type = InstitutionType.MIXED

        # Get name - prefer the full name over an acronym for abbreviation
        # extraction. If the original name is already short (< 15 chars),
        # use the first alternative name instead, since abbreviations
        # extract better from full names than from acronyms.
        name = institution.get('name', '')
        alternative_names = institution.get('alternative_names', [])
        if len(name) < 15 and alternative_names:
            english_name = alternative_names[0]
        else:
            english_name = name

        # Generate GHCID components
        components = self.ghcid_gen.generate(
            institution_name=name,
            english_name=english_name,
            institution_type=inst_type,
            country_code="TN",
            region_code=gov_code,
            city_locode=city_code
        )

        # Check for a Wikidata Q-number (used for collision resolution)
        wikidata_qid = None
        for identifier in institution.get('identifiers', []):
            if identifier.get('identifier_scheme') == 'Wikidata':
                wikidata_qid = identifier.get('identifier_value', '')
                break

        # Build the GHCID string (with the Q-number suffix if available)
        base_ghcid = components.to_string()
        ghcid_str = f"{base_ghcid}-{wikidata_qid}" if wikidata_qid else base_ghcid

        # Collision detection (uses the base GHCID without the Q-number)
        self.ghcid_usage[base_ghcid].append(name)

        # Generate all UUID formats from the final GHCID string
        ghcid_result = {
            'ghcid': ghcid_str,
            'ghcid_uuid': str(uuid.uuid5(GHCID_NAMESPACE, ghcid_str)),
            'ghcid_uuid_sha256': str(components.to_uuid_sha256()),
            'ghcid_numeric': components.to_numeric()
        }

        if len(self.ghcid_usage[base_ghcid]) > 1:
            self.stats['collisions'] += 1

        self.stats['generated'] += 1
        return ghcid_result

    def print_statistics(self):
        """Print GHCID generation statistics."""
        print("\n" + "="*80)
        print("GHCID GENERATION STATISTICS")
        print("="*80)
        print(f"Total institutions:  {self.stats['total']}")
        print(f"GHCIDs generated:    {self.stats['generated']}")
        print(f"Missing city codes:  {self.stats['missing_city_code']}")
        print(f"Collisions detected: {self.stats['collisions']}")

        if self.stats['collisions'] > 0:
            print("\nCollisions (multiple institutions with the same base GHCID):")
            for ghcid, names in self.ghcid_usage.items():
                if len(names) > 1:
                    print(f"  {ghcid}:")
                    for name in names:
                        print(f"    - {name}")

        if self.stats['errors']:
            print(f"\nErrors: {len(self.stats['errors'])}")
            for error in self.stats['errors'][:10]:
                print(f"  - {error}")


# =============================================================================
# GEOCODING
# =============================================================================

class GeocodingCache:
    """Persistent SQLite cache for geocoding results."""

    def __init__(self, cache_file: Path):
        self.cache_file = cache_file
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        self.conn = sqlite3.connect(cache_file)
        self._initialize()

    def _initialize(self):
        """Create the cache table if it does not exist."""
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS geocoding_cache (
                query TEXT PRIMARY KEY,
                latitude REAL,
                longitude REAL,
                display_name TEXT,
                timestamp TEXT,
                success INTEGER
            )
        """)
        self.conn.commit()

    def get(self, query: str) -> Optional[Dict]:
        """Retrieve a cached result (None for misses and cached failures)."""
        cursor = self.conn.execute(
            "SELECT latitude, longitude, display_name, success FROM geocoding_cache WHERE query = ?",
            (query,)
        )
        row = cursor.fetchone()
        if row and row[3]:  # success = 1
            return {
                'latitude': row[0],
                'longitude': row[1],
                'display_name': row[2]
            }
        return None

    def put(self, query: str, result: Optional[Dict]):
        """Store a result (or a failure marker) in the cache."""
        if result:
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, display_name, timestamp, success)
                VALUES (?, ?, ?, ?, ?, 1)
            """, (
                query,
                result.get('latitude'),
                result.get('longitude'),
                result.get('display_name', ''),
                datetime.now(timezone.utc).isoformat()
            ))
        else:
            self.conn.execute("""
                INSERT OR REPLACE INTO geocoding_cache
                (query, latitude, longitude, display_name, timestamp, success)
                VALUES (?, NULL, NULL, NULL, ?, 0)
            """, (query, datetime.now(timezone.utc).isoformat()))
        self.conn.commit()

    def close(self):
        """Close the database connection."""
        self.conn.close()


class TunisiaGeocoder:
    """Geocode Tunisian institutions with the Nominatim API."""

    NOMINATIM_URL = "https://nominatim.openstreetmap.org/search"
    USER_AGENT = "GLAM-Heritage-Tunisia/1.0 (research project)"
    RATE_LIMIT = 1.0  # 1 request per second

    def __init__(self, cache_file: Path):
        self.cache = GeocodingCache(cache_file)
        self.last_request_time = 0.0
        self.stats = {
            'total': 0,
            'already_geocoded': 0,
            'cache_hits': 0,
            'api_calls': 0,
            'successful': 0,
            'failed': 0
        }

    def _wait_for_rate_limit(self):
        """Enforce the 1 req/sec rate limit."""
        elapsed = time.time() - self.last_request_time
        if elapsed < self.RATE_LIMIT:
            time.sleep(self.RATE_LIMIT - elapsed)
        self.last_request_time = time.time()

    def _build_query(self, city: str, country: str = "Tunisia") -> str:
        """Build a Nominatim search query."""
        return f"{city}, {country}"
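
    # The query string returned by _build_query() doubles as the cache key,
    # so a successful lookup for a given "<city>, Tunisia" string hits
    # Nominatim at most once per cache lifetime. Cached failures are retried
    # on later runs, because GeocodingCache.get() only returns success rows.
    # Illustrative flow (comments only; cache path is hypothetical):
    #
    #     geocoder = TunisiaGeocoder(Path("data/cache/tunisia_geocoding.db"))
    #     geocoder._build_query("Sousse")   # -> "Sousse, Tunisia"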

    def _call_nominatim(self, query: str) -> Optional[Dict]:
        """Call the Nominatim API."""
        params = {
            'q': query,
            'format': 'json',
            'limit': 1,
            'countrycodes': 'tn'
        }
        headers = {'User-Agent': self.USER_AGENT}

        try:
            self._wait_for_rate_limit()
            response = requests.get(self.NOMINATIM_URL, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            results = response.json()
            if results:
                result = results[0]
                return {
                    'latitude': float(result['lat']),
                    'longitude': float(result['lon']),
                    'display_name': result.get('display_name', '')
                }
            return None
        except Exception as e:
            print(f"\n  ❌ Nominatim error: {e}")
            return None

    def geocode_institution(self, institution: Dict[str, Any]) -> bool:
        """
        Geocode an institution's location.

        Returns:
            True if geocoded (or it already had coordinates), False otherwise
        """
        self.stats['total'] += 1

        locations = institution.get('locations', [])
        if not locations:
            return False

        location = locations[0]

        # Skip if already geocoded
        if location.get('latitude') is not None and location.get('longitude') is not None:
            self.stats['already_geocoded'] += 1
            return True

        city = location.get('city', '')
        if not city:
            return False

        # Build the query and check the cache
        query = self._build_query(city)
        cached = self.cache.get(query)
        if cached:
            self.stats['cache_hits'] += 1
            location['latitude'] = cached['latitude']
            location['longitude'] = cached['longitude']
            self.stats['successful'] += 1
            return True

        # Call the API and cache the result (including failures)
        self.stats['api_calls'] += 1
        result = self._call_nominatim(query)
        self.cache.put(query, result)

        if result:
            location['latitude'] = result['latitude']
            location['longitude'] = result['longitude']
            self.stats['successful'] += 1
            return True

        self.stats['failed'] += 1
        return False

    def print_statistics(self):
        """Print geocoding statistics."""
        print("\n" + "="*80)
        print("GEOCODING STATISTICS")
        print("="*80)
        print(f"Total institutions: {self.stats['total']}")
        print(f"Already geocoded:   {self.stats['already_geocoded']}")
        print(f"Cache hits:         {self.stats['cache_hits']}")
        print(f"API calls:          {self.stats['api_calls']}")
        print(f"Successful:         {self.stats['successful']}")
        print(f"Failed:             {self.stats['failed']}")


# =============================================================================
# WIKIDATA ENRICHMENT
# =============================================================================

class WikidataEnricher:
    """Enrich institutions with Wikidata data via SPARQL."""

    SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
    USER_AGENT = "GLAM-Heritage-Tunisia/1.0"

    def __init__(self):
        self.sparql = SPARQLWrapper(self.SPARQL_ENDPOINT)
        self.sparql.setReturnFormat(SPARQL_JSON)
        self.sparql.addCustomHttpHeader("User-Agent", self.USER_AGENT)
        self.stats = {
            'total': 0,
            'already_enriched': 0,
            'searched': 0,
            'found': 0,
            'enriched': 0,
            'failed': 0
        }
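
    # The SPARQL query in search_wikidata() matches items whose English
    # rdfs:label equals the institution name exactly, and whose P31/P279*
    # class chain reaches one of the VALUES types (these include GLAM classes
    # such as wd:Q33506 "museum" and wd:Q7075 "library"). Exact-label
    # matching is deliberately strict: it misses label variants, but avoids
    # attaching a wrong Q-number, which generate_ghcid() would otherwise
    # append to the GHCID on subsequent runs.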

    def search_wikidata(self, name: str, city: str) -> Optional[Dict[str, Any]]:
        """
        Search Wikidata for an institution by exact English label.

        Args:
            name: Institution name
            city: City name (accepted for future disambiguation; not yet
                  used in the query)

        Returns:
            Dict with qid, name, identifiers, etc., or None if not found
        """
        # Escape characters that would break the SPARQL string literal
        escaped_name = name.replace('\\', '\\\\').replace('"', '\\"')
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception
        WHERE {{
            ?item rdfs:label "{escaped_name}"@en .
            ?item wdt:P31/wdt:P279* ?type .
            VALUES ?type {{
                wd:Q33506 wd:Q7075 wd:Q166118 wd:Q1030034
                wd:Q473972 wd:Q570116 wd:Q22687 wd:Q28564
            }}
            OPTIONAL {{ ?item wdt:P214 ?viaf . }}
            OPTIONAL {{ ?item wdt:P791 ?isil . }}
            OPTIONAL {{ ?item wdt:P856 ?website . }}
            OPTIONAL {{ ?item wdt:P625 ?coords . }}
            OPTIONAL {{ ?item wdt:P571 ?inception . }}
            SERVICE wikibase:label {{
                bd:serviceParam wikibase:language "en,fr,ar" .
            }}
        }}
        LIMIT 1
        """

        self.sparql.setQuery(query)

        try:
            time.sleep(0.5)  # Rate limiting
            raw_results = self.sparql.query().convert()
            bindings = raw_results.get("results", {}).get("bindings", [])
            if not bindings:
                return None

            binding = bindings[0]
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            if not qid or not qid.startswith("Q"):
                return None

            result = {
                "qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "identifiers": {}
            }

            if "viaf" in binding:
                result["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "isil" in binding:
                result["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "website" in binding:
                result["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                result["founded_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                if coords_str.startswith("Point("):
                    # WKT literals store coordinates as "Point(lon lat)"
                    lon, lat = coords_str[6:-1].split()
                    result["latitude"] = float(lat)
                    result["longitude"] = float(lon)

            return result
        except Exception as e:
            print(f"\n  ❌ Wikidata error: {e}")
            return None

    def enrich_institution(self, institution: Dict[str, Any]) -> bool:
        """
        Enrich an institution with Wikidata data.

        Returns:
            True if enriched, False otherwise
        """
        self.stats['total'] += 1

        # Skip if it already has a Wikidata ID.
        # setdefault ensures appends below land on the institution dict even
        # when the 'identifiers' key was missing.
        identifiers = institution.setdefault('identifiers', [])
        existing_schemes = {i.get('identifier_scheme') for i in identifiers}
        if 'Wikidata' in existing_schemes:
            self.stats['already_enriched'] += 1
            return False

        # Extract search parameters
        name = institution.get('name', '')
        locations = institution.get('locations', [])
        city = locations[0].get('city', '') if locations else ''
        if not name:
            return False

        # Search Wikidata
        self.stats['searched'] += 1
        wd_data = self.search_wikidata(name, city)
        if not wd_data:
            self.stats['failed'] += 1
            return False
        self.stats['found'] += 1

        # Add the Wikidata identifier
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': wd_data['qid'],
            'identifier_url': f"https://www.wikidata.org/wiki/{wd_data['qid']}"
        })

        # Add other identifiers not already present
        for scheme, value in wd_data.get('identifiers', {}).items():
            if scheme not in existing_schemes:
                id_obj = {
                    'identifier_scheme': scheme,
                    'identifier_value': value
                }
                if scheme == 'VIAF':
                    id_obj['identifier_url'] = f"https://viaf.org/viaf/{value}"
                elif scheme == 'Website':
                    id_obj['identifier_url'] = value
                identifiers.append(id_obj)

        # Add the founding date if missing
        if 'founded_date' in wd_data and not institution.get('founded_date'):
            institution['founded_date'] = wd_data['founded_date']

        # Add coordinates if missing
        if 'latitude' in wd_data and locations:
            location = locations[0]
            if location.get('latitude') is None:
                location['latitude'] = wd_data['latitude']
                location['longitude'] = wd_data['longitude']

        # Update provenance (setdefault so the update is not lost when the
        # 'provenance' key was missing)
        prov = institution.setdefault('provenance', {})
        existing_method = prov.get('extraction_method', '')
        prov['extraction_method'] = f"{existing_method} + Wikidata enrichment"

        self.stats['enriched'] += 1
        return True

    def print_statistics(self):
        """Print enrichment statistics."""
        print("\n" + "="*80)
        print("WIKIDATA ENRICHMENT STATISTICS")
        print("="*80)
        print(f"Total institutions: {self.stats['total']}")
        print(f"Already enriched:   {self.stats['already_enriched']}")
        print(f"Searched:           {self.stats['searched']}")
        print(f"Found:              {self.stats['found']}")
        print(f"Enriched:           {self.stats['enriched']}")
        print(f"Failed:             {self.stats['failed']}")
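

# The module docstring promises checkpoint saving for resume capability, but
# no checkpoint helper is wired into main() below. A minimal sketch of what
# one could look like (hypothetical: the function name, signature, and JSON
# format are assumptions, and nothing in this script calls it yet):

def save_checkpoint(institutions: List[Dict[str, Any]], checkpoint_file: Path) -> None:
    """Write an intermediate JSON snapshot so an interrupted run can resume."""
    checkpoint_file.parent.mkdir(parents=True, exist_ok=True)
    with open(checkpoint_file, 'w', encoding='utf-8') as f:
        json.dump(institutions, f, ensure_ascii=False, indent=2)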


# =============================================================================
# MAIN PIPELINE
# =============================================================================

def load_institutions(yaml_file: Path) -> List[Dict[str, Any]]:
    """Load institutions from a YAML file."""
    with open(yaml_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if isinstance(data, list):
        return data
    elif isinstance(data, dict) and 'institutions' in data:
        return data['institutions']
    else:
        raise ValueError(f"Unexpected YAML structure in {yaml_file}")


def save_institutions(institutions: List[Dict[str, Any]], yaml_file: Path):
    """Save institutions to a YAML file with a metadata header."""
    output = {
        '_metadata': {
            'title': 'Tunisian Heritage Institutions - Enhanced Dataset',
            'description': 'Validated and enriched dataset with GHCIDs, geocoding, and Wikidata links',
            'generated': datetime.now(timezone.utc).isoformat(),
            'count': len(institutions),
            'schema_version': '0.2.1',
            'enhancements': [
                'GHCID generation with UUID v5/v8',
                'Geocoding via Nominatim API',
                'Wikidata enrichment via SPARQL'
            ]
        },
        'institutions': institutions
    }

    with open(yaml_file, 'w', encoding='utf-8') as f:
        yaml.dump(output, f, default_flow_style=False, allow_unicode=True,
                  sort_keys=False, width=120)

    print(f"\n✅ Saved {len(institutions)} institutions to {yaml_file}")


def main():
    parser = argparse.ArgumentParser(description="Enhance Tunisia heritage dataset")
    parser.add_argument('--skip-ghcid', action='store_true', help="Skip GHCID generation")
    parser.add_argument('--skip-geocoding', action='store_true', help="Skip geocoding")
    parser.add_argument('--skip-wikidata', action='store_true', help="Skip Wikidata enrichment")
    parser.add_argument('--dry-run', action='store_true', help="Show what would be done")
    parser.add_argument('--verbose', action='store_true', help="Verbose output")
    args = parser.parse_args()

    # Paths
    base_dir = Path(__file__).parent.parent
    input_file = base_dir / "data" / "instances" / "tunisia" / "tunisian_institutions.yaml"
    output_file = base_dir / "data" / "instances" / "tunisia" / "tunisian_institutions_enhanced.yaml"
    cache_dir = base_dir / "data" / "cache"
    cache_dir.mkdir(parents=True, exist_ok=True)

    print("="*80)
    print("TUNISIA HERITAGE INSTITUTIONS - ENHANCEMENT PIPELINE")
    print("="*80)
    print(f"Input:  {input_file}")
    print(f"Output: {output_file}")

    if args.dry_run:
        print("\n⚠️  DRY RUN MODE - No changes will be made\n")

    # Load data
    print("\n📖 Loading institutions...")
    institutions = load_institutions(input_file)
    print(f"   Loaded {len(institutions)} institutions")

    # Step 1: GHCID Generation
    if not args.skip_ghcid:
        print("\n" + "="*80)
        print("STEP 1: GHCID GENERATION")
        print("="*80)

        ghcid_gen = TunisiaGHCIDGenerator()
        for i, inst in enumerate(institutions, 1):
            if args.verbose:
                print(f"\n[{i}/{len(institutions)}] {inst.get('name', 'Unknown')}")

            if not args.dry_run:
                ghcid_result = ghcid_gen.generate_ghcid(inst)
                if ghcid_result:
                    inst['ghcid'] = ghcid_result['ghcid']
                    inst['ghcid_uuid'] = ghcid_result['ghcid_uuid']
                    inst['ghcid_uuid_sha256'] = ghcid_result['ghcid_uuid_sha256']
                    inst['ghcid_numeric'] = ghcid_result['ghcid_numeric']
                    if args.verbose:
                        print(f"  GHCID: {ghcid_result['ghcid']}")
                        print(f"  UUID:  {ghcid_result['ghcid_uuid']}")

        ghcid_gen.print_statistics()
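
    # Each step mutates the institution dicts in place, which is what lets
    # the --skip-* flags re-run steps independently: results from a completed
    # step are already embedded in the data, and geocoding results are
    # additionally memoised in the SQLite cache across runs.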
print(f"\n[{i}/{len(institutions)}] {inst.get('name', 'Unknown')}") if not args.dry_run: success = geocoder.geocode_institution(inst) if args.verbose and success: locations = inst.get('locations', []) if locations and locations[0].get('latitude'): print(f" Coordinates: {locations[0]['latitude']:.4f}, {locations[0]['longitude']:.4f}") geocoder.print_statistics() geocoder.cache.close() # Step 3: Wikidata Enrichment if not args.skip_wikidata: print("\n" + "="*80) print("STEP 3: WIKIDATA ENRICHMENT") print("="*80) print("⚠️ This step searches Wikidata and may take several minutes...") enricher = WikidataEnricher() for i, inst in enumerate(institutions, 1): if args.verbose: print(f"\n[{i}/{len(institutions)}] {inst.get('name', 'Unknown')}") if not args.dry_run: enriched = enricher.enrich_institution(inst) if args.verbose and enriched: # Find Wikidata ID for identifier in inst.get('identifiers', []): if identifier.get('identifier_scheme') == 'Wikidata': print(f" ✅ Wikidata: {identifier['identifier_value']}") break enricher.print_statistics() # Save results if not args.dry_run: print("\n" + "="*80) print("SAVING ENHANCED DATASET") print("="*80) save_institutions(institutions, output_file) print("\n" + "="*80) print("✅ ENHANCEMENT COMPLETE") print("="*80) print(f"\nEnhanced dataset: {output_file}") print("\nNext steps:") print(" 1. Review enhancements and validate results") print(" 2. Export to RDF/JSON-LD: python scripts/export_tunisia_formats.py") print(" 3. Integrate into global GLAM dataset") else: print("\n✅ Dry run complete - no changes made") if __name__ == "__main__": main()