#!/usr/bin/env python3
"""
Convert Bulgarian ISIL Registry to LinkML-compliant YAML format.

This script performs 5 integration steps:
1. Convert JSON to LinkML-compliant YAML format
2. Map library types to GLAMORCUBESFIXPHDNT taxonomy (all → LIBRARY)
3. Geocode addresses to lat/lon coordinates
4. Generate GHCIDs for all institutions
5. Enrich missing names from Wikidata

Input:  data/isil/bulgarian_isil_registry.json
Output: data/instances/bulgaria_isil_libraries.yaml
"""

import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict

# Add project root to path so the in-repo packages resolve when this script
# is run directly (not installed).
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

from glam_extractor.identifiers.ghcid import GHCIDComponents
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB

# =============================================================================
# Configuration
# =============================================================================

PROJECT_ROOT = Path(__file__).parent.parent
INPUT_FILE = PROJECT_ROOT / "data/isil/bulgarian_isil_registry.json"
OUTPUT_FILE = PROJECT_ROOT / "data/instances/bulgaria_isil_libraries.yaml"
GEONAMES_DB = PROJECT_ROOT / "data/reference/geonames.db"
CITY_REGION_LOOKUP = PROJECT_ROOT / "data/reference/bulgarian_city_regions.json"

# Bulgarian administrative regions (oblasti) → ISO 3166-2:BG codes
BULGARIAN_REGIONS = {
    'Благоевград': 'BG-01',      # Blagoevgrad
    'Бургас': 'BG-02',           # Burgas
    'Варна': 'BG-03',            # Varna
    'Велико Търново': 'BG-04',   # Veliko Tarnovo
    'Видин': 'BG-05',            # Vidin
    'Враца': 'BG-06',            # Vratsa
    'Габрово': 'BG-07',          # Gabrovo
    'Добрич': 'BG-08',           # Dobrich
    'Кърджали': 'BG-09',         # Kardzhali
    'Кюстендил': 'BG-10',        # Kyustendil
    'Ловеч': 'BG-11',            # Lovech
    'Монтана': 'BG-12',          # Montana
    'Пазарджик': 'BG-13',        # Pazardzhik
    'Перник': 'BG-14',           # Pernik
    'Плевен': 'BG-15',           # Pleven
    'Пловдив': 'BG-16',          # Plovdiv
    'Разград': 'BG-17',          # Razgrad
    'Русе': 'BG-18',             # Ruse
    'Силистра': 'BG-19',         # Silistra
    'Сливен': 'BG-20',           # Sliven
    'Смолян': 'BG-21',           # Smolyan
    'София': 'BG-22',            # Sofia (capital)
    'София област': 'BG-23',     # Sofia Province
    'Стара Загора': 'BG-24',     # Stara Zagora
    'Търговище': 'BG-25',        # Targovishte
    'Хасково': 'BG-26',          # Haskovo
    'Шумен': 'BG-27',            # Shumen
    'Ямбол': 'BG-28',            # Yambol
}

# City name mappings (Bulgarian Cyrillic → GeoNames English)
BULGARIAN_CITY_MAPPINGS = {
    'София': 'Sofia',
    'Бургас': 'Burgas',
    'Варна': 'Varna',
    'Пловдив': 'Plovdiv',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Габрово': 'Gabrovo',
    'Видин': 'Vidin',
    'Враца': 'Vratsa',
    'Велико Търново': 'Veliko Tarnovo',
    'Ловеч': 'Lovech',
    'Кюстендил': 'Kyustendil',
    'Благоевград': 'Blagoevgrad',
    'Пазарджик': 'Pazardzhik',
    'Монтана': 'Montana',
    'Кърджали': 'Kardzhali',
    'Смолян': 'Smolyan',
    'Силистра': 'Silistra',
    'Разград': 'Razgrad',
    'Търговище': 'Targovishte',
    'Хасково': 'Haskovo',
    'Шумен': 'Shumen',
    'Ямбол': 'Yambol',
    'Перник': 'Pernik',
}

# =============================================================================
# Data Models
# =============================================================================


@dataclass
class Location:
    """Location information for heritage custodians."""
    city: Optional[str] = None
    street_address: Optional[str] = None
    postal_code: Optional[str] = None
    region: Optional[str] = None
    country: str = 'BG'
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    geonames_id: Optional[int] = None


@dataclass
class Identifier:
    """External identifiers (ISIL, Wikidata, etc.)."""
    identifier_scheme: str
    identifier_value: str
    identifier_url: Optional[str] = None


@dataclass
class Provenance:
    """Data provenance metadata."""
    data_source: str
    data_tier: str
    extraction_date: str
    extraction_method: str
    confidence_score: float
    conversation_id: Optional[str] = None
    source_url: Optional[str] = None


@dataclass
class HeritageCustodian:
    """Main heritage custodian record (LinkML-compliant)."""
    id: str
    name: str
    institution_type: str
    ghcid: Optional[str] = None
    ghcid_uuid: Optional[str] = None
    ghcid_uuid_sha256: Optional[str] = None
    ghcid_numeric: Optional[int] = None
    alternative_names: Optional[List[str]] = None
    description: Optional[str] = None
    locations: Optional[List[Location]] = None
    identifiers: Optional[List[Identifier]] = None
    homepage: Optional[str] = None
    contact_info: Optional[Dict[str, Any]] = None
    collections: Optional[List[Dict[str, Any]]] = None
    provenance: Optional[Provenance] = None


# =============================================================================
# Utilities
# =============================================================================


def extract_city_from_address(address: str) -> Optional[str]:
    """
    Extract city name from Bulgarian address.

    Bulgarian address format:
    - "гр. София 1504, бул. ..." → city is "София"
    - "Бургас 8000, ул. ..." → city is "Бургас"
    - "с. Плетена 2954, община Сатовча" → city is "Плетена" (village)

    Returns:
        City name in Bulgarian (Cyrillic), or None for empty input.
    """
    if not address:
        return None

    # Remove leading "гр." (city), "с." (village), or "община" (municipality)
    address = address.strip()

    # Pattern 1: "гр. CityName POSTAL, ..." — strip the settlement prefix and
    # anchor on the 4-digit Bulgarian postal code.
    match = re.match(r'(?:гр\.|с\.)\s*([А-Яа-я\s\-]+?)\s*\d{4}', address)
    if match:
        return match.group(1).strip()

    # Pattern 2: "CityName POSTAL, ..."
    match = re.match(r'([А-Яа-я\s\-]+?)\s*\d{4}', address)
    if match:
        return match.group(1).strip()

    # Fallback: take first word sequence before digits.
    # NOTE(review): with no postal code present, a "гр. X" address falls
    # through to here and returns "гр" — confirm against registry data.
    match = re.match(r'([А-Яа-я\s\-]+)', address)
    if match:
        return match.group(1).strip()

    return None


def extract_region_from_library_type(library_type: str) -> Optional[str]:
    """
    Extract region name from library type field.

    Bulgarian regional libraries include the oblast in the type field:
        "Регионална библиотека (Област Бургас)" → region is "Бургас"
    """
    if not library_type:
        return None

    match = re.search(r'Област\s+([А-Яа-я\s]+)', library_type)
    if match:
        return match.group(1).strip()

    return None


def transliterate_bulgarian(text: str) -> str:
    """
    Transliterate Bulgarian Cyrillic to Latin alphabet for GHCID abbreviations.

    Uses BGN/PCGN romanization standard for Bulgarian. Characters outside the
    mapping (Latin letters, digits, punctuation) pass through unchanged.
    """
    cyrillic_to_latin = {
        'А': 'A', 'а': 'a', 'Б': 'B', 'б': 'b', 'В': 'V', 'в': 'v',
        'Г': 'G', 'г': 'g', 'Д': 'D', 'д': 'd', 'Е': 'E', 'е': 'e',
        'Ж': 'Zh', 'ж': 'zh', 'З': 'Z', 'з': 'z', 'И': 'I', 'и': 'i',
        'Й': 'Y', 'й': 'y', 'К': 'K', 'к': 'k', 'Л': 'L', 'л': 'l',
        'М': 'M', 'м': 'm', 'Н': 'N', 'н': 'n', 'О': 'O', 'о': 'o',
        'П': 'P', 'п': 'p', 'Р': 'R', 'р': 'r', 'С': 'S', 'с': 's',
        'Т': 'T', 'т': 't', 'У': 'U', 'у': 'u', 'Ф': 'F', 'ф': 'f',
        'Х': 'H', 'х': 'h', 'Ц': 'Ts', 'ц': 'ts', 'Ч': 'Ch', 'ч': 'ch',
        'Ш': 'Sh', 'ш': 'sh', 'Щ': 'Sht', 'щ': 'sht', 'Ъ': 'A', 'ъ': 'a',
        'Ь': 'Y', 'ь': 'y', 'Ю': 'Yu', 'ю': 'yu', 'Я': 'Ya', 'я': 'ya',
    }
    result = []
    for char in text:
        result.append(cyrillic_to_latin.get(char, char))
    return ''.join(result)


def generate_abbreviation_from_name(name: str, isil_code: str) -> str:
    """
    Generate institution abbreviation for GHCID.

    Strategy:
    1. If name has explicit abbreviation in parentheses, use it
    2. Otherwise, use last 4 digits of ISIL code

    Examples:
        "Национална библиотека „Св. св. Кирил и Методий" (НБКМ)" → "NBKM"
        "Библиотека при НЧ..." (BG-0130000) → "0000"
    """
    # Check for abbreviation in parentheses
    match = re.search(r'\(([А-Яа-яA-Za-z0-9]+)\)', name or '')
    if match:
        abbr = match.group(1)
        # Transliterate if any character falls in the Cyrillic Unicode block
        if any('\u0400' <= c <= '\u04FF' for c in abbr):
            abbr = transliterate_bulgarian(abbr)
        return abbr.upper()[:10]

    # Fallback: use last 4 digits of ISIL code
    isil_suffix = isil_code.split('-')[-1]
    return isil_suffix[-4:]


def map_bulgarian_library_type(library_type: str) -> str:
    """
    Map Bulgarian library type to GLAMORCUBESFIXPHDNT taxonomy.

    All Bulgarian institutions in this registry are libraries (LIBRARY class).
    This includes:
    - National libraries
    - Regional libraries (oblast-level)
    - University libraries (academic)
    - Community center libraries (chitalishta)
    - Municipal libraries (city-level)
    - Scientific libraries (research institutes)
    """
    # All institutions are LIBRARY type in this registry
    return 'LIBRARY'


# =============================================================================
# Main Conversion Logic
# =============================================================================


def convert_bulgarian_institutions() -> List[HeritageCustodian]:
    """
    Convert Bulgarian ISIL registry to LinkML-compliant records.

    Performs 5 integration steps:
    1. Parse JSON and extract fields
    2. Map library types to GLAMORCUBESFIXPHDNT taxonomy
    3. Geocode addresses using GeoNames
    4. Generate GHCIDs with UUIDs
    5. Enrich names (placeholder - Wikidata enrichment would go here)
    """
    print("Loading Bulgarian ISIL registry...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions_json = data['institutions']
    print(f"Found {len(institutions_json)} institutions")

    # Load city-region lookup table (optional; absence only degrades GHCIDs)
    print("Loading city-region lookup table...")
    city_region_map = {}
    if CITY_REGION_LOOKUP.exists():
        with open(CITY_REGION_LOOKUP, 'r', encoding='utf-8') as f:
            city_region_map = json.load(f)
        print(f"Loaded {len(city_region_map)} city-region mappings")
    else:
        print("Warning: City-region lookup not found, limited GHCID coverage")

    # Initialize geocoding (optional; absence skips geocoding and GHCIDs)
    print("Initializing GeoNames database...")
    if GEONAMES_DB.exists():
        geonames_db = GeoNamesDB(db_path=GEONAMES_DB)
    else:
        print("Warning: GeoNames database not found, geocoding will be skipped")
        geonames_db = None

    results = []
    geocoded_count = 0
    ghcid_count = 0

    for idx, inst in enumerate(institutions_json, 1):
        if idx % 10 == 0:
            print(f"Processing institution {idx}/{len(institutions_json)}...")

        # ===================================================================
        # STEP 1: Extract basic fields
        # ===================================================================
        isil_code = inst['isil']
        name_bg = inst.get('name_bg') or f"Library {isil_code}"
        name_en = inst.get('name_en')
        library_type_bg = inst.get('library_type', '')
        address = inst.get('address', '')

        # Build alternative names list
        alt_names = []
        if name_en:
            alt_names.append(name_en)
        variants = inst.get('name_variants')
        if variants:
            # FIX: the original appended the raw value, which nests a list
            # inside alternative_names when variants is a list. Flatten lists;
            # keep single-string behavior unchanged.
            if isinstance(variants, list):
                alt_names.extend(variants)
            else:
                alt_names.append(variants)

        # ===================================================================
        # STEP 2: Map to GLAMORCUBESFIXPHDNT taxonomy
        # ===================================================================
        institution_type = map_bulgarian_library_type(library_type_bg)

        # ===================================================================
        # STEP 3: Geocode address
        # ===================================================================
        city_bg = extract_city_from_address(address)
        region_bg = extract_region_from_library_type(library_type_bg)

        # Map Bulgarian city to English for GeoNames lookup
        city_en = BULGARIAN_CITY_MAPPINGS.get(city_bg, city_bg) if city_bg else None

        # Enhance region info using city-region lookup if available
        if city_en and city_en in city_region_map and not region_bg:
            # Handle both old and new field naming conventions
            region_entry = city_region_map[city_en]
            region_bg = region_entry.get('region_bulgarian') or region_entry.get('region_name')

        location = Location(
            city=city_en or city_bg,
            street_address=address,
            region=region_bg,
            country='BG'
        )

        # Attempt GeoNames lookup; a single failed lookup must not abort the
        # whole conversion run, so errors are reported and skipped.
        if city_en and geonames_db:
            try:
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    location.latitude = city_info.latitude
                    location.longitude = city_info.longitude
                    location.geonames_id = city_info.geonames_id
                    geocoded_count += 1
            except Exception as e:
                print(f"  Warning: Geocoding failed for {city_en}: {e}")

        # ===================================================================
        # STEP 4: Generate GHCID
        # ===================================================================
        ghcid = None
        ghcid_uuid = None
        ghcid_uuid_sha256 = None
        ghcid_numeric = None

        # Try to get region from lookup table or from library type field
        region_iso = None
        if city_en and city_en in city_region_map:
            # FIX: the original read `get('region_numeric') or
            # get('region_numeric')` — the same key twice, so the "old vs new
            # field name" fallback was a no-op. The intended alternate key is
            # unknown; NOTE(review): confirm against the lookup-table schema.
            region_iso = city_region_map[city_en].get('region_numeric')
        elif region_bg:
            # Fallback to extracted region from library type; default to the
            # Sofia code when the oblast name is not recognized.
            region_code = BULGARIAN_REGIONS.get(region_bg, 'BG-22')
            region_iso = region_code.split('-')[1]

        if city_en and region_iso and geonames_db:
            try:
                # Get city code from GeoNames
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    city_code = city_info.get_abbreviation()

                    # Generate abbreviation
                    abbreviation = generate_abbreviation_from_name(name_bg, isil_code)

                    # Build GHCID components
                    components = GHCIDComponents(
                        country_code='BG',
                        region_code=region_iso,
                        city_locode=city_code,
                        institution_type='L',  # Library
                        abbreviation=abbreviation
                    )

                    # Generate identifiers using GHCIDComponents methods
                    ghcid = components.to_string()
                    ghcid_uuid = str(components.to_uuid())
                    ghcid_uuid_sha256 = str(components.to_uuid_sha256())
                    ghcid_numeric = components.to_numeric()
                    ghcid_count += 1
            except Exception as e:
                print(f"  Warning: GHCID generation failed for {name_bg}: {e}")

        # ===================================================================
        # STEP 5: Enrich names (placeholder - Wikidata would go here)
        # ===================================================================
        # TODO: Query Wikidata for institutions with matching ISIL codes
        # For now, use existing names from registry

        # Build identifier list
        identifiers = [
            Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
            )
        ]
        if inst.get('website'):
            identifiers.append(
                Identifier(
                    identifier_scheme='Website',
                    identifier_value=inst['website'],
                    identifier_url=inst['website'] if inst['website'].startswith('http') else f"http://{inst['website']}"
                )
            )

        # Build contact info
        contact_info = {}
        if inst.get('email'):
            # If multiple emails, take the first one (schema expects single email)
            email = inst['email']
            if ',' in email:
                email = email.split(',')[0].strip()
            contact_info['email'] = email
        if inst.get('phone_fax'):
            contact_info['phone'] = inst['phone_fax']

        # Build collections metadata
        collections = []
        if inst.get('collections'):
            collections.append({
                'collection_name': 'General Collection',
                'collection_type': 'bibliographic',
                'collection_description': inst['collections'],  # Use proper schema field name
                'item_count': inst.get('collection_size', 'Not specified')  # Use proper schema field name
            })

        # Build provenance
        provenance = Provenance(
            data_source='CSV_REGISTRY',
            data_tier='TIER_1_AUTHORITATIVE',
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method='HTML table parsing from Bulgarian National Library ISIL registry',
            confidence_score=0.98,
            source_url='https://www.nationallibrary.bg/wp/?page_id=5686'
        )

        # Create HeritageCustodian record
        custodian = HeritageCustodian(
            id=f"https://w3id.org/heritage/custodian/bg/{isil_code.lower().replace('-', '')}",
            name=name_bg,
            alternative_names=alt_names if alt_names else None,
            institution_type=institution_type,
            ghcid=ghcid,
            ghcid_uuid=ghcid_uuid,
            ghcid_uuid_sha256=ghcid_uuid_sha256,
            ghcid_numeric=ghcid_numeric,
            description=library_type_bg,
            locations=[location],
            identifiers=identifiers,
            homepage=inst.get('website'),
            contact_info=contact_info if contact_info else None,
            collections=collections if collections else None,
            provenance=provenance
        )
        results.append(custodian)

    print(f"\n=== Conversion Complete ===")
    print(f"Total institutions: {len(results)}")
    # FIX: guard the percentage lines — an empty registry previously raised
    # ZeroDivisionError here.
    if results:
        print(f"Geocoded: {geocoded_count}/{len(results)} ({geocoded_count/len(results)*100:.1f}%)")
        print(f"GHCIDs generated: {ghcid_count}/{len(results)} ({ghcid_count/len(results)*100:.1f}%)")

    return results


def export_to_yaml(institutions: List[HeritageCustodian]) -> None:
    """
    Export institutions to LinkML-compliant YAML format.

    Note: Using custom YAML serialization to handle dataclasses.
    """
    import yaml

    # Convert dataclasses to dicts, emitting only populated optional fields
    institutions_dicts = []
    for inst in institutions:
        inst_dict = {
            'id': inst.id,
            'name': inst.name,
            'institution_type': inst.institution_type,
        }
        if inst.ghcid:
            inst_dict['ghcid_current'] = inst.ghcid
        if inst.ghcid_uuid:
            inst_dict['ghcid_uuid'] = inst.ghcid_uuid
        if inst.ghcid_uuid_sha256:
            inst_dict['ghcid_uuid_sha256'] = inst.ghcid_uuid_sha256
        # FIX: use `is not None` so a legitimate numeric 0 is not dropped
        if inst.ghcid_numeric is not None:
            inst_dict['ghcid_numeric'] = inst.ghcid_numeric  # type: ignore
        if inst.alternative_names:
            inst_dict['alternative_names'] = inst.alternative_names  # type: ignore
        if inst.description:
            inst_dict['description'] = inst.description
        if inst.homepage:
            inst_dict['homepage'] = inst.homepage

        # Locations
        if inst.locations:
            inst_dict['locations'] = []  # type: ignore
            for loc in inst.locations:
                loc_dict: Dict[str, Any] = {'country': loc.country}
                if loc.city:
                    loc_dict['city'] = loc.city
                if loc.street_address:
                    loc_dict['street_address'] = loc.street_address
                if loc.region:
                    loc_dict['region'] = loc.region
                # FIX: `is not None` so coordinates of exactly 0.0 survive
                if loc.latitude is not None:
                    loc_dict['latitude'] = loc.latitude  # type: ignore
                if loc.longitude is not None:
                    loc_dict['longitude'] = loc.longitude  # type: ignore
                if loc.geonames_id:
                    loc_dict['geonames_id'] = str(loc.geonames_id)  # type: ignore
                inst_dict['locations'].append(loc_dict)  # type: ignore

        # Identifiers
        if inst.identifiers:
            inst_dict['identifiers'] = []  # type: ignore
            for ident in inst.identifiers:
                ident_dict = {
                    'identifier_scheme': ident.identifier_scheme,
                    'identifier_value': ident.identifier_value
                }
                if ident.identifier_url:
                    ident_dict['identifier_url'] = ident.identifier_url
                inst_dict['identifiers'].append(ident_dict)  # type: ignore

        # Contact info
        if inst.contact_info:
            inst_dict['contact_info'] = inst.contact_info  # type: ignore

        # Collections
        if inst.collections:
            inst_dict['collections'] = inst.collections  # type: ignore

        # Provenance
        if inst.provenance:
            prov_dict: Dict[str, Any] = {
                'data_source': inst.provenance.data_source,
                'data_tier': inst.provenance.data_tier,
                'extraction_date': inst.provenance.extraction_date,
                'extraction_method': inst.provenance.extraction_method,
                'confidence_score': inst.provenance.confidence_score,  # type: ignore
            }
            if inst.provenance.source_url:
                prov_dict['source_url'] = inst.provenance.source_url
            inst_dict['provenance'] = prov_dict  # type: ignore

        institutions_dicts.append(inst_dict)

    # Write YAML with a human-readable comment header
    print(f"\nExporting to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Converted to LinkML-compliant format\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write('# Source: Bulgarian National Library ISIL Registry\n')
        f.write(f'# Total institutions: {len(institutions_dicts)}\n')
        f.write('\n')
        yaml.dump(institutions_dicts, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)

    print(f"✓ Exported {len(institutions_dicts)} institutions to {OUTPUT_FILE}")


# =============================================================================
# Main Entry Point
# =============================================================================


def main():
    """Main conversion workflow."""
    print("=" * 70)
    print("Bulgarian ISIL Registry → LinkML Conversion")
    print("=" * 70)
    print()

    # Check input file exists
    if not INPUT_FILE.exists():
        print(f"Error: Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # Check GeoNames database exists (non-fatal: conversion proceeds
    # without coordinates)
    if not GEONAMES_DB.exists():
        print(f"Warning: GeoNames database not found at {GEONAMES_DB}")
        print("Geocoding will be skipped. Run scripts to build GeoNames DB first.")

    # Convert institutions
    institutions = convert_bulgarian_institutions()

    # Export to YAML
    export_to_yaml(institutions)

    print()
    print("=" * 70)
    print("✓ Conversion Complete!")
    print("=" * 70)
    print()
    print("Next steps:")
    print("1. Review output: cat", OUTPUT_FILE)
    print("2. Validate schema: linkml-validate -s schemas/heritage_custodian.yaml", OUTPUT_FILE)
    print("3. Enrich with Wikidata: scripts/enrich_bulgarian_wikidata.py")
    print("4. Generate RDF: scripts/export_bulgarian_to_rdf.py")


if __name__ == '__main__':
    main()