#!/usr/bin/env python3
"""
Convert Bulgarian ISIL Registry to LinkML-compliant YAML format.
This script performs 5 integration steps:
1. Convert JSON to LinkML-compliant YAML format
2. Map library types to GLAMORCUBESFIXPHDNT taxonomy (all → LIBRARY)
3. Geocode addresses to lat/lon coordinates
4. Generate GHCIDs for all institutions
5. Enrich missing names from Wikidata
Input: data/isil/bulgarian_isil_registry.json
Output: data/instances/bulgaria_isil_libraries.yaml
"""
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))
from glam_extractor.identifiers.ghcid import GHCIDComponents
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
# =============================================================================
# Configuration
# =============================================================================
# All paths are resolved relative to the project root (parent of this
# script's directory) so the script can be launched from anywhere.
PROJECT_ROOT = Path(__file__).parent.parent
# Raw registry JSON scraped from the Bulgarian National Library ISIL pages.
INPUT_FILE = PROJECT_ROOT / "data/isil/bulgarian_isil_registry.json"
# LinkML-compliant YAML written by export_to_yaml().
OUTPUT_FILE = PROJECT_ROOT / "data/instances/bulgaria_isil_libraries.yaml"
# Optional GeoNames database used for geocoding; geocoding is skipped if absent.
GEONAMES_DB = PROJECT_ROOT / "data/reference/geonames.db"
# Optional JSON lookup mapping English city names to region metadata;
# improves GHCID coverage when present.
CITY_REGION_LOOKUP = PROJECT_ROOT / "data/reference/bulgarian_city_regions.json"
# Bulgarian administrative regions (oblasti) → ISO 3166-2:BG codes.
# Keys are the Cyrillic oblast names exactly as they appear in the
# registry's library-type field; values are ISO 3166-2 subdivision codes.
BULGARIAN_REGIONS = {
    'Благоевград': 'BG-01',  # Blagoevgrad
    'Бургас': 'BG-02',  # Burgas
    'Варна': 'BG-03',  # Varna
    'Велико Търново': 'BG-04',  # Veliko Tarnovo
    'Видин': 'BG-05',  # Vidin
    'Враца': 'BG-06',  # Vratsa
    'Габрово': 'BG-07',  # Gabrovo
    'Добрич': 'BG-08',  # Dobrich
    'Кърджали': 'BG-09',  # Kardzhali
    'Кюстендил': 'BG-10',  # Kyustendil
    'Ловеч': 'BG-11',  # Lovech
    'Монтана': 'BG-12',  # Montana
    'Пазарджик': 'BG-13',  # Pazardzhik
    'Перник': 'BG-14',  # Pernik
    'Плевен': 'BG-15',  # Pleven
    'Пловдив': 'BG-16',  # Plovdiv
    'Разград': 'BG-17',  # Razgrad
    'Русе': 'BG-18',  # Ruse
    'Силистра': 'BG-19',  # Silistra
    'Сливен': 'BG-20',  # Sliven
    'Смолян': 'BG-21',  # Smolyan
    'София': 'BG-22',  # Sofia (capital)
    'София област': 'BG-23',  # Sofia Province
    'Стара Загора': 'BG-24',  # Stara Zagora
    'Търговище': 'BG-25',  # Targovishte
    'Хасково': 'BG-26',  # Haskovo
    'Шумен': 'BG-27',  # Shumen
    'Ямбол': 'BG-28',  # Yambol
}
# City name mappings (Bulgarian Cyrillic → GeoNames English).
# Covers the 27 oblast capitals; cities not listed here are passed through
# in Cyrillic, so GeoNames lookups for them may fail.
BULGARIAN_CITY_MAPPINGS = {
    'София': 'Sofia',
    'Бургас': 'Burgas',
    'Варна': 'Varna',
    'Пловдив': 'Plovdiv',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Габрово': 'Gabrovo',
    'Видин': 'Vidin',
    'Враца': 'Vratsa',
    'Велико Търново': 'Veliko Tarnovo',
    'Ловеч': 'Lovech',
    'Кюстендил': 'Kyustendil',
    'Благоевград': 'Blagoevgrad',
    'Пазарджик': 'Pazardzhik',
    'Монтана': 'Montana',
    'Кърджали': 'Kardzhali',
    'Смолян': 'Smolyan',
    'Силистра': 'Silistra',
    'Разград': 'Razgrad',
    'Търговище': 'Targovishte',
    'Хасково': 'Haskovo',
    'Шумен': 'Shumen',
    'Ямбол': 'Yambol',
    'Перник': 'Pernik',
}
# =============================================================================
# Data Models
# =============================================================================
@dataclass
class Location:
    """Physical location of a heritage custodian site."""
    city: Optional[str] = None             # English name preferred, Cyrillic fallback
    street_address: Optional[str] = None   # raw address string from the registry
    postal_code: Optional[str] = None
    region: Optional[str] = None           # oblast name (Bulgarian Cyrillic)
    country: str = 'BG'                    # ISO 3166-1 alpha-2; always Bulgaria here
    latitude: Optional[float] = None       # filled by GeoNames geocoding, if available
    longitude: Optional[float] = None      # filled by GeoNames geocoding, if available
    geonames_id: Optional[int] = None      # GeoNames id of the matched city
@dataclass
class Identifier:
    """External identifier (ISIL, Wikidata, website, ...) for an institution."""
    identifier_scheme: str                 # e.g. 'ISIL', 'Website'
    identifier_value: str                  # the identifier itself
    identifier_url: Optional[str] = None   # resolvable URL form, when one exists
@dataclass
class Provenance:
    """Metadata describing where and how a record was obtained."""
    data_source: str                       # e.g. 'CSV_REGISTRY'
    data_tier: str                         # trust tier, e.g. 'TIER_1_AUTHORITATIVE'
    extraction_date: str                   # ISO-8601 UTC timestamp of extraction
    extraction_method: str                 # human-readable pipeline description
    confidence_score: float                # confidence in the extracted data (0.0-1.0)
    conversation_id: Optional[str] = None
    source_url: Optional[str] = None       # URL of the upstream registry page
@dataclass
class HeritageCustodian:
    """Main heritage custodian record (LinkML-compliant)."""
    id: str                                # w3id.org URI derived from the ISIL code
    name: str                              # primary (Bulgarian) name
    institution_type: str                  # GLAMORCUBESFIXPHDNT class, e.g. 'LIBRARY'
    ghcid: Optional[str] = None            # human-readable GHCID string
    ghcid_uuid: Optional[str] = None       # UUID form of the GHCID
    ghcid_uuid_sha256: Optional[str] = None  # SHA-256-derived UUID form of the GHCID
    ghcid_numeric: Optional[int] = None    # numeric form of the GHCID
    alternative_names: Optional[List[str]] = None  # English name and variants
    description: Optional[str] = None      # library-type text from the registry
    locations: Optional[List[Location]] = None
    identifiers: Optional[List[Identifier]] = None
    homepage: Optional[str] = None         # institution website, if listed
    contact_info: Optional[Dict[str, Any]] = None  # keys: 'email', 'phone'
    collections: Optional[List[Dict[str, Any]]] = None
    provenance: Optional[Provenance] = None
# =============================================================================
# Utilities
# =============================================================================
def extract_city_from_address(address: str) -> Optional[str]:
    """
    Extract the city (or village) name from a Bulgarian address string.

    Recognized address shapes:
    - "гр. София 1504, бул. ..." → "София" (city prefix)
    - "Бургас 8000, ул. ..."     → "Бургас" (bare city + postal code)
    - "с. Плетена 2954, ..."     → "Плетена" (village prefix)

    Returns:
        City name in Bulgarian Cyrillic, or None for empty input.
    """
    if not address:
        return None
    text = address.strip()
    # Try the most specific pattern first; fall back to looser ones.
    patterns = (
        r'(?:гр\.|с\.)\s*([А-Яа-я\s\-]+?)\s*\d{4}',  # "гр."/"с." prefix + postal code
        r'([А-Яа-я\s\-]+?)\s*\d{4}',                 # bare city + postal code
        r'([А-Яа-я\s\-]+)',                          # leading Cyrillic run, no digits
    )
    for pattern in patterns:
        found = re.match(pattern, text)
        if found:
            return found.group(1).strip()
    return None
def extract_region_from_library_type(library_type: str) -> Optional[str]:
    """
    Extract the oblast name from a library-type string.

    Bulgarian regional libraries embed the oblast in the type field:
    "Регионална библиотека (Област Бургас)" → "Бургас"

    Returns:
        Region name in Cyrillic, or None if the field carries no region.
    """
    if not library_type:
        return None
    found = re.search(r'Област\s+([А-Яа-я\s]+)', library_type)
    return found.group(1).strip() if found else None
# BGN/PCGN romanization table for Bulgarian Cyrillic → Latin.
# Built once at import time; the original rebuilt the dict and walked the
# string in a Python loop on every call.
_BULGARIAN_TRANSLIT_TABLE = str.maketrans({
    'А': 'A', 'а': 'a',
    'Б': 'B', 'б': 'b',
    'В': 'V', 'в': 'v',
    'Г': 'G', 'г': 'g',
    'Д': 'D', 'д': 'd',
    'Е': 'E', 'е': 'e',
    'Ж': 'Zh', 'ж': 'zh',
    'З': 'Z', 'з': 'z',
    'И': 'I', 'и': 'i',
    'Й': 'Y', 'й': 'y',
    'К': 'K', 'к': 'k',
    'Л': 'L', 'л': 'l',
    'М': 'M', 'м': 'm',
    'Н': 'N', 'н': 'n',
    'О': 'O', 'о': 'o',
    'П': 'P', 'п': 'p',
    'Р': 'R', 'р': 'r',
    'С': 'S', 'с': 's',
    'Т': 'T', 'т': 't',
    'У': 'U', 'у': 'u',
    'Ф': 'F', 'ф': 'f',
    'Х': 'H', 'х': 'h',
    'Ц': 'Ts', 'ц': 'ts',
    'Ч': 'Ch', 'ч': 'ch',
    'Ш': 'Sh', 'ш': 'sh',
    'Щ': 'Sht', 'щ': 'sht',
    'Ъ': 'A', 'ъ': 'a',
    'Ь': 'Y', 'ь': 'y',
    'Ю': 'Yu', 'ю': 'yu',
    'Я': 'Ya', 'я': 'ya',
})


def transliterate_bulgarian(text: str) -> str:
    """
    Transliterate Bulgarian Cyrillic to the Latin alphabet.

    Uses the BGN/PCGN romanization standard for Bulgarian. Characters not
    in the table (Latin letters, digits, punctuation) pass through unchanged.

    Args:
        text: Input string, possibly mixing Cyrillic and Latin characters.

    Returns:
        The transliterated string.
    """
    # str.translate accepts multi-character replacements, so digraphs such
    # as 'Ж' → 'Zh' map correctly in a single C-level pass.
    return text.translate(_BULGARIAN_TRANSLIT_TABLE)
def generate_abbreviation_from_name(name: str, isil_code: str) -> str:
    """
    Generate an institution abbreviation for GHCID construction.

    Strategy:
    1. If the name carries an explicit abbreviation in parentheses, use it
       (transliterated to Latin if Cyrillic, uppercased, capped at 10 chars).
    2. Otherwise, fall back to the last 4 digits of the ISIL code.

    Examples:
        'Национална библиотека ... (НБКМ)' → 'NBKM'
        'Библиотека при НЧ...' with ISIL 'BG-0130000' → '0000'
    """
    paren = re.search(r'\(([А-Яа-яA-Za-z0-9]+)\)', name or '')
    if paren:
        candidate = paren.group(1)
        # Romanize any Cyrillic abbreviation before uppercasing.
        if any('\u0400' <= ch <= '\u04FF' for ch in candidate):
            candidate = transliterate_bulgarian(candidate)
        return candidate.upper()[:10]
    # No parenthesized abbreviation: derive one from the ISIL suffix.
    return isil_code.split('-')[-1][-4:]
def map_bulgarian_library_type(library_type: str) -> str:
    """
    Map a Bulgarian library type onto the GLAMORCUBESFIXPHDNT taxonomy.

    Every institution in this registry is a library — national, regional
    (oblast-level), university/academic, community center (chitalishta),
    municipal, or scientific — so the mapping is the constant LIBRARY
    class regardless of the input string. The parameter is retained for
    interface symmetry with other registry converters.
    """
    return 'LIBRARY'
# =============================================================================
# Main Conversion Logic
# =============================================================================
def convert_bulgarian_institutions() -> List[HeritageCustodian]:
    """
    Convert the Bulgarian ISIL registry JSON into LinkML-compliant records.

    Performs 5 integration steps:
    1. Parse JSON and extract fields
    2. Map library types to GLAMORCUBESFIXPHDNT taxonomy
    3. Geocode addresses using GeoNames
    4. Generate GHCIDs with UUIDs
    5. Enrich names (placeholder - Wikidata enrichment would go here)

    Returns:
        One HeritageCustodian per registry entry, in registry order.
    """
    print("Loading Bulgarian ISIL registry...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions_json = data['institutions']
    print(f"Found {len(institutions_json)} institutions")

    # The city→region lookup table is optional; without it GHCID coverage
    # is limited to entries whose library-type field names the oblast.
    print("Loading city-region lookup table...")
    city_region_map: Dict[str, Any] = {}
    if CITY_REGION_LOOKUP.exists():
        with open(CITY_REGION_LOOKUP, 'r', encoding='utf-8') as f:
            city_region_map = json.load(f)
        print(f"Loaded {len(city_region_map)} city-region mappings")
    else:
        print("Warning: City-region lookup not found, limited GHCID coverage")

    # Geocoding backend is optional too; skip lookups when the DB is absent.
    print("Initializing GeoNames database...")
    if GEONAMES_DB.exists():
        geonames_db = GeoNamesDB(db_path=GEONAMES_DB)
    else:
        print("Warning: GeoNames database not found, geocoding will be skipped")
        geonames_db = None

    results = []
    geocoded_count = 0
    ghcid_count = 0
    for idx, inst in enumerate(institutions_json, 1):
        if idx % 10 == 0:
            print(f"Processing institution {idx}/{len(institutions_json)}...")

        # ===================================================================
        # STEP 1: Extract basic fields
        # ===================================================================
        isil_code = inst['isil']
        name_bg = inst.get('name_bg') or f"Library {isil_code}"
        name_en = inst.get('name_en')
        library_type_bg = inst.get('library_type', '')
        address = inst.get('address', '')

        # Build alternative names list.
        alt_names = []
        if name_en:
            alt_names.append(name_en)
        variants = inst.get('name_variants')
        if variants:
            # BUGFIX: 'name_variants' may be a list; the original appended it
            # wholesale, nesting a list inside alternative_names. Extend for
            # lists, append for a single string.
            if isinstance(variants, list):
                alt_names.extend(variants)
            else:
                alt_names.append(variants)

        # ===================================================================
        # STEP 2: Map to GLAMORCUBESFIXPHDNT taxonomy
        # ===================================================================
        institution_type = map_bulgarian_library_type(library_type_bg)

        # ===================================================================
        # STEP 3: Geocode address
        # ===================================================================
        city_bg = extract_city_from_address(address)
        region_bg = extract_region_from_library_type(library_type_bg)
        # Map Bulgarian city to English for the GeoNames lookup; unmapped
        # cities fall through in Cyrillic.
        city_en = BULGARIAN_CITY_MAPPINGS.get(city_bg, city_bg) if city_bg else None
        # Fill in the region from the lookup table when the library-type
        # field did not carry one (handles old and new field names).
        if city_en and city_en in city_region_map and not region_bg:
            region_entry = city_region_map[city_en]
            region_bg = region_entry.get('region_bulgarian') or region_entry.get('region_name')

        location = Location(
            city=city_en or city_bg,
            street_address=address,
            region=region_bg,
            country='BG'
        )
        # Attempt GeoNames lookup; a failed lookup must not abort the batch.
        if city_en and geonames_db:
            try:
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    location.latitude = city_info.latitude
                    location.longitude = city_info.longitude
                    location.geonames_id = city_info.geonames_id
                    geocoded_count += 1
            except Exception as e:
                print(f" Warning: Geocoding failed for {city_en}: {e}")

        # ===================================================================
        # STEP 4: Generate GHCID
        # ===================================================================
        ghcid = None
        ghcid_uuid = None
        ghcid_uuid_sha256 = None
        ghcid_numeric = None

        # Resolve the numeric region code: prefer the lookup table, then
        # fall back to the region parsed out of the library-type field.
        region_iso = None
        if city_en and city_en in city_region_map:
            region_entry = city_region_map[city_en]
            # BUGFIX: the original read the same key twice in an 'or'
            # (.get('region_numeric') or .get('region_numeric')).
            region_iso = region_entry.get('region_numeric')
        elif region_bg:
            # Unknown region names default to BG-22 (Sofia, the capital).
            region_code = BULGARIAN_REGIONS.get(region_bg, 'BG-22')
            region_iso = region_code.split('-')[1]

        if city_en and region_iso and geonames_db:
            try:
                # Get city code from GeoNames
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    city_code = city_info.get_abbreviation()
                    abbreviation = generate_abbreviation_from_name(name_bg, isil_code)
                    components = GHCIDComponents(
                        country_code='BG',
                        region_code=region_iso,
                        city_locode=city_code,
                        institution_type='L',  # Library
                        abbreviation=abbreviation
                    )
                    # Derive every identifier representation from the components.
                    ghcid = components.to_string()
                    ghcid_uuid = str(components.to_uuid())
                    ghcid_uuid_sha256 = str(components.to_uuid_sha256())
                    ghcid_numeric = components.to_numeric()
                    ghcid_count += 1
            except Exception as e:
                print(f" Warning: GHCID generation failed for {name_bg}: {e}")

        # ===================================================================
        # STEP 5: Enrich names (placeholder - Wikidata would go here)
        # ===================================================================
        # TODO: Query Wikidata for institutions with matching ISIL codes
        # For now, use existing names from registry

        # Build identifier list
        identifiers = [
            Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
            )
        ]
        if inst.get('website'):
            identifiers.append(
                Identifier(
                    identifier_scheme='Website',
                    identifier_value=inst['website'],
                    identifier_url=inst['website'] if inst['website'].startswith('http') else f"http://{inst['website']}"
                )
            )

        # Build contact info
        contact_info = {}
        if inst.get('email'):
            # If multiple emails, take the first one (schema expects single email)
            email = inst['email']
            if ',' in email:
                email = email.split(',')[0].strip()
            contact_info['email'] = email
        if inst.get('phone_fax'):
            contact_info['phone'] = inst['phone_fax']

        # Build collections metadata (field names match the LinkML schema)
        collections = []
        if inst.get('collections'):
            collections.append({
                'collection_name': 'General Collection',
                'collection_type': 'bibliographic',
                'collection_description': inst['collections'],
                'item_count': inst.get('collection_size', 'Not specified')
            })

        # Build provenance
        provenance = Provenance(
            data_source='CSV_REGISTRY',
            data_tier='TIER_1_AUTHORITATIVE',
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method='HTML table parsing from Bulgarian National Library ISIL registry',
            confidence_score=0.98,
            source_url='https://www.nationallibrary.bg/wp/?page_id=5686'
        )

        # Create HeritageCustodian record
        custodian = HeritageCustodian(
            id=f"https://w3id.org/heritage/custodian/bg/{isil_code.lower().replace('-', '')}",
            name=name_bg,
            alternative_names=alt_names if alt_names else None,
            institution_type=institution_type,
            ghcid=ghcid,
            ghcid_uuid=ghcid_uuid,
            ghcid_uuid_sha256=ghcid_uuid_sha256,
            ghcid_numeric=ghcid_numeric,
            description=library_type_bg,
            locations=[location],
            identifiers=identifiers,
            homepage=inst.get('website'),
            contact_info=contact_info if contact_info else None,
            collections=collections if collections else None,
            provenance=provenance
        )
        results.append(custodian)

    total = len(results)
    print("\n=== Conversion Complete ===")
    print(f"Total institutions: {total}")
    # BUGFIX: guard the percentage prints — an empty registry previously
    # raised ZeroDivisionError here.
    if total:
        print(f"Geocoded: {geocoded_count}/{total} ({geocoded_count/total*100:.1f}%)")
        print(f"GHCIDs generated: {ghcid_count}/{total} ({ghcid_count/total*100:.1f}%)")
    return results
def export_to_yaml(institutions: List[HeritageCustodian]) -> None:
    """
    Export institutions to LinkML-compliant YAML format.

    Uses hand-rolled dict construction (rather than dataclasses.asdict) so
    that only populated fields are emitted and output keys can differ from
    attribute names (e.g. ghcid → ghcid_current). Numeric fields are
    compared against None — not truthiness — so legitimate zero values
    survive export.

    Args:
        institutions: Records produced by convert_bulgarian_institutions().

    Side effects:
        Writes OUTPUT_FILE, creating parent directories as needed.
    """
    import yaml  # local import: PyYAML is only needed for this export step

    # Convert dataclasses to plain dicts, omitting empty fields.
    institutions_dicts = []
    for inst in institutions:
        inst_dict: Dict[str, Any] = {
            'id': inst.id,
            'name': inst.name,
            'institution_type': inst.institution_type,
        }
        if inst.ghcid:
            inst_dict['ghcid_current'] = inst.ghcid
        if inst.ghcid_uuid:
            inst_dict['ghcid_uuid'] = inst.ghcid_uuid
        if inst.ghcid_uuid_sha256:
            inst_dict['ghcid_uuid_sha256'] = inst.ghcid_uuid_sha256
        # BUGFIX: 'if inst.ghcid_numeric:' dropped a legitimate value of 0.
        if inst.ghcid_numeric is not None:
            inst_dict['ghcid_numeric'] = inst.ghcid_numeric
        if inst.alternative_names:
            inst_dict['alternative_names'] = inst.alternative_names
        if inst.description:
            inst_dict['description'] = inst.description
        if inst.homepage:
            inst_dict['homepage'] = inst.homepage

        # Locations: emit only populated attributes.
        if inst.locations:
            loc_dicts = []
            for loc in inst.locations:
                loc_dict: Dict[str, Any] = {'country': loc.country}
                if loc.city:
                    loc_dict['city'] = loc.city
                if loc.street_address:
                    loc_dict['street_address'] = loc.street_address
                if loc.region:
                    loc_dict['region'] = loc.region
                # BUGFIX: 0.0 is a valid coordinate; the original truthiness
                # tests silently dropped it.
                if loc.latitude is not None:
                    loc_dict['latitude'] = loc.latitude
                if loc.longitude is not None:
                    loc_dict['longitude'] = loc.longitude
                if loc.geonames_id is not None:
                    loc_dict['geonames_id'] = str(loc.geonames_id)
                loc_dicts.append(loc_dict)
            inst_dict['locations'] = loc_dicts

        # Identifiers
        if inst.identifiers:
            ident_dicts = []
            for ident in inst.identifiers:
                ident_dict: Dict[str, Any] = {
                    'identifier_scheme': ident.identifier_scheme,
                    'identifier_value': ident.identifier_value
                }
                if ident.identifier_url:
                    ident_dict['identifier_url'] = ident.identifier_url
                ident_dicts.append(ident_dict)
            inst_dict['identifiers'] = ident_dicts

        # Contact info and collections are already plain dicts/lists.
        if inst.contact_info:
            inst_dict['contact_info'] = inst.contact_info
        if inst.collections:
            inst_dict['collections'] = inst.collections

        # Provenance
        if inst.provenance:
            prov_dict: Dict[str, Any] = {
                'data_source': inst.provenance.data_source,
                'data_tier': inst.provenance.data_tier,
                'extraction_date': inst.provenance.extraction_date,
                'extraction_method': inst.provenance.extraction_method,
                'confidence_score': inst.provenance.confidence_score,
            }
            if inst.provenance.source_url:
                prov_dict['source_url'] = inst.provenance.source_url
            inst_dict['provenance'] = prov_dict

        institutions_dicts.append(inst_dict)

    # Write a commented YAML header followed by the record list.
    print(f"\nExporting to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Converted to LinkML-compliant format\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write('# Source: Bulgarian National Library ISIL Registry\n')
        f.write(f'# Total institutions: {len(institutions_dicts)}\n')
        f.write('\n')
        yaml.dump(institutions_dicts, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Exported {len(institutions_dicts)} institutions to {OUTPUT_FILE}")
# =============================================================================
# Main Entry Point
# =============================================================================
def main():
    """Entry point: validate inputs, run the conversion, export the YAML."""
    banner = "=" * 70
    print(banner)
    print("Bulgarian ISIL Registry → LinkML Conversion")
    print(banner)
    print()

    # The registry JSON is mandatory — bail out early if it is missing.
    if not INPUT_FILE.exists():
        print(f"Error: Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # The GeoNames database is optional: warn but continue without geocoding.
    if not GEONAMES_DB.exists():
        print(f"Warning: GeoNames database not found at {GEONAMES_DB}")
        print("Geocoding will be skipped. Run scripts to build GeoNames DB first.")

    # Convert, then serialize.
    institutions = convert_bulgarian_institutions()
    export_to_yaml(institutions)

    print()
    print(banner)
    print("✓ Conversion Complete!")
    print(banner)
    print()
    print("Next steps:")
    print("1. Review output: cat", OUTPUT_FILE)
    print("2. Validate schema: linkml-validate -s schemas/heritage_custodian.yaml", OUTPUT_FILE)
    print("3. Enrich with Wikidata: scripts/enrich_bulgarian_wikidata.py")
    print("4. Generate RDF: scripts/export_bulgarian_to_rdf.py")


if __name__ == '__main__':
    main()