#!/usr/bin/env python3
"""
Convert Bulgarian ISIL Registry to LinkML-compliant YAML format.
This script performs 5 integration steps:
1. Convert JSON to LinkML-compliant YAML format
2. Map library types to GLAMORCUBESFIXPHDNT taxonomy (all → LIBRARY)
3. Geocode addresses to lat/lon coordinates
4. Generate GHCIDs for all institutions
5. Enrich missing names from Wikidata
Input: data/isil/bulgarian_isil_registry.json
Output: data/instances/bulgaria_isil_libraries.yaml
"""
import json
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, asdict
# Add project root to path
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))
from glam_extractor.identifiers.ghcid import GHCIDComponents
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
# =============================================================================
# Configuration
# =============================================================================
# All paths are resolved relative to the project root (parent of this
# script's directory) so the script can be launched from anywhere.
PROJECT_ROOT = Path(__file__).parent.parent
# Raw registry JSON scraped from the Bulgarian National Library ISIL pages.
INPUT_FILE = PROJECT_ROOT / "data/isil/bulgarian_isil_registry.json"
# LinkML-compliant YAML written by export_to_yaml().
OUTPUT_FILE = PROJECT_ROOT / "data/instances/bulgaria_isil_libraries.yaml"
# Optional GeoNames database used for geocoding; geocoding is skipped if absent.
GEONAMES_DB = PROJECT_ROOT / "data/reference/geonames.db"
# Optional JSON lookup mapping English city names to region metadata;
# improves GHCID coverage when present.
CITY_REGION_LOOKUP = PROJECT_ROOT / "data/reference/bulgarian_city_regions.json"
# Bulgarian administrative regions (oblasti) → ISO 3166-2:BG codes.
# Keys are the Cyrillic oblast names exactly as they appear in the
# registry's library-type field; values are ISO 3166-2 subdivision codes.
BULGARIAN_REGIONS = {
    'Благоевград': 'BG-01',  # Blagoevgrad
    'Бургас': 'BG-02',  # Burgas
    'Варна': 'BG-03',  # Varna
    'Велико Търново': 'BG-04',  # Veliko Tarnovo
    'Видин': 'BG-05',  # Vidin
    'Враца': 'BG-06',  # Vratsa
    'Габрово': 'BG-07',  # Gabrovo
    'Добрич': 'BG-08',  # Dobrich
    'Кърджали': 'BG-09',  # Kardzhali
    'Кюстендил': 'BG-10',  # Kyustendil
    'Ловеч': 'BG-11',  # Lovech
    'Монтана': 'BG-12',  # Montana
    'Пазарджик': 'BG-13',  # Pazardzhik
    'Перник': 'BG-14',  # Pernik
    'Плевен': 'BG-15',  # Pleven
    'Пловдив': 'BG-16',  # Plovdiv
    'Разград': 'BG-17',  # Razgrad
    'Русе': 'BG-18',  # Ruse
    'Силистра': 'BG-19',  # Silistra
    'Сливен': 'BG-20',  # Sliven
    'Смолян': 'BG-21',  # Smolyan
    'София': 'BG-22',  # Sofia (capital)
    'София област': 'BG-23',  # Sofia Province
    'Стара Загора': 'BG-24',  # Stara Zagora
    'Търговище': 'BG-25',  # Targovishte
    'Хасково': 'BG-26',  # Haskovo
    'Шумен': 'BG-27',  # Shumen
    'Ямбол': 'BG-28',  # Yambol
}
# City name mappings (Bulgarian Cyrillic → GeoNames English).
# Covers the 27 oblast capitals; cities not listed here are passed through
# in Cyrillic, so GeoNames lookups for them may fail.
BULGARIAN_CITY_MAPPINGS = {
    'София': 'Sofia',
    'Бургас': 'Burgas',
    'Варна': 'Varna',
    'Пловдив': 'Plovdiv',
    'Русе': 'Ruse',
    'Стара Загора': 'Stara Zagora',
    'Плевен': 'Pleven',
    'Сливен': 'Sliven',
    'Добрич': 'Dobrich',
    'Габрово': 'Gabrovo',
    'Видин': 'Vidin',
    'Враца': 'Vratsa',
    'Велико Търново': 'Veliko Tarnovo',
    'Ловеч': 'Lovech',
    'Кюстендил': 'Kyustendil',
    'Благоевград': 'Blagoevgrad',
    'Пазарджик': 'Pazardzhik',
    'Монтана': 'Montana',
    'Кърджали': 'Kardzhali',
    'Смолян': 'Smolyan',
    'Силистра': 'Silistra',
    'Разград': 'Razgrad',
    'Търговище': 'Targovishte',
    'Хасково': 'Haskovo',
    'Шумен': 'Shumen',
    'Ямбол': 'Yambol',
    'Перник': 'Pernik',
}
# =============================================================================
# Data Models
# =============================================================================
@dataclass
class Location:
    """Physical location of a heritage custodian site."""
    city: Optional[str] = None             # English name preferred, Cyrillic fallback
    street_address: Optional[str] = None   # raw address string from the registry
    postal_code: Optional[str] = None
    region: Optional[str] = None           # oblast name (Bulgarian Cyrillic)
    country: str = 'BG'                    # ISO 3166-1 alpha-2; always Bulgaria here
    latitude: Optional[float] = None       # filled by GeoNames geocoding, if available
    longitude: Optional[float] = None      # filled by GeoNames geocoding, if available
    geonames_id: Optional[int] = None      # GeoNames id of the matched city
@dataclass
class Identifier:
    """External identifier (ISIL, Wikidata, website, ...) for an institution."""
    identifier_scheme: str                 # e.g. 'ISIL', 'Website'
    identifier_value: str                  # the identifier itself
    identifier_url: Optional[str] = None   # resolvable URL form, when one exists
@dataclass
class Provenance:
    """Metadata describing where and how a record was obtained."""
    data_source: str                       # e.g. 'CSV_REGISTRY'
    data_tier: str                         # trust tier, e.g. 'TIER_1_AUTHORITATIVE'
    extraction_date: str                   # ISO-8601 UTC timestamp of extraction
    extraction_method: str                 # human-readable pipeline description
    confidence_score: float                # confidence in the extracted data (0.0-1.0)
    conversation_id: Optional[str] = None
    source_url: Optional[str] = None       # URL of the upstream registry page
@dataclass
class HeritageCustodian:
    """Main heritage custodian record (LinkML-compliant)."""
    id: str                                # w3id.org URI derived from the ISIL code
    name: str                              # primary (Bulgarian) name
    institution_type: str                  # GLAMORCUBESFIXPHDNT class, e.g. 'LIBRARY'
    ghcid: Optional[str] = None            # human-readable GHCID string
    ghcid_uuid: Optional[str] = None       # UUID form of the GHCID
    ghcid_uuid_sha256: Optional[str] = None  # SHA-256-derived UUID form of the GHCID
    ghcid_numeric: Optional[int] = None    # numeric form of the GHCID
    alternative_names: Optional[List[str]] = None  # English name and variants
    description: Optional[str] = None      # library-type text from the registry
    locations: Optional[List[Location]] = None
    identifiers: Optional[List[Identifier]] = None
    homepage: Optional[str] = None         # institution website, if listed
    contact_info: Optional[Dict[str, Any]] = None  # keys: 'email', 'phone'
    collections: Optional[List[Dict[str, Any]]] = None
    provenance: Optional[Provenance] = None
# =============================================================================
# Utilities
# =============================================================================
def extract_city_from_address(address: str) -> Optional[str]:
    """
    Extract the city (or village) name from a Bulgarian address string.

    Recognized address shapes:
    - "гр. София 1504, бул. ..." → "София" (city prefix)
    - "Бургас 8000, ул. ..."     → "Бургас" (bare city + postal code)
    - "с. Плетена 2954, ..."     → "Плетена" (village prefix)

    Returns:
        City name in Bulgarian Cyrillic, or None for empty input.
    """
    if not address:
        return None
    text = address.strip()
    # Try the most specific pattern first; fall back to looser ones.
    patterns = (
        r'(?:гр\.|с\.)\s*([А-Яа-я\s\-]+?)\s*\d{4}',  # "гр."/"с." prefix + postal code
        r'([А-Яа-я\s\-]+?)\s*\d{4}',                 # bare city + postal code
        r'([А-Яа-я\s\-]+)',                          # leading Cyrillic run, no digits
    )
    for pattern in patterns:
        found = re.match(pattern, text)
        if found:
            return found.group(1).strip()
    return None
def extract_region_from_library_type(library_type: str) -> Optional[str]:
    """
    Extract the oblast name from a library-type string.

    Bulgarian regional libraries embed the oblast in the type field:
    "Регионална библиотека (Област Бургас)" → "Бургас"

    Returns:
        Region name in Cyrillic, or None if the field carries no region.
    """
    if not library_type:
        return None
    found = re.search(r'Област\s+([А-Яа-я\s]+)', library_type)
    return found.group(1).strip() if found else None
# BGN/PCGN romanization table for Bulgarian Cyrillic → Latin.
# Built once at import time; the original rebuilt the dict and walked the
# string in a Python loop on every call.
_BULGARIAN_TRANSLIT_TABLE = str.maketrans({
    'А': 'A', 'а': 'a',
    'Б': 'B', 'б': 'b',
    'В': 'V', 'в': 'v',
    'Г': 'G', 'г': 'g',
    'Д': 'D', 'д': 'd',
    'Е': 'E', 'е': 'e',
    'Ж': 'Zh', 'ж': 'zh',
    'З': 'Z', 'з': 'z',
    'И': 'I', 'и': 'i',
    'Й': 'Y', 'й': 'y',
    'К': 'K', 'к': 'k',
    'Л': 'L', 'л': 'l',
    'М': 'M', 'м': 'm',
    'Н': 'N', 'н': 'n',
    'О': 'O', 'о': 'o',
    'П': 'P', 'п': 'p',
    'Р': 'R', 'р': 'r',
    'С': 'S', 'с': 's',
    'Т': 'T', 'т': 't',
    'У': 'U', 'у': 'u',
    'Ф': 'F', 'ф': 'f',
    'Х': 'H', 'х': 'h',
    'Ц': 'Ts', 'ц': 'ts',
    'Ч': 'Ch', 'ч': 'ch',
    'Ш': 'Sh', 'ш': 'sh',
    'Щ': 'Sht', 'щ': 'sht',
    'Ъ': 'A', 'ъ': 'a',
    'Ь': 'Y', 'ь': 'y',
    'Ю': 'Yu', 'ю': 'yu',
    'Я': 'Ya', 'я': 'ya',
})


def transliterate_bulgarian(text: str) -> str:
    """
    Transliterate Bulgarian Cyrillic to the Latin alphabet.

    Uses the BGN/PCGN romanization standard for Bulgarian. Characters not
    in the table (Latin letters, digits, punctuation) pass through unchanged.

    Args:
        text: Input string, possibly mixing Cyrillic and Latin characters.

    Returns:
        The transliterated string.
    """
    # str.translate accepts multi-character replacements, so digraphs such
    # as 'Ж' → 'Zh' map correctly in a single C-level pass.
    return text.translate(_BULGARIAN_TRANSLIT_TABLE)
def generate_abbreviation_from_name(name: str, isil_code: str) -> str:
    """
    Generate an institution abbreviation for GHCID construction.

    Strategy:
    1. If the name carries an explicit abbreviation in parentheses, use it
       (transliterated to Latin if Cyrillic, uppercased, capped at 10 chars).
    2. Otherwise, fall back to the last 4 digits of the ISIL code.

    Examples:
        'Национална библиотека ... (НБКМ)' → 'NBKM'
        'Библиотека при НЧ...' with ISIL 'BG-0130000' → '0000'
    """
    paren = re.search(r'\(([А-Яа-яA-Za-z0-9]+)\)', name or '')
    if paren:
        candidate = paren.group(1)
        # Romanize any Cyrillic abbreviation before uppercasing.
        if any('\u0400' <= ch <= '\u04FF' for ch in candidate):
            candidate = transliterate_bulgarian(candidate)
        return candidate.upper()[:10]
    # No parenthesized abbreviation: derive one from the ISIL suffix.
    return isil_code.split('-')[-1][-4:]
def map_bulgarian_library_type(library_type: str) -> str:
    """
    Map a Bulgarian library type onto the GLAMORCUBESFIXPHDNT taxonomy.

    Every institution in this registry is a library — national, regional
    (oblast-level), university/academic, community center (chitalishta),
    municipal, or scientific — so the mapping is the constant LIBRARY
    class regardless of the input string. The parameter is retained for
    interface symmetry with other registry converters.
    """
    return 'LIBRARY'
# =============================================================================
# Main Conversion Logic
# =============================================================================
def convert_bulgarian_institutions() -> List[HeritageCustodian]:
    """
    Convert the Bulgarian ISIL registry JSON into LinkML-compliant records.

    Performs 5 integration steps:
    1. Parse JSON and extract fields
    2. Map library types to GLAMORCUBESFIXPHDNT taxonomy
    3. Geocode addresses using GeoNames
    4. Generate GHCIDs with UUIDs
    5. Enrich names (placeholder - Wikidata enrichment would go here)

    Returns:
        One HeritageCustodian per registry entry, in registry order.
    """
    print("Loading Bulgarian ISIL registry...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)
    institutions_json = data['institutions']
    print(f"Found {len(institutions_json)} institutions")

    # The city→region lookup table is optional; without it GHCID coverage
    # is limited to entries whose library-type field names the oblast.
    print("Loading city-region lookup table...")
    city_region_map: Dict[str, Any] = {}
    if CITY_REGION_LOOKUP.exists():
        with open(CITY_REGION_LOOKUP, 'r', encoding='utf-8') as f:
            city_region_map = json.load(f)
        print(f"Loaded {len(city_region_map)} city-region mappings")
    else:
        print("Warning: City-region lookup not found, limited GHCID coverage")

    # Geocoding backend is optional too; skip lookups when the DB is absent.
    print("Initializing GeoNames database...")
    if GEONAMES_DB.exists():
        geonames_db = GeoNamesDB(db_path=GEONAMES_DB)
    else:
        print("Warning: GeoNames database not found, geocoding will be skipped")
        geonames_db = None

    results = []
    geocoded_count = 0
    ghcid_count = 0
    for idx, inst in enumerate(institutions_json, 1):
        if idx % 10 == 0:
            print(f"Processing institution {idx}/{len(institutions_json)}...")

        # ===================================================================
        # STEP 1: Extract basic fields
        # ===================================================================
        isil_code = inst['isil']
        name_bg = inst.get('name_bg') or f"Library {isil_code}"
        name_en = inst.get('name_en')
        library_type_bg = inst.get('library_type', '')
        address = inst.get('address', '')

        # Build alternative names list.
        alt_names = []
        if name_en:
            alt_names.append(name_en)
        variants = inst.get('name_variants')
        if variants:
            # BUGFIX: 'name_variants' may be a list; the original appended it
            # wholesale, nesting a list inside alternative_names. Extend for
            # lists, append for a single string.
            if isinstance(variants, list):
                alt_names.extend(variants)
            else:
                alt_names.append(variants)

        # ===================================================================
        # STEP 2: Map to GLAMORCUBESFIXPHDNT taxonomy
        # ===================================================================
        institution_type = map_bulgarian_library_type(library_type_bg)

        # ===================================================================
        # STEP 3: Geocode address
        # ===================================================================
        city_bg = extract_city_from_address(address)
        region_bg = extract_region_from_library_type(library_type_bg)
        # Map Bulgarian city to English for the GeoNames lookup; unmapped
        # cities fall through in Cyrillic.
        city_en = BULGARIAN_CITY_MAPPINGS.get(city_bg, city_bg) if city_bg else None
        # Fill in the region from the lookup table when the library-type
        # field did not carry one (handles old and new field names).
        if city_en and city_en in city_region_map and not region_bg:
            region_entry = city_region_map[city_en]
            region_bg = region_entry.get('region_bulgarian') or region_entry.get('region_name')

        location = Location(
            city=city_en or city_bg,
            street_address=address,
            region=region_bg,
            country='BG'
        )
        # Attempt GeoNames lookup; a failed lookup must not abort the batch.
        if city_en and geonames_db:
            try:
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    location.latitude = city_info.latitude
                    location.longitude = city_info.longitude
                    location.geonames_id = city_info.geonames_id
                    geocoded_count += 1
            except Exception as e:
                print(f" Warning: Geocoding failed for {city_en}: {e}")

        # ===================================================================
        # STEP 4: Generate GHCID
        # ===================================================================
        ghcid = None
        ghcid_uuid = None
        ghcid_uuid_sha256 = None
        ghcid_numeric = None

        # Resolve the numeric region code: prefer the lookup table, then
        # fall back to the region parsed out of the library-type field.
        region_iso = None
        if city_en and city_en in city_region_map:
            region_entry = city_region_map[city_en]
            # BUGFIX: the original read the same key twice in an 'or'
            # (.get('region_numeric') or .get('region_numeric')).
            region_iso = region_entry.get('region_numeric')
        elif region_bg:
            # Unknown region names default to BG-22 (Sofia, the capital).
            region_code = BULGARIAN_REGIONS.get(region_bg, 'BG-22')
            region_iso = region_code.split('-')[1]

        if city_en and region_iso and geonames_db:
            try:
                # Get city code from GeoNames
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    city_code = city_info.get_abbreviation()
                    abbreviation = generate_abbreviation_from_name(name_bg, isil_code)
                    components = GHCIDComponents(
                        country_code='BG',
                        region_code=region_iso,
                        city_locode=city_code,
                        institution_type='L',  # Library
                        abbreviation=abbreviation
                    )
                    # Derive every identifier representation from the components.
                    ghcid = components.to_string()
                    ghcid_uuid = str(components.to_uuid())
                    ghcid_uuid_sha256 = str(components.to_uuid_sha256())
                    ghcid_numeric = components.to_numeric()
                    ghcid_count += 1
            except Exception as e:
                print(f" Warning: GHCID generation failed for {name_bg}: {e}")

        # ===================================================================
        # STEP 5: Enrich names (placeholder - Wikidata would go here)
        # ===================================================================
        # TODO: Query Wikidata for institutions with matching ISIL codes
        # For now, use existing names from registry

        # Build identifier list
        identifiers = [
            Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
            )
        ]
        if inst.get('website'):
            identifiers.append(
                Identifier(
                    identifier_scheme='Website',
                    identifier_value=inst['website'],
                    identifier_url=inst['website'] if inst['website'].startswith('http') else f"http://{inst['website']}"
                )
            )

        # Build contact info
        contact_info = {}
        if inst.get('email'):
            # If multiple emails, take the first one (schema expects single email)
            email = inst['email']
            if ',' in email:
                email = email.split(',')[0].strip()
            contact_info['email'] = email
        if inst.get('phone_fax'):
            contact_info['phone'] = inst['phone_fax']

        # Build collections metadata (field names match the LinkML schema)
        collections = []
        if inst.get('collections'):
            collections.append({
                'collection_name': 'General Collection',
                'collection_type': 'bibliographic',
                'collection_description': inst['collections'],
                'item_count': inst.get('collection_size', 'Not specified')
            })

        # Build provenance
        provenance = Provenance(
            data_source='CSV_REGISTRY',
            data_tier='TIER_1_AUTHORITATIVE',
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method='HTML table parsing from Bulgarian National Library ISIL registry',
            confidence_score=0.98,
            source_url='https://www.nationallibrary.bg/wp/?page_id=5686'
        )

        # Create HeritageCustodian record
        custodian = HeritageCustodian(
            id=f"https://w3id.org/heritage/custodian/bg/{isil_code.lower().replace('-', '')}",
            name=name_bg,
            alternative_names=alt_names if alt_names else None,
            institution_type=institution_type,
            ghcid=ghcid,
            ghcid_uuid=ghcid_uuid,
            ghcid_uuid_sha256=ghcid_uuid_sha256,
            ghcid_numeric=ghcid_numeric,
            description=library_type_bg,
            locations=[location],
            identifiers=identifiers,
            homepage=inst.get('website'),
            contact_info=contact_info if contact_info else None,
            collections=collections if collections else None,
            provenance=provenance
        )
        results.append(custodian)

    total = len(results)
    print("\n=== Conversion Complete ===")
    print(f"Total institutions: {total}")
    # BUGFIX: guard the percentage prints — an empty registry previously
    # raised ZeroDivisionError here.
    if total:
        print(f"Geocoded: {geocoded_count}/{total} ({geocoded_count/total*100:.1f}%)")
        print(f"GHCIDs generated: {ghcid_count}/{total} ({ghcid_count/total*100:.1f}%)")
    return results
def export_to_yaml(institutions: List[HeritageCustodian]) -> None:
    """
    Export institutions to LinkML-compliant YAML format.

    Uses hand-rolled dict construction (rather than dataclasses.asdict) so
    that only populated fields are emitted and output keys can differ from
    attribute names (e.g. ghcid → ghcid_current). Numeric fields are
    compared against None — not truthiness — so legitimate zero values
    survive export.

    Args:
        institutions: Records produced by convert_bulgarian_institutions().

    Side effects:
        Writes OUTPUT_FILE, creating parent directories as needed.
    """
    import yaml  # local import: PyYAML is only needed for this export step

    # Convert dataclasses to plain dicts, omitting empty fields.
    institutions_dicts = []
    for inst in institutions:
        inst_dict: Dict[str, Any] = {
            'id': inst.id,
            'name': inst.name,
            'institution_type': inst.institution_type,
        }
        if inst.ghcid:
            inst_dict['ghcid_current'] = inst.ghcid
        if inst.ghcid_uuid:
            inst_dict['ghcid_uuid'] = inst.ghcid_uuid
        if inst.ghcid_uuid_sha256:
            inst_dict['ghcid_uuid_sha256'] = inst.ghcid_uuid_sha256
        # BUGFIX: 'if inst.ghcid_numeric:' dropped a legitimate value of 0.
        if inst.ghcid_numeric is not None:
            inst_dict['ghcid_numeric'] = inst.ghcid_numeric
        if inst.alternative_names:
            inst_dict['alternative_names'] = inst.alternative_names
        if inst.description:
            inst_dict['description'] = inst.description
        if inst.homepage:
            inst_dict['homepage'] = inst.homepage

        # Locations: emit only populated attributes.
        if inst.locations:
            loc_dicts = []
            for loc in inst.locations:
                loc_dict: Dict[str, Any] = {'country': loc.country}
                if loc.city:
                    loc_dict['city'] = loc.city
                if loc.street_address:
                    loc_dict['street_address'] = loc.street_address
                if loc.region:
                    loc_dict['region'] = loc.region
                # BUGFIX: 0.0 is a valid coordinate; the original truthiness
                # tests silently dropped it.
                if loc.latitude is not None:
                    loc_dict['latitude'] = loc.latitude
                if loc.longitude is not None:
                    loc_dict['longitude'] = loc.longitude
                if loc.geonames_id is not None:
                    loc_dict['geonames_id'] = str(loc.geonames_id)
                loc_dicts.append(loc_dict)
            inst_dict['locations'] = loc_dicts

        # Identifiers
        if inst.identifiers:
            ident_dicts = []
            for ident in inst.identifiers:
                ident_dict: Dict[str, Any] = {
                    'identifier_scheme': ident.identifier_scheme,
                    'identifier_value': ident.identifier_value
                }
                if ident.identifier_url:
                    ident_dict['identifier_url'] = ident.identifier_url
                ident_dicts.append(ident_dict)
            inst_dict['identifiers'] = ident_dicts

        # Contact info and collections are already plain dicts/lists.
        if inst.contact_info:
            inst_dict['contact_info'] = inst.contact_info
        if inst.collections:
            inst_dict['collections'] = inst.collections

        # Provenance
        if inst.provenance:
            prov_dict: Dict[str, Any] = {
                'data_source': inst.provenance.data_source,
                'data_tier': inst.provenance.data_tier,
                'extraction_date': inst.provenance.extraction_date,
                'extraction_method': inst.provenance.extraction_method,
                'confidence_score': inst.provenance.confidence_score,
            }
            if inst.provenance.source_url:
                prov_dict['source_url'] = inst.provenance.source_url
            inst_dict['provenance'] = prov_dict

        institutions_dicts.append(inst_dict)

    # Write a commented YAML header followed by the record list.
    print(f"\nExporting to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Converted to LinkML-compliant format\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write('# Source: Bulgarian National Library ISIL Registry\n')
        f.write(f'# Total institutions: {len(institutions_dicts)}\n')
        f.write('\n')
        yaml.dump(institutions_dicts, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Exported {len(institutions_dicts)} institutions to {OUTPUT_FILE}")
# =============================================================================
# Main Entry Point
# =============================================================================
def main():
    """Entry point: validate inputs, run the conversion, export the YAML."""
    banner = "=" * 70
    print(banner)
    print("Bulgarian ISIL Registry → LinkML Conversion")
    print(banner)
    print()

    # The registry JSON is mandatory — bail out early if it is missing.
    if not INPUT_FILE.exists():
        print(f"Error: Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # The GeoNames database is optional: warn but continue without geocoding.
    if not GEONAMES_DB.exists():
        print(f"Warning: GeoNames database not found at {GEONAMES_DB}")
        print("Geocoding will be skipped. Run scripts to build GeoNames DB first.")

    # Convert, then serialize.
    institutions = convert_bulgarian_institutions()
    export_to_yaml(institutions)

    print()
    print(banner)
    print("✓ Conversion Complete!")
    print(banner)
    print()
    print("Next steps:")
    print("1. Review output: cat", OUTPUT_FILE)
    print("2. Validate schema: linkml-validate -s schemas/heritage_custodian.yaml", OUTPUT_FILE)
    print("3. Enrich with Wikidata: scripts/enrich_bulgarian_wikidata.py")
    print("4. Generate RDF: scripts/export_bulgarian_to_rdf.py")


if __name__ == '__main__':
    main()