694 lines
25 KiB
Python
Executable file
694 lines
25 KiB
Python
Executable file
#!/usr/bin/env python3
|
||
"""
|
||
Convert Bulgarian ISIL Registry to LinkML-compliant YAML format.
|
||
|
||
This script performs 5 integration steps:
|
||
1. Convert JSON to LinkML-compliant YAML format
|
||
2. Map library types to GLAMORCUBESFIXPHDNT taxonomy (all → LIBRARY)
|
||
3. Geocode addresses to lat/lon coordinates
|
||
4. Generate GHCIDs for all institutions
|
||
5. Enrich missing names from Wikidata (placeholder; not yet implemented)
|
||
|
||
Input: data/isil/bulgarian_isil_registry.json
|
||
Output: data/instances/bulgaria_isil_libraries.yaml
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
from datetime import datetime, timezone
|
||
from pathlib import Path
|
||
from typing import Dict, List, Optional, Any
|
||
from dataclasses import dataclass, asdict
|
||
|
||
# Make the repository root and its "src" directory importable so that the
# project-local packages imported further down can be resolved.
project_root = Path(__file__).parent.parent
# Prepend both entries in one step; "src" ends up ahead of the repository root.
sys.path[:0] = [str(project_root / "src"), str(project_root)]
|
||
|
||
from glam_extractor.identifiers.ghcid import GHCIDComponents
|
||
from glam_extractor.geocoding.geonames_lookup import GeoNamesDB
|
||
|
||
|
||
# =============================================================================
|
||
# Configuration
|
||
# =============================================================================
|
||
|
||
# Repository root: two directory levels above this file.
PROJECT_ROOT = Path(__file__).parent.parent
# All inputs and outputs live under <root>/data.
_DATA_DIR = PROJECT_ROOT / "data"
INPUT_FILE = _DATA_DIR / "isil" / "bulgarian_isil_registry.json"
OUTPUT_FILE = _DATA_DIR / "instances" / "bulgaria_isil_libraries.yaml"
GEONAMES_DB = _DATA_DIR / "reference" / "geonames.db"
CITY_REGION_LOOKUP = _DATA_DIR / "reference" / "bulgarian_city_regions.json"
|
||
|
||
# Oblast (province) name in Bulgarian -> ISO 3166-2:BG subdivision code.
# The trailing comment on each entry is the romanized oblast name.
BULGARIAN_REGIONS = {
    "Благоевград": "BG-01",     # Blagoevgrad
    "Бургас": "BG-02",          # Burgas
    "Варна": "BG-03",           # Varna
    "Велико Търново": "BG-04",  # Veliko Tarnovo
    "Видин": "BG-05",           # Vidin
    "Враца": "BG-06",           # Vratsa
    "Габрово": "BG-07",         # Gabrovo
    "Добрич": "BG-08",          # Dobrich
    "Кърджали": "BG-09",        # Kardzhali
    "Кюстендил": "BG-10",       # Kyustendil
    "Ловеч": "BG-11",           # Lovech
    "Монтана": "BG-12",         # Montana
    "Пазарджик": "BG-13",       # Pazardzhik
    "Перник": "BG-14",          # Pernik
    "Плевен": "BG-15",          # Pleven
    "Пловдив": "BG-16",         # Plovdiv
    "Разград": "BG-17",         # Razgrad
    "Русе": "BG-18",            # Ruse
    "Силистра": "BG-19",        # Silistra
    "Сливен": "BG-20",          # Sliven
    "Смолян": "BG-21",          # Smolyan
    "София": "BG-22",           # Sofia (capital)
    "София област": "BG-23",    # Sofia Province
    "Стара Загора": "BG-24",    # Stara Zagora
    "Търговище": "BG-25",       # Targovishte
    "Хасково": "BG-26",         # Haskovo
    "Шумен": "BG-27",           # Shumen
    "Ямбол": "BG-28",           # Yambol
}
|
||
|
||
# Bulgarian (Cyrillic) city name -> English name used for the GeoNames lookup.
# Cities absent from this table are passed to the lookup in their Cyrillic form.
BULGARIAN_CITY_MAPPINGS = {
    "София": "Sofia",
    "Бургас": "Burgas",
    "Варна": "Varna",
    "Пловдив": "Plovdiv",
    "Русе": "Ruse",
    "Стара Загора": "Stara Zagora",
    "Плевен": "Pleven",
    "Сливен": "Sliven",
    "Добрич": "Dobrich",
    "Габрово": "Gabrovo",
    "Видин": "Vidin",
    "Враца": "Vratsa",
    "Велико Търново": "Veliko Tarnovo",
    "Ловеч": "Lovech",
    "Кюстендил": "Kyustendil",
    "Благоевград": "Blagoevgrad",
    "Пазарджик": "Pazardzhik",
    "Монтана": "Montana",
    "Кърджали": "Kardzhali",
    "Смолян": "Smolyan",
    "Силистра": "Silistra",
    "Разград": "Razgrad",
    "Търговище": "Targovishte",
    "Хасково": "Haskovo",
    "Шумен": "Shumen",
    "Ямбол": "Yambol",
    "Перник": "Pernik",
}
|
||
|
||
|
||
# =============================================================================
|
||
# Data Models
|
||
# =============================================================================
|
||
|
||
@dataclass
class Location:
    """
    One physical location of a heritage custodian.

    Every field is optional and defaults to None, except ``country``,
    which defaults to 'BG'.
    """

    # Settlement name
    city: Optional[str] = None
    # Address text
    street_address: Optional[str] = None
    # Postal code
    postal_code: Optional[str] = None
    # Administrative region name
    region: Optional[str] = None
    # Country code
    country: str = 'BG'
    # Coordinates and GeoNames identifier, set when geocoding succeeds
    latitude: Optional[float] = None
    longitude: Optional[float] = None
    geonames_id: Optional[int] = None
|
||
|
||
|
||
@dataclass
class Identifier:
    """
    An external identifier attached to a custodian (e.g. ISIL, Wikidata).

    ``identifier_scheme`` and ``identifier_value`` are required;
    ``identifier_url`` is optional and defaults to None.
    """

    # Name of the identifier scheme
    identifier_scheme: str
    # The identifier itself
    identifier_value: str
    # Optional URL for the identifier
    identifier_url: Optional[str] = None
|
||
|
||
|
||
@dataclass
class Provenance:
    """
    Provenance metadata describing where a record came from.

    The first five fields are required; ``conversation_id`` and
    ``source_url`` are optional and default to None.
    """

    # Source category label
    data_source: str
    # Data tier label
    data_tier: str
    # Extraction timestamp, stored as a string
    extraction_date: str
    # Free-text description of how the data was extracted
    extraction_method: str
    # Confidence score for the record
    confidence_score: float
    # Optional conversation identifier
    conversation_id: Optional[str] = None
    # Optional URL of the source
    source_url: Optional[str] = None
|
||
|
||
|
||
@dataclass
class HeritageCustodian:
    """
    Top-level heritage custodian record (LinkML-compliant).

    ``id``, ``name`` and ``institution_type`` are required; every other
    field is optional and defaults to None.
    """

    # Required core fields
    id: str
    name: str
    institution_type: str

    # GHCID and the identifiers derived from it
    ghcid: Optional[str] = None
    ghcid_uuid: Optional[str] = None
    ghcid_uuid_sha256: Optional[str] = None
    ghcid_numeric: Optional[int] = None

    # Descriptive metadata
    alternative_names: Optional[List[str]] = None
    description: Optional[str] = None

    # Nested records
    locations: Optional[List[Location]] = None
    identifiers: Optional[List[Identifier]] = None

    # Web presence, contact details and holdings
    homepage: Optional[str] = None
    contact_info: Optional[Dict[str, Any]] = None
    collections: Optional[List[Dict[str, Any]]] = None

    # Provenance of the record
    provenance: Optional[Provenance] = None
|
||
|
||
|
||
# =============================================================================
|
||
# Utilities
|
||
# =============================================================================
|
||
|
||
def extract_city_from_address(address: str) -> Optional[str]:
    """
    Extract the settlement name from a Bulgarian address.

    Recognised layouts:
    - "гр. София 1504, бул. ..." → "София"
    - "Бургас 8000, ул. ..." → "Бургас"
    - "с. Плетена 2954, община Сатовча" → "Плетена" (village)
    - "гр. София, бул. ..." → "София" (no postal code present)

    Args:
        address: Raw address string; may be empty or None.

    Returns:
        Settlement name in Bulgarian (Cyrillic), or None when the address is
        empty or does not start with a Cyrillic name.
    """
    if not address:
        return None

    # Strip a leading "гр." (city) or "с." (village) marker up front, so that
    # the postal-code-less fallback below cannot return the bare marker
    # ("гр" / "с") as if it were the settlement name.
    remainder = re.sub(r'^(?:гр\.|с\.)\s*', '', address.strip())

    # Preferred form: name immediately followed by a 4-digit postal code.
    match = re.match(r'([А-Яа-я\s\-]+?)\s*\d{4}', remainder)
    if match is None:
        # Fallback: leading run of Cyrillic letters, spaces and hyphens.
        match = re.match(r'([А-Яа-я\s\-]+)', remainder)
    if match is None:
        return None

    # A whitespace-only capture is not a name; report it as "not found".
    return match.group(1).strip() or None
|
||
|
||
|
||
def extract_region_from_library_type(library_type: str) -> Optional[str]:
    """
    Pull the oblast name out of a library-type description.

    Regional libraries carry the oblast inside the type text, e.g.
    "Регионална библиотека (Област Бургас)" → "Бургас".

    Returns:
        The text following "Област" (Cyrillic letters and spaces, trimmed),
        or None when the input is empty or contains no such marker.
    """
    found = re.search(r'Област\s+([А-Яа-я\s]+)', library_type) if library_type else None
    return found.group(1).strip() if found else None
|
||
|
||
|
||
def transliterate_bulgarian(text: str) -> str:
    """
    Romanize Bulgarian Cyrillic text for use in GHCID abbreviations.

    The table is intended to follow the BGN/PCGN romanization of Bulgarian.
    Characters that are not in the table (Latin letters, digits,
    punctuation, ...) are passed through unchanged.
    """
    lowercase = {
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e',
        'ж': 'zh', 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l',
        'м': 'm', 'н': 'n', 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's',
        'т': 't', 'у': 'u', 'ф': 'f', 'х': 'h', 'ц': 'ts', 'ч': 'ch',
        'ш': 'sh', 'щ': 'sht', 'ъ': 'a', 'ь': 'y', 'ю': 'yu', 'я': 'ya',
    }

    # Derive the uppercase half of the table: an uppercase Cyrillic letter
    # maps to the capitalized form of its lowercase romanization ('Ж' → 'Zh').
    table = {}
    for cyrillic, latin in lowercase.items():
        table[ord(cyrillic)] = latin
        table[ord(cyrillic.upper())] = latin.capitalize()

    return text.translate(table)
|
||
|
||
|
||
def generate_abbreviation_from_name(name: str, isil_code: str) -> str:
    """
    Build the institution abbreviation used in a GHCID.

    Strategy:
    1. If the name contains a single parenthesised token of letters/digits,
       use it (romanized when it contains Cyrillic), upper-cased and cut
       to at most 10 characters.
    2. Otherwise use the last 4 characters of the part of the ISIL code
       after its final hyphen.

    Examples:
        'Национална библиотека ... (НБКМ)' → 'NBKM'
        'Библиотека при НЧ...' with 'BG-0130000' → '0000'
    """
    found = re.search(r'\(([А-Яа-яA-Za-z0-9]+)\)', name or '')

    if found is None:
        # No explicit abbreviation: fall back to the tail of the ISIL suffix.
        return isil_code.rpartition('-')[2][-4:]

    token = found.group(1)
    # Anything in the Unicode Cyrillic block triggers romanization.
    contains_cyrillic = any('\u0400' <= ch <= '\u04FF' for ch in token)
    if contains_cyrillic:
        token = transliterate_bulgarian(token)
    return token.upper()[:10]
|
||
|
||
|
||
def map_bulgarian_library_type(library_type: str) -> str:
    """
    Map a Bulgarian library type to its GLAMORCUBESFIXPHDNT taxonomy class.

    Every institution in this registry is treated as a library, whatever its
    sub-type: national, regional (oblast-level), university, community
    centre (chitalishte), municipal or scientific.

    Returns:
        Always 'LIBRARY'; the ``library_type`` argument is not inspected.
    """
    taxonomy_class = 'LIBRARY'
    return taxonomy_class
|
||
|
||
|
||
# =============================================================================
|
||
# Main Conversion Logic
|
||
# =============================================================================
|
||
|
||
def _percentage(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


def convert_bulgarian_institutions() -> List[HeritageCustodian]:
    """
    Convert the Bulgarian ISIL registry to LinkML-compliant records.

    Reads INPUT_FILE and, when present, CITY_REGION_LOOKUP and GEONAMES_DB.
    For every entry under the ``institutions`` key it:
    1. Extracts names, library type and address
    2. Maps the library type to the GLAMORCUBESFIXPHDNT taxonomy
    3. Geocodes the city using GeoNames (skipped without the database)
    4. Generates a GHCID and derived identifiers (needs city, region code
       and a GeoNames hit)
    5. Leaves name enrichment as a placeholder (Wikidata is not queried)

    Progress and summary statistics are printed to stdout.

    Returns:
        One HeritageCustodian per registry entry, in input order.

    Raises:
        OSError, json.JSONDecodeError: if an input file cannot be read/parsed.
        KeyError: if the JSON lacks ``institutions`` or an entry lacks ``isil``.
    """
    print("Loading Bulgarian ISIL registry...")
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions_json = data['institutions']
    print(f"Found {len(institutions_json)} institutions")

    # Load city-region lookup table
    print("Loading city-region lookup table...")
    city_region_map = {}
    if CITY_REGION_LOOKUP.exists():
        with open(CITY_REGION_LOOKUP, 'r', encoding='utf-8') as f:
            city_region_map = json.load(f)
        print(f"Loaded {len(city_region_map)} city-region mappings")
    else:
        print("Warning: City-region lookup not found, limited GHCID coverage")

    # Initialize geocoding
    print("Initializing GeoNames database...")
    if GEONAMES_DB.exists():
        geonames_db = GeoNamesDB(db_path=GEONAMES_DB)
    else:
        print("Warning: GeoNames database not found, geocoding will be skipped")
        geonames_db = None

    results = []
    geocoded_count = 0
    ghcid_count = 0

    for idx, inst in enumerate(institutions_json, 1):
        if idx % 10 == 0:
            print(f"Processing institution {idx}/{len(institutions_json)}...")

        # ===================================================================
        # STEP 1: Extract basic fields
        # ===================================================================

        isil_code = inst['isil']
        name_bg = inst.get('name_bg') or f"Library {isil_code}"
        name_en = inst.get('name_en')
        library_type_bg = inst.get('library_type', '')
        address = inst.get('address', '')

        # Build alternative names list
        alt_names = []
        if name_en:
            alt_names.append(name_en)
        variants = inst.get('name_variants')
        if isinstance(variants, (list, tuple)):
            # Flatten: alternative_names is a flat list of strings, so a
            # list-valued field must not be appended as one nested element.
            alt_names.extend(v for v in variants if v)
        elif variants:
            alt_names.append(variants)

        # ===================================================================
        # STEP 2: Map to GLAMORCUBESFIXPHDNT taxonomy
        # ===================================================================

        institution_type = map_bulgarian_library_type(library_type_bg)

        # ===================================================================
        # STEP 3: Geocode address
        # ===================================================================

        city_bg = extract_city_from_address(address)
        region_bg = extract_region_from_library_type(library_type_bg)

        # Map Bulgarian city to English for GeoNames lookup
        city_en = BULGARIAN_CITY_MAPPINGS.get(city_bg, city_bg) if city_bg else None

        # Enhance region info using city-region lookup if available
        if city_en and city_en in city_region_map and not region_bg:
            # Handle both old and new field naming conventions
            region_entry = city_region_map[city_en]
            region_bg = region_entry.get('region_bulgarian') or region_entry.get('region_name')

        location = Location(
            city=city_en or city_bg,
            street_address=address,
            region=region_bg,
            country='BG'
        )

        # Attempt GeoNames lookup; the result is kept for GHCID generation
        # below so the database is queried only once per institution.
        city_info = None
        if city_en and geonames_db:
            try:
                city_info = geonames_db.lookup_city(city_en, 'BG')
                if city_info:
                    location.latitude = city_info.latitude
                    location.longitude = city_info.longitude
                    location.geonames_id = city_info.geonames_id
                    geocoded_count += 1
            except Exception as e:
                # Best effort: a failed lookup leaves the record un-geocoded.
                print(f" Warning: Geocoding failed for {city_en}: {e}")

        # ===================================================================
        # STEP 4: Generate GHCID
        # ===================================================================

        ghcid = None
        ghcid_uuid = None
        ghcid_uuid_sha256 = None
        ghcid_numeric = None

        # Try to get region from lookup table or from library type field
        region_iso = None
        if city_en and city_en in city_region_map:
            # NOTE(review): this originally read 'region_numeric' twice
            # ("a or a"), apparently a copy-paste of the old/new field-name
            # fallback used for the region name above. The legacy key name is
            # not visible in this file - confirm and add it here if needed.
            region_iso = city_region_map[city_en].get('region_numeric')
        elif region_bg:
            # Fallback to extracted region from library type.
            # NOTE(review): an unrecognised region name silently maps to
            # 'BG-22' - confirm this default is intended.
            region_code = BULGARIAN_REGIONS.get(region_bg, 'BG-22')
            region_iso = region_code.split('-')[1]

        if city_info and region_iso:
            try:
                # Get city code from GeoNames
                city_code = city_info.get_abbreviation()

                # Generate abbreviation
                abbreviation = generate_abbreviation_from_name(name_bg, isil_code)

                # Build GHCID components
                components = GHCIDComponents(
                    country_code='BG',
                    region_code=region_iso,
                    city_locode=city_code,
                    institution_type='L',  # Library
                    abbreviation=abbreviation
                )

                # Generate identifiers using GHCIDComponents methods
                ghcid = components.to_string()
                ghcid_uuid = str(components.to_uuid())
                ghcid_uuid_sha256 = str(components.to_uuid_sha256())
                ghcid_numeric = components.to_numeric()
                ghcid_count += 1
            except Exception as e:
                # Best effort: the record is still emitted without a GHCID.
                print(f" Warning: GHCID generation failed for {name_bg}: {e}")

        # ===================================================================
        # STEP 5: Enrich names (placeholder - Wikidata would go here)
        # ===================================================================

        # TODO: Query Wikidata for institutions with matching ISIL codes
        # For now, use existing names from registry

        # Build identifier list
        identifiers = [
            Identifier(
                identifier_scheme='ISIL',
                identifier_value=isil_code,
            )
        ]

        website = inst.get('website')
        if website:
            identifiers.append(
                Identifier(
                    identifier_scheme='Website',
                    identifier_value=website,
                    # Prefix scheme-less values so the URL is absolute.
                    identifier_url=website if website.startswith('http') else f"http://{website}"
                )
            )

        # Build contact info
        contact_info = {}
        if inst.get('email'):
            # If multiple emails, take the first one (schema expects single email)
            email = inst['email']
            if ',' in email:
                email = email.split(',')[0].strip()
            contact_info['email'] = email
        if inst.get('phone_fax'):
            contact_info['phone'] = inst['phone_fax']

        # Build collections metadata
        collections = []
        if inst.get('collections'):
            collections.append({
                'collection_name': 'General Collection',
                'collection_type': 'bibliographic',
                'collection_description': inst['collections'],
                'item_count': inst.get('collection_size', 'Not specified')
            })

        # Build provenance
        provenance = Provenance(
            data_source='CSV_REGISTRY',
            data_tier='TIER_1_AUTHORITATIVE',
            extraction_date=datetime.now(timezone.utc).isoformat(),
            extraction_method='HTML table parsing from Bulgarian National Library ISIL registry',
            confidence_score=0.98,
            source_url='https://www.nationallibrary.bg/wp/?page_id=5686'
        )

        # Create HeritageCustodian record
        custodian = HeritageCustodian(
            id=f"https://w3id.org/heritage/custodian/bg/{isil_code.lower().replace('-', '')}",
            name=name_bg,
            alternative_names=alt_names if alt_names else None,
            institution_type=institution_type,
            ghcid=ghcid,
            ghcid_uuid=ghcid_uuid,
            ghcid_uuid_sha256=ghcid_uuid_sha256,
            ghcid_numeric=ghcid_numeric,
            description=library_type_bg,
            locations=[location],
            identifiers=identifiers,
            homepage=inst.get('website'),
            contact_info=contact_info if contact_info else None,
            collections=collections if collections else None,
            provenance=provenance
        )

        results.append(custodian)

    total = len(results)
    print("\n=== Conversion Complete ===")
    print(f"Total institutions: {total}")
    # _percentage guards against an empty registry (previously ZeroDivisionError).
    print(f"Geocoded: {geocoded_count}/{total} ({_percentage(geocoded_count, total):.1f}%)")
    print(f"GHCIDs generated: {ghcid_count}/{total} ({_percentage(ghcid_count, total):.1f}%)")

    return results
|
||
|
||
|
||
def _custodian_to_dict(inst: HeritageCustodian) -> Dict[str, Any]:
    """Flatten one HeritageCustodian into a plain dict, omitting unset fields."""
    record: Dict[str, Any] = {
        'id': inst.id,
        'name': inst.name,
        'institution_type': inst.institution_type,
    }

    if inst.ghcid:
        record['ghcid_current'] = inst.ghcid
    if inst.ghcid_uuid:
        record['ghcid_uuid'] = inst.ghcid_uuid
    if inst.ghcid_uuid_sha256:
        record['ghcid_uuid_sha256'] = inst.ghcid_uuid_sha256
    # Numeric fields are tested against None: 0 is a value, not "missing".
    if inst.ghcid_numeric is not None:
        record['ghcid_numeric'] = inst.ghcid_numeric

    if inst.alternative_names:
        record['alternative_names'] = inst.alternative_names
    if inst.description:
        record['description'] = inst.description
    if inst.homepage:
        record['homepage'] = inst.homepage

    # Locations
    if inst.locations:
        record['locations'] = []
        for loc in inst.locations:
            loc_dict: Dict[str, Any] = {'country': loc.country}
            if loc.city:
                loc_dict['city'] = loc.city
            if loc.street_address:
                loc_dict['street_address'] = loc.street_address
            if loc.region:
                loc_dict['region'] = loc.region
            # A coordinate of exactly 0.0 is valid and must not be dropped.
            if loc.latitude is not None:
                loc_dict['latitude'] = loc.latitude
            if loc.longitude is not None:
                loc_dict['longitude'] = loc.longitude
            if loc.geonames_id:
                # Written as a string in the output
                loc_dict['geonames_id'] = str(loc.geonames_id)
            record['locations'].append(loc_dict)

    # Identifiers
    if inst.identifiers:
        record['identifiers'] = []
        for ident in inst.identifiers:
            ident_dict = {
                'identifier_scheme': ident.identifier_scheme,
                'identifier_value': ident.identifier_value
            }
            if ident.identifier_url:
                ident_dict['identifier_url'] = ident.identifier_url
            record['identifiers'].append(ident_dict)

    # Contact info
    if inst.contact_info:
        record['contact_info'] = inst.contact_info

    # Collections
    if inst.collections:
        record['collections'] = inst.collections

    # Provenance
    if inst.provenance:
        prov_dict: Dict[str, Any] = {
            'data_source': inst.provenance.data_source,
            'data_tier': inst.provenance.data_tier,
            'extraction_date': inst.provenance.extraction_date,
            'extraction_method': inst.provenance.extraction_method,
            'confidence_score': inst.provenance.confidence_score,
        }
        if inst.provenance.source_url:
            prov_dict['source_url'] = inst.provenance.source_url
        record['provenance'] = prov_dict

    return record


def export_to_yaml(institutions: List[HeritageCustodian]) -> None:
    """
    Export institutions to LinkML-compliant YAML format.

    Each record is flattened to a plain dict (unset fields omitted, key order
    preserved) and the list is written to OUTPUT_FILE after a short comment
    header. The parent directory is created if necessary.

    Args:
        institutions: Records to export.

    Raises:
        OSError: if the output directory or file cannot be written.
    """
    import yaml

    # Convert dataclasses to dicts
    institutions_dicts = [_custodian_to_dict(inst) for inst in institutions]

    # Write YAML
    print(f"\nExporting to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write('# Bulgarian ISIL Registry - Heritage Custodian Institutions\n')
        f.write('# Converted to LinkML-compliant format\n')
        f.write(f'# Generated: {datetime.now(timezone.utc).isoformat()}\n')
        f.write('# Source: Bulgarian National Library ISIL Registry\n')
        f.write(f'# Total institutions: {len(institutions_dicts)}\n')
        f.write('\n')
        # sort_keys=False keeps the field order built above.
        yaml.dump(institutions_dicts, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Exported {len(institutions_dicts)} institutions to {OUTPUT_FILE}")
|
||
|
||
|
||
# =============================================================================
|
||
# Main Entry Point
|
||
# =============================================================================
|
||
|
||
def main():
    """
    Run the full workflow: check inputs, convert the registry, write the YAML.

    Exits with status 1 when the input file is missing. A missing GeoNames
    database only produces a warning.
    """
    rule = "=" * 70

    print(rule)
    print("Bulgarian ISIL Registry → LinkML Conversion")
    print(rule)
    print()

    # The registry JSON is mandatory.
    if not INPUT_FILE.exists():
        print(f"Error: Input file not found: {INPUT_FILE}")
        sys.exit(1)

    # The GeoNames database is optional; warn and carry on without it.
    if not GEONAMES_DB.exists():
        print(f"Warning: GeoNames database not found at {GEONAMES_DB}")
        print("Geocoding will be skipped. Run scripts to build GeoNames DB first.")

    # Convert, then export.
    export_to_yaml(convert_bulgarian_institutions())

    print()
    print(rule)
    print("✓ Conversion Complete!")
    print(rule)
    print()
    print("Next steps:")
    print("1. Review output: cat", OUTPUT_FILE)
    print("2. Validate schema: linkml-validate -s schemas/heritage_custodian.yaml", OUTPUT_FILE)
    print("3. Enrich with Wikidata: scripts/enrich_bulgarian_wikidata.py")
    print("4. Generate RDF: scripts/export_bulgarian_to_rdf.py")
|
||
|
||
|
||
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
|