glam/scripts/query_biblioteca_nacional_z3950.py
2025-11-19 23:25:22 +01:00

296 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Z39.50 Client for Biblioteca Nacional de la República Argentina
Extracts ISIL codes and institution records from the Biblioteca Nacional's
Z39.50 server. ISIL codes are stored in MARC field 024 (Standard Identifier).
Server: 200.123.191.9:9991
Database: BNA01 (bibliographic) or BNA10 (authority records)
Credentials: Username 'Z39.50' / Password 'Z39.50'
Based on investigation: data/isil/AR/ARGENTINA_ISIL_INVESTIGATION.md
"""
import json
import socket
import struct
from datetime import datetime, timezone
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
from pathlib import Path
@dataclass
class Z3950Config:
    """Connection settings for a Z39.50 session.

    Every field carries a default, so ``Z3950Config()`` describes the
    Biblioteca Nacional server named in the module docstring.
    """

    # Server endpoint.
    host: str = "200.123.191.9"
    port: int = 9991
    # Target database; the module docstring lists BNA01 (bibliographic)
    # and BNA10 (authority records).
    database: str = "BNA10"
    # Login credentials for the server.
    username: str = "Z39.50"
    password: str = "Z39.50"
    charset: str = "UTF-8"
    # Handed to socket.settimeout() by SimpleZ3950Client.connect().
    timeout: int = 30
@dataclass
class ISILRecord:
    """An institution and its ISIL code, as pulled from a Z39.50 record.

    ``extraction_date`` is filled with the current UTC time in ISO 8601
    form whenever the caller leaves it as ``None``.
    """

    isil_code: str
    institution_name: str
    institution_type: Optional[str] = None
    address: Optional[str] = None
    city: Optional[str] = None
    province: Optional[str] = None
    country: str = "AR"
    # The MARC record the values were extracted from, when available.
    marc_record: Optional[Dict] = None
    extraction_date: Optional[str] = None

    def __post_init__(self):
        """Stamp the record with the current UTC time unless a date was given."""
        if self.extraction_date is not None:
            return
        now_utc = datetime.now(timezone.utc)
        self.extraction_date = now_utc.isoformat()
class SimpleZ3950Client:
    """
    Minimal Z39.50 client for extracting ISIL codes.

    NOTE: This is a simplified implementation. For production use,
    consider using PyZ3950 or YAZ Python bindings.

    Only the raw TCP connection and the MARC-to-ISIL extraction are
    implemented here; ``search`` is a placeholder that always raises
    ``NotImplementedError``.
    """

    def __init__(self, config: "Z3950Config"):
        """Store the connection settings; no network activity happens here."""
        self.config = config
        # Open TCP socket, or None while disconnected.
        self.socket = None

    def _close_socket(self) -> None:
        """Close the current socket, if any, and forget it."""
        sock, self.socket = self.socket, None
        if sock is not None:
            sock.close()

    def connect(self) -> bool:
        """Establish connection to Z39.50 server.

        Returns:
            True when the TCP connection was opened, False otherwise.  On
            failure the socket is closed and ``self.socket`` is reset to
            None so a failed attempt does not leak a file descriptor.
        """
        try:
            print(f"Connecting to Z39.50 server: {self.config.host}:{self.config.port}")
            # Drop any previous connection instead of silently overwriting it.
            self._close_socket()
            self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            self.socket.settimeout(self.config.timeout)
            self.socket.connect((self.config.host, self.config.port))
        except Exception as e:
            # Deliberate best-effort boundary: every failure is reported to
            # the caller as False rather than raised.
            self._close_socket()
            print(f"❌ Connection failed: {e}")
            return False
        else:
            print("✅ Connected to Z39.50 server")
            return True

    def disconnect(self):
        """Close Z39.50 connection.

        Does nothing when no socket is open, so it is safe to call twice.
        """
        if self.socket:
            self._close_socket()
            print("Disconnected from Z39.50 server")

    def search(self, query: str, max_records: int = 100) -> List[Dict]:
        """
        Execute Z39.50 search query.

        Args:
            query: Search query (e.g., "@attr 1=1003 'biblioteca'")
            max_records: Maximum number of records to retrieve

        Returns:
            List of MARC records

        Raises:
            NotImplementedError: always; the protocol encoding is not
                implemented in this simplified client.
        """
        # This is a placeholder implementation
        # Full implementation requires encoding Z39.50 protocol messages
        raise NotImplementedError(
            "Z39.50 protocol implementation requires PyZ3950 or YAZ library. "
            "See installation instructions below."
        )

    @staticmethod
    def _field_bodies(marc_record: Dict, tag: str) -> List[Dict]:
        """Return, in record order, the body of every field stored under *tag*."""
        return [field[tag] for field in marc_record.get('fields', []) if tag in field]

    @staticmethod
    def _isil_from_subfields(subfields: List[Dict]) -> Optional[str]:
        """Return the ``$a`` value of a 024 field whose ``$2`` source is ISIL.

        ``$a`` is taken from the dict that carries the matching ``$2``.  When
        that dict has no ``$a`` (subfields stored one code per dict), the
        first sibling ``$a`` that has no ``$2`` of its own is used instead.
        """
        for subfield in subfields:
            if '$2' in subfield and subfield['$2'].lower() == 'isil':
                code = subfield.get('$a')
                if code:
                    return code
                for sibling in subfields:
                    if '$2' not in sibling and sibling.get('$a'):
                        return sibling['$a']
        return None

    def _first_value(self, marc_record: Dict, tag: str, code: str) -> Optional[str]:
        """Return the first non-empty *code* subfield among the *tag* fields."""
        for body in self._field_bodies(marc_record, tag):
            for subfield in body.get('subfields', []):
                if code in subfield and subfield[code]:
                    return subfield[code]
        return None

    def extract_isil_from_marc(self, marc_record: Dict) -> "Optional[ISILRecord]":
        """
        Extract ISIL code and institution info from MARC record.

        MARC field 024: Standard Identifier
        - $a: Standard number or code
        - $2: Source of number (should be 'isil' or 'ISIL')
        MARC field 110: Corporate Name (Main Entry)
        - $a: Corporate name
        MARC field 370: Place Associated (for archives/libraries)
        - $e: Place of corporate headquarters

        The first field that yields a value wins for each tag, so a later
        024 field without ``$a`` can no longer erase a code already found.

        Args:
            marc_record: Mapping with a ``'fields'`` list; each entry maps a
                tag to a body holding a ``'subfields'`` list of dicts keyed
                by ``$``-prefixed subfield codes.

        Returns:
            An ``ISILRecord`` (name defaults to "Unknown"), or None when the
            record has no ISIL identifier or is malformed.
        """
        try:
            # Extract ISIL code from field 024
            isil_code = None
            for body in self._field_bodies(marc_record, '024'):
                isil_code = self._isil_from_subfields(body.get('subfields', []))
                if isil_code:
                    break
            if not isil_code:
                return None

            # Extract institution name from field 110 and location from 370
            institution_name = self._first_value(marc_record, '110', '$a')
            city = self._first_value(marc_record, '370', '$e')

            return ISILRecord(
                isil_code=isil_code,
                institution_name=institution_name or "Unknown",
                city=city,
                marc_record=marc_record,
            )
        except Exception as e:
            # Best-effort parsing of external data: report and skip the record.
            print(f"⚠️ Error extracting ISIL from MARC: {e}")
            return None
def install_instructions():
    """Write the Z39.50 library setup guide to standard output."""
    guide = """
================================================================================
Z39.50 LIBRARY INSTALLATION REQUIRED
================================================================================
To use this script, you need a Z39.50 library. We recommend PyZ3950:
OPTION 1: PyZ3950 (Python-native, easier)
------------------------------------------
pip install PyZ3950
OPTION 2: YAZ + Python bindings (more robust)
----------------------------------------------
# macOS (via Homebrew)
brew install yaz
pip install pymarc yaz
# Ubuntu/Debian
sudo apt-get install yaz libyaz-dev
pip install pymarc yaz
OPTION 3: Use existing tools (command-line)
--------------------------------------------
# Install YAZ command-line tools
brew install yaz # macOS
# Query Biblioteca Nacional
yaz-client 200.123.191.9:9991/BNA10
# In yaz-client interactive shell:
open 200.123.191.9:9991/BNA10
find @attr 1=1003 biblioteca
show 1
================================================================================
ALTERNATIVE: Manual Web Interface
================================================================================
The Biblioteca Nacional may have a web-based catalog (OPAC) that can be
scraped as an alternative to Z39.50. Check:
https://catalogo.bn.gov.ar/
If available, we can create a web scraper instead of using Z39.50.
================================================================================
"""
    print(guide)
def test_connection():
    """Probe the Biblioteca Nacional Z39.50 server and report the outcome.

    Builds a client from the default ``Z3950Config``, prints the target,
    attempts to connect, and disconnects again when the attempt succeeds.

    Returns:
        True when the connection could be opened, False otherwise.
    """
    settings = Z3950Config()
    probe = SimpleZ3950Client(settings)

    rule = "================================================================================"
    banner = (
        rule,
        "TESTING: Biblioteca Nacional Z39.50 Server",
        rule,
        f"Host: {settings.host}",
        f"Port: {settings.port}",
        f"Database: {settings.database}",
        "",
    )
    for line in banner:
        print(line)

    # Failure first: explain the likely causes and bail out.
    if not probe.connect():
        failure_report = (
            "❌ Connection failed.",
            "",
            "Possible issues:",
            " 1. Server may be temporarily down",
            " 2. Firewall blocking outbound connections",
            " 3. Server IP/port changed",
            "",
            "Alternative: Check if Biblioteca Nacional has a web catalog (OPAC)",
        )
        for line in failure_report:
            print(line)
        return False

    success_report = (
        "✅ Connection successful!",
        "",
        "⚠️ However, full Z39.50 protocol implementation requires PyZ3950 library.",
        " Run this script with --install-help for installation instructions.",
    )
    for line in success_report:
        print(line)
    probe.disconnect()
    return True
def main():
    """Command-line entry point.

    ``--install-help`` prints the installation guide, ``--test`` runs the
    connection probe; with neither flag the usage banner is printed and
    the connection probe runs afterwards.
    """
    import sys

    cli_args = sys.argv
    if "--install-help" in cli_args:
        install_instructions()
    elif "--test" in cli_args:
        test_connection()
    else:
        usage = """
================================================================================
BIBLIOTECA NACIONAL Z39.50 ISIL EXTRACTOR
================================================================================
This script extracts ISIL codes from Biblioteca Nacional's Z39.50 server.
USAGE:
python3 query_biblioteca_nacional_z3950.py --test # Test connection
python3 query_biblioteca_nacional_z3950.py --install-help # Installation guide
STATUS:
⚠️ Z39.50 library (PyZ3950) not yet installed.
This script provides the framework. To use it:
1. Install PyZ3950: pip install PyZ3950
2. Implement full Z39.50 search/retrieve protocol
3. Parse MARC records to extract ISIL codes
ALTERNATIVE APPROACHES:
1. Use YAZ command-line tools (yaz-client) to query manually
2. Check if Biblioteca Nacional has a web catalog (OPAC) to scrape
3. Contact Biblioteca Nacional directly for ISIL registry export
For investigation details, see:
data/isil/AR/ARGENTINA_ISIL_INVESTIGATION.md
================================================================================
"""
        print(usage)
        # Default action after the banner: probe the server.
        test_connection()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()