# Repository listing metadata (not code — kept as a comment for provenance):
#   glam/scripts/extract_trove_contributors.py
#   2025-11-19 23:25:22 +01:00
#   696 lines, 24 KiB, Python
#!/usr/bin/env python3
"""
Extract Australian Heritage Custodian Organizations from Trove API
===================================================================
This script extracts all contributor organizations from the Trove API (Australian National
Library's aggregation service) and converts them to LinkML-compliant HeritageCustodian records.
Trove contributors are organizations that contribute collections data to the Australian
National Bibliographic Database (ANBD) and Trove. Each contributor has a unique NUC
(National Union Catalogue) symbol, which is Australia's implementation of the ISIL standard.
Features:
- Extracts all Trove contributors via API
- Retrieves full metadata (name, NUC code, contact details, URLs)
- Maps to LinkML HeritageCustodian schema (v0.2.1)
- Generates GHCID persistent identifiers
- Exports to YAML, JSON, and CSV formats
- Tracks provenance metadata
Data Quality:
- Tier: TIER_1_AUTHORITATIVE (official Trove registry)
- Source: National Library of Australia Trove API
- Coverage: Only organizations that contribute to Trove (subset of full ISIL registry)
Usage:
python scripts/extract_trove_contributors.py --api-key YOUR_TROVE_API_KEY
Requirements:
- Trove API key (free registration at https://trove.nla.gov.au/about/create-something/using-api)
- Python packages: requests, pyyaml, pydantic
Author: GLAM Data Extraction Project
License: CC0 1.0 Universal
Version: 1.0.0
"""
import argparse
import csv
import json
import logging
import sys
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
from urllib.parse import urljoin, urlparse
import requests
import yaml
# Configure logging
# Root-logger setup for the whole script: INFO level, timestamped messages.
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger shared by every function and class below.
logger = logging.getLogger(__name__)
# =============================================================================
# TROVE API CLIENT
# =============================================================================
class TroveAPIClient:
    """Client for the Trove API v3 contributor endpoints."""

    BASE_URL = "https://api.trove.nla.gov.au/v3/"

    def __init__(self, api_key: str):
        """Initialize Trove API client.

        Args:
            api_key: Trove API key (obtain from https://trove.nla.gov.au/)
        """
        self.api_key = api_key
        self.session = requests.Session()
        # Identify the project to the API operator (politeness / rate-limit triage).
        self.session.headers.update({
            'User-Agent': 'GLAM-Heritage-Custodian-Extractor/1.0 (Research Project)'
        })

    def get_all_contributors(self, encoding: str = "json") -> List[Dict[str, Any]]:
        """Retrieve all Trove contributors (brief records).

        Args:
            encoding: Response format ('json' or 'xml')

        Returns:
            List of contributor dictionaries (empty list on request failure)
        """
        logger.info("Fetching all Trove contributors...")
        url = urljoin(self.BASE_URL, "contributor")
        params = {
            'key': self.api_key,
            'encoding': encoding,
            'reclevel': 'brief'  # Start with brief records
        }
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if 'contributor' in data:
                contributors = data['contributor']
                # Fix: the API may return a single object rather than a list;
                # normalize here to match get_contributor_details() handling.
                if isinstance(contributors, dict):
                    contributors = [contributors]
                logger.info(f"Found {len(contributors)} contributors")
                return contributors
            else:
                logger.warning("No 'contributor' key in API response")
                return []
        except requests.exceptions.RequestException as e:
            logger.error(f"API request failed: {e}")
            return []

    def get_contributor_details(self, nuc_id: str, encoding: str = "json") -> Optional[Dict[str, Any]]:
        """Retrieve detailed information for a single contributor.

        Args:
            nuc_id: NUC (National Union Catalogue) identifier
            encoding: Response format ('json' or 'xml')

        Returns:
            Contributor details dictionary or None if not found / request failed
        """
        url = urljoin(self.BASE_URL, f"contributor/{nuc_id}")
        params = {
            'key': self.api_key,
            'encoding': encoding,
            'reclevel': 'full'  # Get complete metadata
        }
        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()
            if 'contributor' in data:
                # Payload shape varies: a list for multi-record responses,
                # a plain dict for single records.
                return data['contributor'][0] if isinstance(data['contributor'], list) else data['contributor']
            else:
                logger.warning(f"No data returned for NUC {nuc_id}")
                return None
        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch details for NUC {nuc_id}: {e}")
            return None

    def get_all_contributors_with_details(self, delay: float = 0.3) -> List[Dict[str, Any]]:
        """Retrieve all contributors with full details.

        Respects Trove API rate limits (200 requests per minute = ~0.3s delay).

        Args:
            delay: Delay in seconds between API calls (default 0.3s for 200 req/min)

        Returns:
            List of contributor dictionaries with full metadata (brief records
            are kept as a fallback when the detail fetch fails)
        """
        # Get the brief list first; detail calls are one request per NUC.
        contributors = self.get_all_contributors()
        if not contributors:
            logger.error("No contributors found")
            return []
        logger.info(f"Fetching full details for {len(contributors)} contributors...")
        detailed_contributors = []
        for i, contrib in enumerate(contributors, 1):
            nuc_id = contrib.get('id') or contrib.get('nuc')
            if not nuc_id:
                logger.warning(f"Contributor {i} has no NUC ID: {contrib}")
                continue
            logger.info(f"[{i}/{len(contributors)}] Fetching details for {nuc_id}...")
            details = self.get_contributor_details(nuc_id)
            if details:
                detailed_contributors.append(details)
            else:
                # Fallback to brief record if full details fail
                logger.warning(f"Using brief record for {nuc_id}")
                detailed_contributors.append(contrib)
            # Rate limiting: sleep between calls, but not after the last one.
            if i < len(contributors):
                time.sleep(delay)
        logger.info(f"Successfully retrieved {len(detailed_contributors)} detailed records")
        return detailed_contributors
# =============================================================================
# GHCID GENERATOR
# =============================================================================
def generate_ghcid_components(institution_type: str, country: str = "AU",
                              region: Optional[str] = None, city: Optional[str] = None,
                              name_abbreviation: Optional[str] = None) -> str:
    """Assemble the human-readable GHCID base identifier (no Q-number).

    Components are joined with hyphens in the fixed order:
    country, region (optional), city code (optional), type code, abbreviation.

    Args:
        institution_type: Institution type code (G/L/A/M/etc.)
        country: ISO 3166-1 alpha-2 country code
        region: State/province/region code
        city: City name; reduced to its first 3 characters, uppercased,
            with spaces removed
        name_abbreviation: Institution name abbreviation (uppercased,
            spaces removed)

    Returns:
        GHCID base string (e.g., "AU-NSW-SYD-L-NLA")
    """
    parts: List[str] = [country]
    if region:
        parts.append(region)
    if city:
        # First three characters of the city, uppercased, spaces dropped.
        parts.append(city[:3].upper().replace(' ', ''))
    parts.append(institution_type)
    if name_abbreviation:
        parts.append(name_abbreviation.upper().replace(' ', ''))
    return '-'.join(parts)
def generate_ghcid_uuid_v5(ghcid_base: str) -> str:
    """Derive the deterministic UUID v5 for a GHCID base string.

    UUID v5 is SHA-1 based (RFC 4122); the same input always yields the
    same UUID, so the identifier is reproducible across runs.

    Args:
        ghcid_base: Base GHCID string

    Returns:
        UUID v5 string
    """
    # This literal is uuid.NAMESPACE_DNS; kept inline to make the
    # namespace choice explicit at the call site.
    dns_namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')
    return str(uuid.uuid5(dns_namespace, ghcid_base))
def generate_ghcid_numeric(ghcid_base: str) -> int:
    """Derive a deterministic 64-bit numeric GHCID from the base string.

    Hashes with SHA-256 and keeps the first 8 bytes as an unsigned
    big-endian integer, so the value is stable across runs and platforms.

    Args:
        ghcid_base: Base GHCID string

    Returns:
        Unsigned 64-bit integer
    """
    import hashlib
    digest = hashlib.sha256(ghcid_base.encode('utf-8')).digest()
    # Leading 8 bytes -> unsigned 64-bit integer (big-endian).
    return int.from_bytes(digest[:8], byteorder='big', signed=False)
# =============================================================================
# INSTITUTION TYPE CLASSIFIER
# =============================================================================
def classify_institution_type(contributor: Dict[str, Any]) -> str:
    """Classify institution type based on Trove contributor metadata.

    Uses GLAMORCUBESFIXPHDNT taxonomy (19-type system). Keyword lists are
    checked in a fixed priority order and the first match wins — e.g.
    "University of X Library" classifies as 'L', not 'E'.

    Args:
        contributor: Trove contributor dictionary

    Returns:
        Institution type code (G/L/A/M/etc.); 'U' (unknown) when no
        keyword matches or the name is missing.
    """
    name = contributor.get('name', '').lower()
    # Library indicators
    if any(keyword in name for keyword in ['library', 'bibliothek', 'biblioteca', 'bibliotheque']):
        return 'L'
    # Archive indicators
    if any(keyword in name for keyword in ['archive', 'archiv', 'archivo', 'records']):
        return 'A'
    # Museum indicators
    if any(keyword in name for keyword in ['museum', 'museo', 'musee', 'gallery']):
        # Distinguish between museum and gallery
        if 'gallery' in name and 'museum' not in name:
            return 'G'
        return 'M'
    # University indicators (Education Provider).
    # NOTE(review): 'institut' also matches 'institute', so research
    # institutes are classified 'E' here before the research check below
    # can see them — confirm this precedence is intended.
    if any(keyword in name for keyword in ['university', 'college', 'school', 'institut']):
        return 'E'
    # Official institution indicators
    if any(keyword in name for keyword in ['national', 'state', 'government', 'department', 'ministry']):
        return 'O'
    # Research center indicators ('institute' removed: it was unreachable,
    # since 'institut' above always matched first)
    if any(keyword in name for keyword in ['research', 'center', 'centre']):
        return 'R'
    # Society indicators
    if any(keyword in name for keyword in ['society', 'association', 'club', 'historical']):
        return 'S'
    # Default: UNKNOWN
    return 'U'
# =============================================================================
# TROVE TO LINKML CONVERTER
# =============================================================================
class TroveToLinkMLConverter:
    """Convert Trove contributor data to LinkML HeritageCustodian records."""

    def __init__(self):
        """Initialize converter with a single shared extraction timestamp."""
        # One UTC timestamp for the whole run, stamped into every record's
        # provenance block.
        self.extraction_date = datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _parse_location(location_str: str):
        """Parse a 'City, ..., State' string into (city, region).

        Returns (None, None) when the string is empty or has no comma.
        The region is the last comma-separated part, uppercased and
        truncated to 3 characters (state abbreviation).
        """
        city = None
        region = None
        if location_str:
            parts = location_str.split(',')
            if len(parts) >= 2:
                city = parts[0].strip()
                region = parts[-1].strip().upper()[:3]  # State abbreviation
        return city, region

    def convert_contributor(self, contributor: Dict[str, Any]) -> Dict[str, Any]:
        """Convert single Trove contributor to HeritageCustodian record.

        Args:
            contributor: Trove API contributor dictionary

        Returns:
            LinkML-compliant HeritageCustodian dictionary
        """
        nuc_id = contributor.get('id') or contributor.get('nuc')
        name = contributor.get('name', 'Unknown Institution')
        # Classify institution type
        inst_type = classify_institution_type(contributor)
        # Location format varies in Trove data; best-effort parse.
        location_str = contributor.get('location', '')
        city, region = self._parse_location(location_str)
        # Abbreviate from the NUC code when available, else from the name.
        name_abbrev = nuc_id if nuc_id else name[:3]
        ghcid_base = generate_ghcid_components(
            institution_type=inst_type,
            country='AU',
            region=region,
            city=city,
            name_abbreviation=name_abbrev
        )
        ghcid_uuid_v5 = generate_ghcid_uuid_v5(ghcid_base)
        ghcid_numeric = generate_ghcid_numeric(ghcid_base)
        # Build HeritageCustodian record
        record = {
            'id': f"https://w3id.org/heritage/custodian/au/{nuc_id.lower() if nuc_id else ghcid_uuid_v5}",
            'record_id': str(uuid.uuid4()),  # UUID v4 for database record
            'ghcid_uuid': ghcid_uuid_v5,
            'ghcid_numeric': ghcid_numeric,
            'ghcid_current': ghcid_base,
            'name': name,
            'institution_type': inst_type,
            'identifiers': [],
            'locations': [],
            'provenance': {
                'data_source': 'TROVE_API',
                'data_tier': 'TIER_1_AUTHORITATIVE',
                'extraction_date': self.extraction_date,
                'extraction_method': 'Trove API v3 /contributor endpoint with reclevel=full',
                'confidence_score': 0.95,
                'source_url': f"https://api.trove.nla.gov.au/v3/contributor/{nuc_id}" if nuc_id else None
            }
        }
        # Add NUC identifier (Australia's ISIL equivalent)
        if nuc_id:
            record['identifiers'].append({
                'identifier_scheme': 'NUC',
                'identifier_value': nuc_id,
                'identifier_url': f"https://www.nla.gov.au/apps/ilrs/?action=IlrsSearch&term={nuc_id}"
            })
            # NUC codes map to ISIL format AU-{NUC}
            record['identifiers'].append({
                'identifier_scheme': 'ISIL',
                'identifier_value': f"AU-{nuc_id}",
                'identifier_url': None
            })
        # Add alternative names
        if contributor.get('shortName'):
            record['alternative_names'] = [contributor['shortName']]
        # Add official name (if different from display name)
        if contributor.get('fullName'):
            record['official_name'] = contributor['fullName']
        # Add homepage URL
        if contributor.get('url'):
            record['homepage'] = contributor['url']
        # Add catalogue URL as digital platform
        if contributor.get('catalogueUrl'):
            record['digital_platforms'] = [{
                'platform_name': 'Institutional Catalogue',
                'platform_url': contributor['catalogueUrl'],
                'platform_type': 'CATALOGUE'
            }]
        # Add location data
        if location_str:
            record['locations'].append({
                'city': city,
                'region': region,
                'country': 'AU'
            })
        # Fix: build description notes as a list and join, instead of
        # prefixing each note with "\n\n" (which left leading blank lines
        # in the YAML/JSON exports).
        notes = []
        if contributor.get('accessPolicy'):
            notes.append(f"Access Policy: {contributor['accessPolicy']}")
        # Flag is recorded even when falsy (e.g. False), matching the
        # original `'openToPublic' in contributor` check.
        if 'openToPublic' in contributor:
            notes.append(f"Open to Public: {contributor['openToPublic']}")
        if notes:
            record['description'] = '\n\n'.join(notes)
        return record

    def convert_all(self, contributors: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert all Trove contributors to HeritageCustodian records.

        Contributors that fail conversion are logged and skipped.

        Args:
            contributors: List of Trove contributor dictionaries

        Returns:
            List of LinkML-compliant HeritageCustodian dictionaries
        """
        logger.info(f"Converting {len(contributors)} contributors to LinkML format...")
        records = []
        for i, contrib in enumerate(contributors, 1):
            try:
                record = self.convert_contributor(contrib)
                records.append(record)
                if i % 50 == 0:
                    logger.info(f"Converted {i}/{len(contributors)} records...")
            except Exception as e:
                nuc_id = contrib.get('id', 'unknown')
                logger.error(f"Failed to convert contributor {nuc_id}: {e}")
                continue
        logger.info(f"Successfully converted {len(records)} records")
        return records
# =============================================================================
# EXPORT FUNCTIONS
# =============================================================================
def export_to_yaml(records: List[Dict[str, Any]], output_path: Path):
    """Write records to a YAML file, preserving key insertion order.

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to YAML: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as handle:
        yaml.safe_dump(
            records,
            handle,
            default_flow_style=False,  # block style: one key per line
            allow_unicode=True,
            sort_keys=False,           # keep schema field ordering
        )
    logger.info(f"Exported {len(records)} records to {output_path}")
def export_to_json(records: List[Dict[str, Any]], output_path: Path):
    """Write records to a pretty-printed UTF-8 JSON file.

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to JSON: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps non-ASCII institution names readable.
        json.dump(records, handle, indent=2, ensure_ascii=False)
    logger.info(f"Exported {len(records)} records to {output_path}")
def export_to_csv(records: List[Dict[str, Any]], output_path: Path):
    """Write records to a flattened CSV file.

    Nested structures (identifiers, locations, provenance, platforms)
    are flattened to one row per record; missing fields become empty
    cells.

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to CSV: {output_path}")
    if not records:
        logger.warning("No records to export")
        return
    # Column layout of the flattened output.
    columns = [
        'id', 'record_id', 'ghcid_uuid', 'ghcid_numeric', 'ghcid_current',
        'name', 'official_name', 'alternative_names', 'institution_type',
        'nuc_code', 'isil_code', 'homepage', 'catalogue_url',
        'city', 'region', 'country',
        'data_source', 'data_tier', 'extraction_date', 'confidence_score',
        'description'
    ]
    with open(output_path, 'w', encoding='utf-8', newline='') as handle:
        writer = csv.DictWriter(handle, fieldnames=columns)
        writer.writeheader()
        for rec in records:
            # Scalar fields copied straight across.
            row = {key: rec.get(key) for key in (
                'id', 'record_id', 'ghcid_uuid', 'ghcid_numeric',
                'ghcid_current', 'name', 'official_name',
                'institution_type', 'homepage')}
            row['alternative_names'] = '; '.join(rec.get('alternative_names', []))
            row['description'] = rec.get('description', '').strip()
            # Pull NUC and ISIL codes out of the identifier list.
            for ident in rec.get('identifiers', []):
                scheme = ident['identifier_scheme']
                if scheme == 'NUC':
                    row['nuc_code'] = ident['identifier_value']
                elif scheme == 'ISIL':
                    row['isil_code'] = ident['identifier_value']
            # First digital platform, if any, supplies the catalogue URL.
            platforms = rec.get('digital_platforms', [])
            if platforms:
                row['catalogue_url'] = platforms[0].get('platform_url')
            # First location, if any, supplies city/region/country.
            places = rec.get('locations', [])
            if places:
                first = places[0]
                row['city'] = first.get('city')
                row['region'] = first.get('region')
                row['country'] = first.get('country')
            # Provenance metadata columns.
            prov = rec.get('provenance', {})
            for key in ('data_source', 'data_tier', 'extraction_date', 'confidence_score'):
                row[key] = prov.get(key)
            writer.writerow(row)
    logger.info(f"Exported {len(records)} records to {output_path}")
# =============================================================================
# MAIN EXTRACTION FUNCTION
# =============================================================================
def main():
    """Command-line entry point: extract, convert, export, summarize."""
    parser = argparse.ArgumentParser(
        description='Extract Australian heritage custodians from Trove API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument('--api-key', required=True,
                        help='Trove API key (get from https://trove.nla.gov.au/)')
    parser.add_argument('--output-dir', type=Path, default=Path('data/instances'),
                        help='Output directory (default: data/instances)')
    parser.add_argument('--delay', type=float, default=0.3,
                        help='Delay between API calls in seconds (default: 0.3 for 200 req/min)')
    parser.add_argument('--formats', nargs='+', choices=['yaml', 'json', 'csv'],
                        default=['yaml', 'json', 'csv'],
                        help='Output formats (default: all)')
    args = parser.parse_args()

    # Ensure output directory exists before any export runs.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    logger.info("Initializing Trove API client...")
    client = TroveAPIClient(api_key=args.api_key)

    # Extract all contributors with full details; abort on empty result.
    contributors = client.get_all_contributors_with_details(delay=args.delay)
    if not contributors:
        logger.error("No contributors extracted. Exiting.")
        sys.exit(1)

    # Convert to LinkML format; abort if nothing converted.
    records = TroveToLinkMLConverter().convert_all(contributors)
    if not records:
        logger.error("No records generated. Exiting.")
        sys.exit(1)

    # Timestamped filenames so repeated runs never overwrite each other.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    exporters = {
        'yaml': (export_to_yaml, f'trove_contributors_{stamp}.yaml'),
        'json': (export_to_json, f'trove_contributors_{stamp}.json'),
        'csv': (export_to_csv, f'trove_contributors_{stamp}.csv'),
    }
    # Iterate the table (yaml, json, csv) so export order stays fixed
    # regardless of the order the user listed --formats in.
    for fmt, (writer, filename) in exporters.items():
        if fmt in args.formats:
            writer(records, args.output_dir / filename)

    # Summary report
    logger.info("\n" + "="*80)
    logger.info("EXTRACTION SUMMARY")
    logger.info("="*80)
    logger.info(f"Total contributors extracted: {len(contributors)}")
    logger.info(f"Total records converted: {len(records)}")
    logger.info(f"Output directory: {args.output_dir}")

    # Tally records per institution type code.
    tally = {}
    for rec in records:
        code = rec.get('institution_type', 'UNKNOWN')
        tally[code] = tally.get(code, 0) + 1
    logger.info("\nInstitution Type Distribution:")
    for code, count in sorted(tally.items()):
        logger.info(f" {code}: {count}")
    logger.info("\nExtraction complete!")
# Run the extraction workflow only when executed as a script.
if __name__ == '__main__':
    main()