#!/usr/bin/env python3 """ Extract Australian Heritage Custodian Organizations from Trove API =================================================================== This script extracts all contributor organizations from the Trove API (Australian National Library's aggregation service) and converts them to LinkML-compliant HeritageCustodian records. Trove contributors are organizations that contribute collections data to the Australian National Bibliographic Database (ANBD) and Trove. Each contributor has a unique NUC (National Union Catalogue) symbol, which is Australia's implementation of the ISIL standard. Features: - Extracts all Trove contributors via API - Retrieves full metadata (name, NUC code, contact details, URLs) - Maps to LinkML HeritageCustodian schema (v0.2.1) - Generates GHCID persistent identifiers - Exports to YAML, JSON, and CSV formats - Tracks provenance metadata Data Quality: - Tier: TIER_1_AUTHORITATIVE (official Trove registry) - Source: National Library of Australia Trove API - Coverage: Only organizations that contribute to Trove (subset of full ISIL registry) Usage: python scripts/extract_trove_contributors.py --api-key YOUR_TROVE_API_KEY Requirements: - Trove API key (free registration at https://trove.nla.gov.au/about/create-something/using-api) - Python packages: requests, pyyaml, pydantic Author: GLAM Data Extraction Project License: CC0 1.0 Universal Version: 1.0.0 """ import argparse import csv import json import logging import sys import time import uuid from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional from urllib.parse import urljoin, urlparse import requests import yaml # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) # ============================================================================= # TROVE API CLIENT # 
# =============================================================================

class TroveAPIClient:
    """Client for Trove API v3.

    Wraps a persistent ``requests.Session`` (connection reuse across the many
    per-contributor calls) and exposes helpers for listing contributors and
    fetching per-contributor detail records.
    """

    BASE_URL = "https://api.trove.nla.gov.au/v3/"

    def __init__(self, api_key: str):
        """Initialize Trove API client.

        Args:
            api_key: Trove API key (obtain from https://trove.nla.gov.au/)
        """
        self.api_key = api_key
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'GLAM-Heritage-Custodian-Extractor/1.0 (Research Project)'
        })

    def get_all_contributors(self, encoding: str = "json") -> List[Dict[str, Any]]:
        """Retrieve all Trove contributors (brief records).

        Args:
            encoding: Response format ('json' or 'xml')

        Returns:
            List of contributor dictionaries; an empty list on request
            failure or if the response has no 'contributor' key.
        """
        logger.info("Fetching all Trove contributors...")

        url = urljoin(self.BASE_URL, "contributor")
        params = {
            'key': self.api_key,
            'encoding': encoding,
            'reclevel': 'brief'  # Start with brief records; details fetched later
        }

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            # Extract contributors from response
            if 'contributor' in data:
                contributors = data['contributor']
                logger.info(f"Found {len(contributors)} contributors")
                return contributors
            logger.warning("No 'contributor' key in API response")
            return []

        except requests.exceptions.RequestException as e:
            # Best-effort: log and return empty so the caller can decide to abort.
            logger.error(f"API request failed: {e}")
            return []

    def get_contributor_details(self, nuc_id: str,
                                encoding: str = "json") -> Optional[Dict[str, Any]]:
        """Retrieve detailed information for a single contributor.

        Args:
            nuc_id: NUC (National Union Catalogue) identifier
            encoding: Response format ('json' or 'xml')

        Returns:
            Contributor details dictionary or None if not found / on error.
        """
        url = urljoin(self.BASE_URL, f"contributor/{nuc_id}")
        params = {
            'key': self.api_key,
            'encoding': encoding,
            'reclevel': 'full'  # Get complete metadata
        }

        try:
            response = self.session.get(url, params=params, timeout=30)
            response.raise_for_status()
            data = response.json()

            if 'contributor' in data:
                # The API may return either a single object or a one-element list.
                return data['contributor'][0] if isinstance(data['contributor'], list) else data['contributor']
            logger.warning(f"No data returned for NUC {nuc_id}")
            return None

        except requests.exceptions.RequestException as e:
            logger.error(f"Failed to fetch details for NUC {nuc_id}: {e}")
            return None

    def get_all_contributors_with_details(self, delay: float = 0.3) -> List[Dict[str, Any]]:
        """Retrieve all contributors with full details.

        Respects Trove API rate limits (200 requests per minute = ~0.3s delay).

        Args:
            delay: Delay in seconds between API calls (default 0.3s for 200 req/min)

        Returns:
            List of contributor dictionaries with full metadata
        """
        # Get list of all contributors (brief records)
        contributors = self.get_all_contributors()
        if not contributors:
            logger.error("No contributors found")
            return []

        logger.info(f"Fetching full details for {len(contributors)} contributors...")

        detailed_contributors = []
        for i, contrib in enumerate(contributors, 1):
            nuc_id = contrib.get('id') or contrib.get('nuc')
            if not nuc_id:
                logger.warning(f"Contributor {i} has no NUC ID: {contrib}")
                continue

            logger.info(f"[{i}/{len(contributors)}] Fetching details for {nuc_id}...")

            details = self.get_contributor_details(nuc_id)
            if details:
                detailed_contributors.append(details)
            else:
                # Fallback to brief record if full details fail
                logger.warning(f"Using brief record for {nuc_id}")
                detailed_contributors.append(contrib)

            # Rate limiting: sleep between calls, but not after the last one.
            if i < len(contributors):
                time.sleep(delay)

        logger.info(f"Successfully retrieved {len(detailed_contributors)} detailed records")
        return detailed_contributors


# =============================================================================
# GHCID GENERATOR
# =============================================================================

def generate_ghcid_components(institution_type: str,
                              country: str = "AU",
                              region: Optional[str] = None,
                              city: Optional[str] = None,
                              name_abbreviation: Optional[str] = None) -> str:
    """Generate GHCID base identifier (without Q-number).

    Args:
        institution_type: Institution type code (G/L/A/M/etc.)
        country: ISO 3166-1 alpha-2 country code
        region: State/province/region code
        city: City name; normalized to a 3-letter uppercase code
        name_abbreviation: Institution name abbreviation (2-3 letters)

    Returns:
        GHCID base string (e.g., "AU-NSW-SYD-L-NLA")
    """
    components = [country]

    if region:
        components.append(region)

    if city:
        # Normalize city name to a 3-letter code. Strip spaces BEFORE
        # truncating so multi-word cities ("La Paz" -> "LAP") still yield a
        # full 3-letter code (previously "La "[:3] -> "LA").
        city_code = city.replace(' ', '')[:3].upper()
        components.append(city_code)

    # Institution type code
    components.append(institution_type)

    # Name abbreviation
    if name_abbreviation:
        components.append(name_abbreviation.upper().replace(' ', ''))

    return '-'.join(components)


def generate_ghcid_uuid_v5(ghcid_base: str) -> str:
    """Generate deterministic UUID v5 from GHCID base.

    Uses SHA-1 hashing (RFC 4122 standard).

    Args:
        ghcid_base: Base GHCID string

    Returns:
        UUID v5 string
    """
    namespace = uuid.UUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8')  # DNS namespace
    return str(uuid.uuid5(namespace, ghcid_base))


def generate_ghcid_numeric(ghcid_base: str) -> int:
    """Generate 64-bit numeric GHCID from base string.

    Uses SHA-256 truncation for deterministic numeric ID.

    Args:
        ghcid_base: Base GHCID string

    Returns:
        64-bit unsigned integer
    """
    import hashlib
    hash_digest = hashlib.sha256(ghcid_base.encode('utf-8')).digest()
    # Take first 8 bytes and convert to 64-bit integer
    return int.from_bytes(hash_digest[:8], byteorder='big', signed=False)


# =============================================================================
# INSTITUTION TYPE CLASSIFIER
# =============================================================================

def classify_institution_type(contributor: Dict[str, Any]) -> str:
    """Classify institution type based on Trove contributor metadata.

    Uses GLAMORCUBESFIXPHDNT taxonomy (19-type system). Keyword matching is
    ordered: the first matching category wins.

    Args:
        contributor: Trove contributor dictionary

    Returns:
        Institution type code (G/L/A/M/etc.), 'U' when no keyword matches.
    """
    name = contributor.get('name', '').lower()

    # Library indicators
    if any(keyword in name for keyword in ['library', 'bibliothek', 'biblioteca', 'bibliotheque']):
        return 'L'

    # Archive indicators
    if any(keyword in name for keyword in ['archive', 'archiv', 'archivo', 'records']):
        return 'A'

    # Museum indicators
    if any(keyword in name for keyword in ['museum', 'museo', 'musee', 'gallery']):
        # Distinguish between museum and gallery
        if 'gallery' in name and 'museum' not in name:
            return 'G'
        return 'M'

    # University indicators (Education Provider)
    if any(keyword in name for keyword in ['university', 'college', 'school', 'institut']):
        return 'E'

    # Official institution indicators
    if any(keyword in name for keyword in ['national', 'state', 'government', 'department', 'ministry']):
        return 'O'

    # Research center indicators
    if any(keyword in name for keyword in ['research', 'institute', 'center', 'centre']):
        return 'R'

    # Society indicators
    if any(keyword in name for keyword in ['society', 'association', 'club', 'historical']):
        return 'S'

    # Default: UNKNOWN
    return 'U'


# =============================================================================
# TROVE TO LINKML CONVERTER
#
# =============================================================================

class TroveToLinkMLConverter:
    """Convert Trove contributor data to LinkML HeritageCustodian records."""

    def __init__(self):
        """Initialize converter."""
        # One UTC ISO-8601 timestamp shared by every record this instance
        # produces, so a batch carries consistent provenance.
        self.extraction_date = datetime.now(timezone.utc).isoformat()

    def convert_contributor(self, contributor: Dict[str, Any]) -> Dict[str, Any]:
        """Convert single Trove contributor to HeritageCustodian record.

        Args:
            contributor: Trove API contributor dictionary

        Returns:
            LinkML-compliant HeritageCustodian dictionary
        """
        nuc_id = contributor.get('id') or contributor.get('nuc')
        name = contributor.get('name', 'Unknown Institution')

        # Classify institution type from name keywords
        inst_type = classify_institution_type(contributor)

        # Generate GHCID components.
        # Extract location from contributor data (if available).
        location_str = contributor.get('location', '')
        city = None
        region = None

        # Try to parse location (format varies in Trove data); assumes a
        # "City, ..., State"-style comma-separated string — TODO confirm.
        if location_str:
            parts = location_str.split(',')
            if len(parts) >= 2:
                city = parts[0].strip()
                region = parts[-1].strip().upper()[:3]  # State abbreviation

        # Generate abbreviated name from NUC code or name
        name_abbrev = nuc_id if nuc_id else name[:3]

        ghcid_base = generate_ghcid_components(
            institution_type=inst_type,
            country='AU',
            region=region,
            city=city,
            name_abbreviation=name_abbrev
        )

        ghcid_uuid_v5 = generate_ghcid_uuid_v5(ghcid_base)
        ghcid_numeric = generate_ghcid_numeric(ghcid_base)

        # Build HeritageCustodian record. GHCID fields are deterministic
        # (derived from ghcid_base); record_id is random per run.
        record = {
            'id': f"https://w3id.org/heritage/custodian/au/{nuc_id.lower() if nuc_id else ghcid_uuid_v5}",
            'record_id': str(uuid.uuid4()),  # UUID v4 for database record
            'ghcid_uuid': ghcid_uuid_v5,
            'ghcid_numeric': ghcid_numeric,
            'ghcid_current': ghcid_base,
            'name': name,
            'institution_type': inst_type,
            'identifiers': [],
            'locations': [],
            'provenance': {
                'data_source': 'TROVE_API',
                'data_tier': 'TIER_1_AUTHORITATIVE',
                'extraction_date': self.extraction_date,
                'extraction_method': 'Trove API v3 /contributor endpoint with reclevel=full',
                'confidence_score': 0.95,
                'source_url': f"https://api.trove.nla.gov.au/v3/contributor/{nuc_id}" if nuc_id else None
            }
        }

        # Add NUC identifier (Australia's ISIL equivalent)
        if nuc_id:
            record['identifiers'].append({
                'identifier_scheme': 'NUC',
                'identifier_value': nuc_id,
                'identifier_url': f"https://www.nla.gov.au/apps/ilrs/?action=IlrsSearch&term={nuc_id}"
            })

            # NUC codes map to ISIL format AU-{NUC}
            record['identifiers'].append({
                'identifier_scheme': 'ISIL',
                'identifier_value': f"AU-{nuc_id}",
                'identifier_url': None
            })

        # Add alternative names
        alt_names = []
        if 'shortName' in contributor and contributor['shortName']:
            alt_names.append(contributor['shortName'])
        if alt_names:
            record['alternative_names'] = alt_names

        # Add official name (if different from display name)
        if 'fullName' in contributor and contributor['fullName']:
            record['official_name'] = contributor['fullName']

        # Add homepage URL
        if 'url' in contributor and contributor['url']:
            record['homepage'] = contributor['url']

        # Add catalogue URL as digital platform
        if 'catalogueUrl' in contributor and contributor['catalogueUrl']:
            record['digital_platforms'] = [{
                'platform_name': 'Institutional Catalogue',
                'platform_url': contributor['catalogueUrl'],
                'platform_type': 'CATALOGUE'
            }]

        # Add location data (city/region parsed above; may be None)
        if location_str:
            location = {
                'city': city,
                'region': region,
                'country': 'AU'
            }
            record['locations'].append(location)

        # Add access policy information (if available). Appended to the
        # description field; leading blank lines are stripped at CSV export.
        if 'accessPolicy' in contributor and contributor['accessPolicy']:
            if 'description' not in record:
                record['description'] = ''
            record['description'] += f"\n\nAccess Policy: {contributor['accessPolicy']}"

        # Add "open to public" flag
        if 'openToPublic' in contributor:
            if 'description' not in record:
                record['description'] = ''
            record['description'] += f"\n\nOpen to Public: {contributor['openToPublic']}"

        return record

    def convert_all(self, contributors: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert all Trove contributors to HeritageCustodian records.

        Per-record failures are logged and skipped so one bad contributor
        does not abort the batch.

        Args:
            contributors: List of Trove contributor dictionaries

        Returns:
            List of LinkML-compliant HeritageCustodian dictionaries
        """
        logger.info(f"Converting {len(contributors)} contributors to LinkML format...")

        records = []
        for i, contrib in enumerate(contributors, 1):
            try:
                record = self.convert_contributor(contrib)
                records.append(record)

                # Progress heartbeat every 50 records
                if i % 50 == 0:
                    logger.info(f"Converted {i}/{len(contributors)} records...")

            except Exception as e:
                nuc_id = contrib.get('id', 'unknown')
                logger.error(f"Failed to convert contributor {nuc_id}: {e}")
                continue

        logger.info(f"Successfully converted {len(records)} records")
        return records


# =============================================================================
# EXPORT FUNCTIONS
# =============================================================================

def export_to_yaml(records: List[Dict[str, Any]], output_path: Path) -> None:
    """Export records to YAML format.

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to YAML: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        # sort_keys=False preserves the record's field order
        yaml.safe_dump(records, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    logger.info(f"Exported {len(records)} records to {output_path}")


def export_to_json(records: List[Dict[str, Any]], output_path: Path) -> None:
    """Export records to JSON format.

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to JSON: {output_path}")

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII institution names readable
        json.dump(records, f, indent=2, ensure_ascii=False)

    logger.info(f"Exported {len(records)} records to {output_path}")


def export_to_csv(records: List[Dict[str, Any]], output_path: Path) -> None:
    """Export records to CSV format (flattened).

    Nested structures (identifiers, locations, provenance, platforms) are
    flattened into one row per record; missing columns are written empty
    (csv.DictWriter restval).

    Args:
        records: List of HeritageCustodian dictionaries
        output_path: Output file path
    """
    logger.info(f"Exporting to CSV: {output_path}")

    if not records:
        logger.warning("No records to export")
        return

    # Define CSV columns
    fieldnames = [
        'id', 'record_id', 'ghcid_uuid', 'ghcid_numeric', 'ghcid_current',
        'name', 'official_name', 'alternative_names', 'institution_type',
        'nuc_code', 'isil_code', 'homepage', 'catalogue_url',
        'city', 'region', 'country',
        'data_source', 'data_tier', 'extraction_date', 'confidence_score',
        'description'
    ]

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for record in records:
            # Flatten record for CSV
            flat_record = {
                'id': record.get('id'),
                'record_id': record.get('record_id'),
                'ghcid_uuid': record.get('ghcid_uuid'),
                'ghcid_numeric': record.get('ghcid_numeric'),
                'ghcid_current': record.get('ghcid_current'),
                'name': record.get('name'),
                'official_name': record.get('official_name'),
                'alternative_names': '; '.join(record.get('alternative_names', [])),
                'institution_type': record.get('institution_type'),
                # strip() removes the leading blank lines added when the
                # description was built by appending "\n\n..." sections
                'description': record.get('description', '').strip()
            }

            # Extract NUC and ISIL codes
            identifiers = record.get('identifiers', [])
            for identifier in identifiers:
                if identifier['identifier_scheme'] == 'NUC':
                    flat_record['nuc_code'] = identifier['identifier_value']
                elif identifier['identifier_scheme'] == 'ISIL':
                    flat_record['isil_code'] = identifier['identifier_value']

            # Extract homepage and catalogue URL (first platform only)
            flat_record['homepage'] = record.get('homepage')
            digital_platforms = record.get('digital_platforms', [])
            if digital_platforms:
                flat_record['catalogue_url'] = digital_platforms[0].get('platform_url')

            # Extract location (first location only)
            locations = record.get('locations', [])
            if locations:
                location = locations[0]
                flat_record['city'] = location.get('city')
                flat_record['region'] = location.get('region')
                flat_record['country'] = location.get('country')

            # Extract provenance
            provenance = record.get('provenance', {})
            flat_record['data_source'] = provenance.get('data_source')
            flat_record['data_tier'] = provenance.get('data_tier')
            flat_record['extraction_date'] = provenance.get('extraction_date')
            flat_record['confidence_score'] = provenance.get('confidence_score')

            writer.writerow(flat_record)

    logger.info(f"Exported {len(records)} records to {output_path}")


# =============================================================================
# MAIN EXTRACTION FUNCTION
# =============================================================================

def main() -> None:
    """Main extraction workflow: fetch, convert, export, summarize."""
    parser = argparse.ArgumentParser(
        description='Extract Australian heritage custodians from Trove API',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument(
        '--api-key',
        required=True,
        help='Trove API key (get from https://trove.nla.gov.au/)'
    )
    parser.add_argument(
        '--output-dir',
        type=Path,
        default=Path('data/instances'),
        help='Output directory (default: data/instances)'
    )
    parser.add_argument(
        '--delay',
        type=float,
        default=0.3,
        help='Delay between API calls in seconds (default: 0.3 for 200 req/min)'
    )
    parser.add_argument(
        '--formats',
        nargs='+',
        choices=['yaml', 'json', 'csv'],
        default=['yaml', 'json', 'csv'],
        help='Output formats (default: all)'
    )

    args = parser.parse_args()

    # Create output directory
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize Trove API client
    logger.info("Initializing Trove API client...")
    client = TroveAPIClient(api_key=args.api_key)

    # Extract all contributors with full details
    contributors = client.get_all_contributors_with_details(delay=args.delay)

    if not contributors:
        logger.error("No contributors extracted. Exiting.")
        sys.exit(1)

    # Convert to LinkML format
    converter = TroveToLinkMLConverter()
    records = converter.convert_all(contributors)

    if not records:
        logger.error("No records generated. Exiting.")
        sys.exit(1)

    # Generate timestamp for filenames (local time, filesystem-safe format)
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Export to requested formats
    if 'yaml' in args.formats:
        yaml_path = args.output_dir / f'trove_contributors_{timestamp}.yaml'
        export_to_yaml(records, yaml_path)

    if 'json' in args.formats:
        json_path = args.output_dir / f'trove_contributors_{timestamp}.json'
        export_to_json(records, json_path)

    if 'csv' in args.formats:
        csv_path = args.output_dir / f'trove_contributors_{timestamp}.csv'
        export_to_csv(records, csv_path)

    # Generate summary report
    logger.info("\n" + "="*80)
    logger.info("EXTRACTION SUMMARY")
    logger.info("="*80)
    logger.info(f"Total contributors extracted: {len(contributors)}")
    logger.info(f"Total records converted: {len(records)}")
    logger.info(f"Output directory: {args.output_dir}")

    # Count by institution type
    type_counts: Dict[str, int] = {}
    for record in records:
        inst_type = record.get('institution_type', 'UNKNOWN')
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    logger.info("\nInstitution Type Distribution:")
    for inst_type, count in sorted(type_counts.items()):
        logger.info(f"  {inst_type}: {count}")

    logger.info("\nExtraction complete!")


if __name__ == '__main__':
    main()