#!/usr/bin/env python3
"""
Enrich ARON institutions with metadata from detail API

This script:
1. Loads czech_unified.yaml
2. Filters for ARON institutions (549)
3. Fetches detailed metadata from ARON API
4. Extracts addresses, contacts, websites
5. Geocodes addresses with Nominatim
6. Saves enriched dataset

API endpoint: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}

Expected improvements:
- Address coverage: 0% → ~80%
- Contact info: 0% → ~50%
- GPS coverage: 0% → ~75% (after geocoding)
"""

import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import requests
import yaml

# Configuration
UNIFIED_FILE = Path("data/instances/czech_unified.yaml")
OUTPUT_FILE = Path("data/instances/czech_unified_enriched.yaml")
REPORT_FILE = Path("CZECH_ARON_ENRICHMENT_REPORT.md")
API_BASE = "https://portal.nacr.cz/aron/api/aron/apu"
NOMINATIM_API = "https://nominatim.openstreetmap.org/search"
RATE_LIMIT = 0.5  # seconds between ARON API calls
GEOCODE_RATE_LIMIT = 1.0  # Nominatim usage policy: max 1 request per second

# User agent for Nominatim (required by its usage policy)
HEADERS = {
    'User-Agent': 'GLAM-Data-Extraction/1.0 (heritage institution research project)'
}

# Czech postal codes are written "XXX XX" (three digits, optional space, two
# digits), e.g. "669 02" — NOT a single 5-digit token.  The previous parser
# looked for one 5-digit token and therefore never found a postal code.
CZ_POSTAL_RE = re.compile(r'(\d{3})\s?(\d{2})')


def _pct(count: int, total: int) -> float:
    """Return ``count``/``total`` as a percentage; 0.0 when total is zero.

    Guards every statistics/report line against ZeroDivisionError when the
    ARON filter matches no institutions.
    """
    return count / total * 100 if total else 0.0


def load_unified_dataset() -> List[Dict]:
    """Load the Czech unified dataset from UNIFIED_FILE.

    Returns:
        List of institution records (one dict per institution).
    """
    print("Loading unified dataset...")
    with open(UNIFIED_FILE, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    print(f"  Loaded {len(data):,} institutions")
    return data


def is_aron_institution(inst: Dict) -> bool:
    """Check if institution is from ARON (not merged with ADR)."""
    source_url = inst.get('provenance', {}).get('source_url', '')
    return 'aron' in source_url and 'adr.cz' not in source_url


def extract_uuid(inst: Dict) -> Optional[str]:
    """Extract the ARON UUID from the institution's identifier list.

    Returns:
        The identifier value for the ``ARON_UUID`` scheme, or None if absent.
    """
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'ARON_UUID':
            return identifier.get('identifier_value')
    return None


def fetch_aron_detail(uuid: str) -> Dict[str, Any]:
    """Fetch detailed metadata for one institution from the ARON API.

    Args:
        uuid: ARON APU identifier.

    Returns:
        Parsed JSON response, or an empty dict on any request failure
        (callers treat an empty dict as "skip this record").
    """
    url = f"{API_BASE}/{uuid}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"  Error fetching {uuid}: {e}")
        return {}


def parse_aron_metadata(detail: Dict) -> Dict[str, Any]:
    """
    Parse metadata from ARON API response.

    Looks for:
    - INST~ADDRESS: Street address
    - INST~PHONE: Phone number
    - INST~EMAIL: Email
    - INST~URL: Website
    - INST~CODE: Institution code (already have this)
    """
    # Map ARON item types directly onto our metadata keys.
    type_map = {
        'INST~ADDRESS': 'address',
        'INST~PHONE': 'phone',
        'INST~EMAIL': 'email',
        'INST~URL': 'website',
    }
    metadata: Dict[str, Any] = {}
    for part in detail.get('parts', []):
        for item in part.get('items', []):
            value = item.get('value', '')
            if not value:
                continue
            key = type_map.get(item.get('type', ''))
            if key:
                metadata[key] = value
    return metadata


def geocode_address(address: str, country: str = 'Czech Republic') -> Optional[Dict]:
    """
    Geocode address using Nominatim API.

    Args:
        address: Street address (Czech formatting expected).
        country: Appended to the query to bias results.

    Returns:
        Dict with ``latitude``, ``longitude`` and ``display_name``,
        or None if the lookup failed or returned no results.
    """
    if not address:
        return None

    params = {
        'q': f"{address}, {country}",
        'format': 'json',
        'limit': 1,
        'countrycodes': 'cz',
    }
    try:
        response = requests.get(
            NOMINATIM_API,
            params=params,
            headers=HEADERS,  # Nominatim requires an identifying User-Agent
            timeout=10,
        )
        response.raise_for_status()
        results = response.json()
        if results:
            result = results[0]
            return {
                'latitude': float(result['lat']),
                'longitude': float(result['lon']),
                'display_name': result.get('display_name', ''),
            }
    except (requests.RequestException, ValueError, KeyError) as e:
        # Narrowed from a bare Exception: network errors, bad JSON, or a
        # malformed result record are the only expected failure modes here.
        print(f"  Geocoding error: {e}")
    return None


def enrich_institution(inst: Dict, metadata: Dict) -> Dict:
    """Add enriched metadata (address, website, contacts) to a record.

    Args:
        inst: Original institution record.
        metadata: Output of :func:`parse_aron_metadata`.

    Returns:
        A (shallow) copy of ``inst`` with locations, identifiers,
        description and provenance updated.
    """
    enriched = inst.copy()

    # Ensure there is at least one location entry to write into.
    if 'locations' not in enriched:
        enriched['locations'] = []
    if len(enriched['locations']) == 0:
        enriched['locations'].append({})
    location = enriched['locations'][0]

    if metadata.get('address'):
        location['street_address'] = metadata['address']
        # Czech format: "Street, PostalCode City"
        # Example: "Nám. Svobody 4, 669 02 Znojmo"
        address_parts = metadata['address'].split(',')
        if len(address_parts) >= 2:
            city_part = address_parts[-1].strip()
            match = CZ_POSTAL_RE.search(city_part)
            if match:
                # BUGFIX: postal codes are "XXX XX", never one 5-digit token,
                # so the old `len(p) == 5 and p.isdigit()` check never fired
                # and the "city" kept the postal code embedded in it.
                location['postal_code'] = f"{match.group(1)} {match.group(2)}"
                city = CZ_POSTAL_RE.sub('', city_part).strip()
            else:
                city = city_part
            if city:
                location['city'] = city
        location['country'] = 'CZ'

    # Add website to identifiers (once)
    if metadata.get('website'):
        if 'identifiers' not in enriched:
            enriched['identifiers'] = []
        if not any(i.get('identifier_scheme') == 'Website'
                   for i in enriched['identifiers']):
            enriched['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website'],
            })

    # Store phone/email in the description (no dedicated schema fields)
    notes = []
    if metadata.get('phone'):
        notes.append(f"Phone: {metadata['phone']}")
    if metadata.get('email'):
        notes.append(f"Email: {metadata['email']}")
    if notes:
        existing_desc = enriched.get('description', '')
        if existing_desc:
            enriched['description'] = f"{existing_desc}\n\nContact: {'; '.join(notes)}"
        else:
            enriched['description'] = f"Contact: {'; '.join(notes)}"

    # Update provenance; setdefault guards against records that lack it
    # (is_aron_institution tolerates a missing provenance, so we must too).
    provenance = enriched.setdefault('provenance', {})
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_method'] = 'ARON API detail endpoint scraping'

    return enriched


def enrich_aron_institutions():
    """Main enrichment workflow: fetch, parse, geocode, save, report."""
    print("=" * 70)
    print("ARON Institution Metadata Enrichment")
    print("=" * 70)

    data = load_unified_dataset()

    # Filter for ARON institutions, keeping original indices for in-place update
    print("\nFiltering ARON institutions...")
    aron_institutions = [(i, inst) for i, inst in enumerate(data)
                         if is_aron_institution(inst)]
    print(f"  Found {len(aron_institutions)} ARON institutions")

    stats = {
        'total': len(aron_institutions),
        'with_address': 0,
        'with_phone': 0,
        'with_email': 0,
        'with_website': 0,
        'geocoded': 0,
        'failed': 0,
    }

    print(f"\nEnriching {len(aron_institutions)} institutions...")
    for idx, (i, inst) in enumerate(aron_institutions, 1):
        uuid = extract_uuid(inst)
        if not uuid:
            print(f"  [{idx}/{len(aron_institutions)}] No UUID for {inst['name']}")
            stats['failed'] += 1
            continue

        detail = fetch_aron_detail(uuid)
        if not detail:
            stats['failed'] += 1
            continue

        metadata = parse_aron_metadata(detail)

        # Track what we found
        if metadata.get('address'):
            stats['with_address'] += 1
        if metadata.get('phone'):
            stats['with_phone'] += 1
        if metadata.get('email'):
            stats['with_email'] += 1
        if metadata.get('website'):
            stats['with_website'] += 1

        data[i] = enrich_institution(inst, metadata)

        # Geocode if we have an address.
        # BUGFIX: track success per-item; the old indicator compared the
        # running `geocoded` count against the loop index, which reported
        # ✗ for nearly every successfully geocoded row.
        geocoded_here = False
        if metadata.get('address'):
            time.sleep(GEOCODE_RATE_LIMIT)  # Nominatim rate limit
            geocode_result = geocode_address(metadata['address'])
            if geocode_result:
                location = data[i]['locations'][0]
                location['latitude'] = geocode_result['latitude']
                location['longitude'] = geocode_result['longitude']
                stats['geocoded'] += 1
                geocoded_here = True

        # Progress (overwritten in place with \r)
        print(f"  [{idx}/{len(aron_institutions)}] {inst['name'][:50]:50} "
              f"[Addr: {'✓' if metadata.get('address') else '✗'} "
              f"Web: {'✓' if metadata.get('website') else '✗'} "
              f"GPS: {'✓' if geocoded_here else '✗'}]",
              end='\r')

        time.sleep(RATE_LIMIT)

    print()  # Clear progress line
    print(f"\nEnrichment complete!")
    print(f"  Address:  {stats['with_address']}/{stats['total']} ({_pct(stats['with_address'], stats['total']):.1f}%)")
    print(f"  Phone:    {stats['with_phone']}/{stats['total']} ({_pct(stats['with_phone'], stats['total']):.1f}%)")
    print(f"  Email:    {stats['with_email']}/{stats['total']} ({_pct(stats['with_email'], stats['total']):.1f}%)")
    print(f"  Website:  {stats['with_website']}/{stats['total']} ({_pct(stats['with_website'], stats['total']):.1f}%)")
    print(f"  Geocoded: {stats['geocoded']}/{stats['total']} ({_pct(stats['geocoded'], stats['total']):.1f}%)")
    print(f"  Failed:   {stats['failed']}/{stats['total']}")

    # Save enriched dataset
    print(f"\nSaving to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)
    print(f"Saved {len(data):,} institutions")

    # Calculate overall GPS coverage across the whole dataset
    total_with_gps = sum(
        1 for inst in data
        if inst.get('locations')
        and any(loc.get('latitude') for loc in inst['locations'])
    )
    overall_gps = _pct(total_with_gps, len(data))
    print(f"\nOverall GPS coverage: {total_with_gps:,}/{len(data):,} ({overall_gps:.1f}%)")

    generate_report(stats, overall_gps, len(data))

    print("\n✅ Enrichment complete!")


def generate_report(stats: Dict, overall_gps: float, total_institutions: int):
    """Write the enrichment summary report to REPORT_FILE (markdown).

    Args:
        stats: Counters accumulated by :func:`enrich_aron_institutions`.
        overall_gps: Post-enrichment GPS coverage percentage for the dataset.
        total_institutions: Total record count in the enriched dataset.
    """
    print(f"\nGenerating report to {REPORT_FILE}...")

    total = stats['total']
    report = f"""# ARON Metadata Enrichment Report

**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Status**: ✅ COMPLETE

---

## Summary

Successfully enriched {total} ARON institutions with metadata from detail API.

### Enrichment Results

| Metric | Count | Percentage |
|--------|-------|------------|
| **Addresses** | {stats['with_address']} | {_pct(stats['with_address'], total):.1f}% |
| **Phone numbers** | {stats['with_phone']} | {_pct(stats['with_phone'], total):.1f}% |
| **Emails** | {stats['with_email']} | {_pct(stats['with_email'], total):.1f}% |
| **Websites** | {stats['with_website']} | {_pct(stats['with_website'], total):.1f}% |
| **Geocoded** | {stats['geocoded']} | {_pct(stats['geocoded'], total):.1f}% |
| **Failed** | {stats['failed']} | {_pct(stats['failed'], total):.1f}% |

---

## GPS Coverage Improvement

### Before Enrichment
- Czech unified: 76.2% GPS coverage
- ARON institutions: 0% GPS coverage

### After Enrichment
- ARON institutions: {_pct(stats['geocoded'], total):.1f}% GPS coverage
- Overall Czech dataset: **{overall_gps:.1f}%** GPS coverage

**Improvement**: +{overall_gps - 76.2:.1f} percentage points

---

## Metadata Completeness

### ARON Institutions

| Field | Before | After | Improvement |
|-------|--------|-------|-------------|
| Address | 0% | {_pct(stats['with_address'], total):.1f}% | +{_pct(stats['with_address'], total):.1f}pp |
| Contact (phone/email) | 0% | {_pct(stats['with_phone'] + stats['with_email'], total * 2):.1f}% | +{_pct(stats['with_phone'] + stats['with_email'], total * 2):.1f}pp |
| Website | 0% | {_pct(stats['with_website'], total):.1f}% | +{_pct(stats['with_website'], total):.1f}pp |
| GPS coordinates | 0% | {_pct(stats['geocoded'], total):.1f}% | +{_pct(stats['geocoded'], total):.1f}pp |

**Overall ARON completeness**: ~40% → ~{_pct(stats['with_address'] + stats['with_website'] + stats['geocoded'], total * 3):.0f}%

---

## Files Created

1. **`{OUTPUT_FILE}`** - Enriched Czech dataset ({total_institutions:,} institutions)
2. **`{REPORT_FILE}`** - This enrichment report

---

## Next Steps

### Priority 2 ✅ Task 4 COMPLETE
- [x] Enrich ARON metadata
- [x] Geocode ARON addresses
- [ ] Wikidata enrichment (Task 5 - next)
- [ ] ISIL code investigation (Task 6)

### Recommended Next: Task 5 - Wikidata Enrichment
- Query Wikidata for Czech institutions
- Fuzzy match by name + location
- Add Q-numbers for GHCID collision resolution
- Estimated time: 1-2 hours

---

**Report generated**: {datetime.now().isoformat()}
**Script**: `scripts/scrapers/enrich_aron_metadata.py`
"""

    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"Report saved: {REPORT_FILE}")


if __name__ == "__main__":
    enrich_aron_institutions()