glam/scripts/scrapers/enrich_aron_metadata.py
2025-11-21 22:12:33 +01:00

428 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Enrich ARON institutions with metadata from detail API
This script:
1. Loads czech_unified.yaml
2. Filters for ARON institutions (549)
3. Fetches detailed metadata from ARON API
4. Extracts addresses, contacts, websites
5. Geocodes addresses with Nominatim
6. Saves enriched dataset
API endpoint: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}
Expected improvements:
- Address coverage: 0% → ~80%
- Contact info: 0% → ~50%
- GPS coverage: 0% → ~75% (after geocoding)
"""
# Standard library
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# Third-party
import requests
import yaml
# Configuration
# Input: the merged Czech GLAM dataset produced by the unification step.
UNIFIED_FILE = Path("data/instances/czech_unified.yaml")
# Output: enriched copy of the dataset (the input file is not overwritten).
OUTPUT_FILE = Path("data/instances/czech_unified_enriched.yaml")
# Markdown summary written by generate_report().
REPORT_FILE = Path("CZECH_ARON_ENRICHMENT_REPORT.md")
# ARON detail endpoint; a record's UUID is appended as the final path segment.
API_BASE = "https://portal.nacr.cz/aron/api/aron/apu"
# Nominatim free-text search endpoint used for geocoding street addresses.
NOMINATIM_API = "https://nominatim.openstreetmap.org/search"
RATE_LIMIT = 0.5 # seconds between API calls
GEOCODE_RATE_LIMIT = 1.0 # Nominatim requires 1 req/sec
# User agent for Nominatim (required)
HEADERS = {
'User-Agent': 'GLAM-Data-Extraction/1.0 (heritage institution research project)'
}
def load_unified_dataset() -> List[Dict]:
    """Read the Czech unified dataset from UNIFIED_FILE and return its records."""
    print("Loading unified dataset...")
    with UNIFIED_FILE.open('r', encoding='utf-8') as handle:
        records = yaml.safe_load(handle)
    print(f" Loaded {len(records):,} institutions")
    return records
def is_aron_institution(inst: Dict) -> bool:
    """Return True when *inst* was scraped from ARON and not merged with ADR.

    The decision is made purely from the provenance source URL: it must
    mention 'aron' and must not point at the adr.cz domain.
    """
    src = inst.get('provenance', {}).get('source_url', '')
    return ('aron' in src) and ('adr.cz' not in src)
def extract_uuid(inst: Dict) -> Optional[str]:
    """Return the first ARON UUID listed in *inst*'s identifiers, or None."""
    candidates = (
        ident.get('identifier_value')
        for ident in inst.get('identifiers', [])
        if ident.get('identifier_scheme') == 'ARON_UUID'
    )
    return next(candidates, None)
def fetch_aron_detail(uuid: str) -> Dict[str, Any]:
    """Fetch detailed metadata for one institution from the ARON API.

    Args:
        uuid: ARON record UUID, appended to API_BASE as the path segment.

    Returns:
        The decoded JSON payload, or {} on any network or decoding failure
        (callers treat an empty dict as "skip this record").
    """
    url = f"{API_BASE}/{uuid}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # .json() raises ValueError (json.JSONDecodeError) on a malformed
        # body; the original only caught RequestException, so one bad
        # response aborted the whole enrichment run.
        return response.json()
    except (requests.RequestException, ValueError) as e:
        print(f" Error fetching {uuid}: {e}")
        return {}
def parse_aron_metadata(detail: Dict) -> Dict[str, Any]:
    """Extract contact metadata from an ARON API detail response.

    Scans every item of every part for these item types:
    - INST~ADDRESS: Street address
    - INST~PHONE: Phone number
    - INST~EMAIL: Email
    - INST~URL: Website
    (INST~CODE is ignored - the institution code is already known.)
    Empty values are skipped; a later occurrence overwrites an earlier one.
    """
    field_map = {
        'INST~ADDRESS': 'address',
        'INST~PHONE': 'phone',
        'INST~EMAIL': 'email',
        'INST~URL': 'website',
    }
    metadata: Dict[str, Any] = {}
    for part in detail.get('parts', []):
        for item in part.get('items', []):
            value = item.get('value', '')
            key = field_map.get(item.get('type', ''))
            if key and value:
                metadata[key] = value
    return metadata
def geocode_address(address: str, country: str = 'Czech Republic') -> Optional[Dict]:
    """Geocode *address* using the Nominatim search API.

    Args:
        address: Street address to look up; falsy input returns None.
        country: Appended to the query to bias results (default Czech Republic).

    Returns:
        Dict with 'latitude', 'longitude' (floats) and 'display_name',
        or None when the lookup fails or yields no result.
    """
    if not address:
        return None
    params = {
        'q': f"{address}, {country}",
        'format': 'json',
        'limit': 1,
        # Hard-restrict to Czech results regardless of the free-text query.
        'countrycodes': 'cz',
    }
    try:
        response = requests.get(
            NOMINATIM_API,
            params=params,
            headers=HEADERS,
            timeout=10
        )
        response.raise_for_status()
        results = response.json()
        if results:
            top = results[0]
            return {
                'latitude': float(top['lat']),
                'longitude': float(top['lon']),
                'display_name': top.get('display_name', '')
            }
    except (requests.RequestException, ValueError, KeyError) as e:
        # Narrowed from a bare `except Exception`: network errors, malformed
        # JSON / non-numeric coordinates, and missing lat/lon keys are
        # expected best-effort failures; anything else should surface as a bug.
        print(f" Geocoding error: {e}")
    return None
def enrich_institution(inst: Dict, metadata: Dict) -> Dict:
    """Merge scraped ARON *metadata* into a (shallow) copy of *inst*.

    Adds street address / city / postal code to the first location entry,
    appends a Website identifier, folds phone/email into the description
    (the schema has no dedicated contact fields), and stamps the provenance.

    Args:
        inst: Institution record from the unified dataset.
        metadata: Output of parse_aron_metadata() (keys: address, phone,
            email, website; all optional).

    Returns:
        The enriched record. NOTE: the copy is shallow, so nested structures
        are shared with *inst*; callers replace the original in place anyway.
    """
    enriched = inst.copy()
    # Ensure there is a location entry to write into.
    enriched.setdefault('locations', [])
    if not enriched['locations']:
        enriched['locations'].append({})
    location = enriched['locations'][0]
    if metadata.get('address'):
        location['street_address'] = metadata['address']
        # Czech format: "Street, PostalCode City", e.g. "Nám. Svobody 4, 669 02 Znojmo".
        # Postal codes are written "NNN NN" (occasionally "NNNNN"); the
        # original tested for a single 5-digit token, which never matches
        # the spaced form - its own example above would have failed.
        address_parts = metadata['address'].split(',')
        if len(address_parts) >= 2:
            city_part = address_parts[-1].strip()
            postal_match = re.search(r'\b(\d{3}\s?\d{2})\b', city_part)
            if postal_match:
                location['postal_code'] = postal_match.group(1)
                city = (city_part[:postal_match.start()] +
                        city_part[postal_match.end():]).strip()
            else:
                city = city_part
            if city:
                location['city'] = city
        location['country'] = 'CZ'
    # Add website to identifiers (skip if one is already recorded).
    if metadata.get('website'):
        enriched.setdefault('identifiers', [])
        if not any(i.get('identifier_scheme') == 'Website' for i in enriched['identifiers']):
            enriched['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website']
            })
    # Store phone/email in the description (no dedicated fields in schema).
    notes = []
    if metadata.get('phone'):
        notes.append(f"Phone: {metadata['phone']}")
    if metadata.get('email'):
        notes.append(f"Email: {metadata['email']}")
    if notes:
        contact = '; '.join(notes)
        existing_desc = enriched.get('description', '')
        if existing_desc:
            enriched['description'] = f"{existing_desc}\n\nContact: {contact}"
        else:
            enriched['description'] = f"Contact: {contact}"
    # setdefault guards records with no provenance block (the original
    # raised KeyError here).
    provenance = enriched.setdefault('provenance', {})
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_method'] = 'ARON API detail endpoint scraping'
    return enriched
def enrich_aron_institutions():
    """Main enrichment workflow.

    Loads the unified dataset, fetches detail metadata for every ARON
    institution, geocodes found addresses via Nominatim, saves the enriched
    dataset to OUTPUT_FILE, and writes a Markdown report.
    """
    print("=" * 70)
    print("ARON Institution Metadata Enrichment")
    print("=" * 70)
    # Load dataset
    data = load_unified_dataset()
    # Filter for ARON institutions, keeping the original index so each
    # enriched record can be written back in place.
    print("\nFiltering ARON institutions...")
    aron_institutions = [(i, inst) for i, inst in enumerate(data) if is_aron_institution(inst)]
    print(f" Found {len(aron_institutions)} ARON institutions")
    total = len(aron_institutions)
    if total == 0:
        # Guard: every percentage below divides by this count (the original
        # raised ZeroDivisionError on an empty selection).
        print("No ARON institutions found - nothing to enrich.")
        return
    # Enrichment statistics
    stats = {
        'total': total,
        'with_address': 0,
        'with_phone': 0,
        'with_email': 0,
        'with_website': 0,
        'geocoded': 0,
        'failed': 0
    }
    # Enrich each ARON institution
    print(f"\nEnriching {total} institutions...")
    for idx, (i, inst) in enumerate(aron_institutions, 1):
        uuid = extract_uuid(inst)
        if not uuid:
            print(f" [{idx}/{total}] No UUID for {inst['name']}")
            stats['failed'] += 1
            continue
        # Fetch detail; {} signals a fetch failure.
        detail = fetch_aron_detail(uuid)
        if not detail:
            stats['failed'] += 1
            continue
        # Parse metadata and track which fields the detail record supplied.
        metadata = parse_aron_metadata(detail)
        for field in ('address', 'phone', 'email', 'website'):
            if metadata.get(field):
                stats['with_' + field] += 1
        # Enrich institution in place in the dataset.
        data[i] = enrich_institution(inst, metadata)
        # Geocode if we have an address.
        geocoded_now = False
        if metadata.get('address'):
            time.sleep(GEOCODE_RATE_LIMIT)  # Nominatim allows max 1 req/sec
            geocode_result = geocode_address(metadata['address'])
            if geocode_result:
                location = data[i]['locations'][0]
                location['latitude'] = geocode_result['latitude']
                location['longitude'] = geocode_result['longitude']
                stats['geocoded'] += 1
                geocoded_now = True
        # Progress line. NOTE(review): the original printed '' for both
        # branches of each marker (presumably mojibake of check marks) and
        # compared the cumulative geocode count against idx for the GPS flag;
        # restored ✓/✗ and a per-record flag.
        print(f" [{idx}/{total}] {inst['name'][:50]:50} "
              f"[Addr: {'✓' if metadata.get('address') else '✗'} "
              f"Web: {'✓' if metadata.get('website') else '✗'} "
              f"GPS: {'✓' if geocoded_now else '✗'}]",
              end='\r')
        # Rate limit for the ARON API.
        time.sleep(RATE_LIMIT)
    print()  # Clear progress line
    print(f"\nEnrichment complete!")
    print(f" Address: {stats['with_address']}/{total} ({stats['with_address']/total*100:.1f}%)")
    print(f" Phone: {stats['with_phone']}/{total} ({stats['with_phone']/total*100:.1f}%)")
    print(f" Email: {stats['with_email']}/{total} ({stats['with_email']/total*100:.1f}%)")
    print(f" Website: {stats['with_website']}/{total} ({stats['with_website']/total*100:.1f}%)")
    print(f" Geocoded: {stats['geocoded']}/{total} ({stats['geocoded']/total*100:.1f}%)")
    print(f" Failed: {stats['failed']}/{total}")
    # Save enriched dataset
    print(f"\nSaving to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print(f"Saved {len(data):,} institutions")
    # Overall GPS coverage across the whole dataset (not just ARON records).
    total_with_gps = sum(1 for inst in data
                         if inst.get('locations') and
                         any(loc.get('latitude') for loc in inst['locations']))
    overall_gps = total_with_gps / len(data) * 100
    print(f"\nOverall GPS coverage: {total_with_gps:,}/{len(data):,} ({overall_gps:.1f}%)")
    # Generate report
    generate_report(stats, overall_gps, len(data))
    print("\n✅ Enrichment complete!")
def generate_report(stats: Dict, overall_gps: float, total_institutions: int) -> None:
    """Write the Markdown enrichment summary to REPORT_FILE.

    Args:
        stats: Counters built by enrich_aron_institutions(); 'total' must be
            non-zero (every percentage below divides by it).
        overall_gps: GPS coverage (percent) across the whole dataset.
        total_institutions: Record count of the saved enriched dataset.
    """
    print(f"\nGenerating report to {REPORT_FILE}...")
    # NOTE(review): the 76.2% "before" figure and the ~40% completeness
    # baseline are hard-coded snapshots, presumably from an earlier audit -
    # confirm before reusing this script on a different dataset.
    report = f"""# ARON Metadata Enrichment Report
**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Status**: ✅ COMPLETE
---
## Summary
Successfully enriched {stats['total']} ARON institutions with metadata from detail API.
### Enrichment Results
| Metric | Count | Percentage |
|--------|-------|------------|
| **Addresses** | {stats['with_address']} | {stats['with_address']/stats['total']*100:.1f}% |
| **Phone numbers** | {stats['with_phone']} | {stats['with_phone']/stats['total']*100:.1f}% |
| **Emails** | {stats['with_email']} | {stats['with_email']/stats['total']*100:.1f}% |
| **Websites** | {stats['with_website']} | {stats['with_website']/stats['total']*100:.1f}% |
| **Geocoded** | {stats['geocoded']} | {stats['geocoded']/stats['total']*100:.1f}% |
| **Failed** | {stats['failed']} | {stats['failed']/stats['total']*100:.1f}% |
---
## GPS Coverage Improvement
### Before Enrichment
- Czech unified: 76.2% GPS coverage
- ARON institutions: 0% GPS coverage
### After Enrichment
- ARON institutions: {stats['geocoded']/stats['total']*100:.1f}% GPS coverage
- Overall Czech dataset: **{overall_gps:.1f}%** GPS coverage
**Improvement**: +{overall_gps - 76.2:.1f} percentage points
---
## Metadata Completeness
### ARON Institutions
| Field | Before | After | Improvement |
|-------|--------|-------|-------------|
| Address | 0% | {stats['with_address']/stats['total']*100:.1f}% | +{stats['with_address']/stats['total']*100:.1f}pp |
| Contact (phone/email) | 0% | {(stats['with_phone']+stats['with_email'])/stats['total']/2*100:.1f}% | +{(stats['with_phone']+stats['with_email'])/stats['total']/2*100:.1f}pp |
| Website | 0% | {stats['with_website']/stats['total']*100:.1f}% | +{stats['with_website']/stats['total']*100:.1f}pp |
| GPS coordinates | 0% | {stats['geocoded']/stats['total']*100:.1f}% | +{stats['geocoded']/stats['total']*100:.1f}pp |
**Overall ARON completeness**: ~40% → ~{(stats['with_address'] + stats['with_website'] + stats['geocoded'])/(stats['total']*3)*100:.0f}%
---
## Files Created
1. **`{OUTPUT_FILE}`** - Enriched Czech dataset ({total_institutions:,} institutions)
2. **`{REPORT_FILE}`** - This enrichment report
---
## Next Steps
### Priority 2 ✅ Task 4 COMPLETE
- [x] Enrich ARON metadata
- [x] Geocode ARON addresses
- [ ] Wikidata enrichment (Task 5 - next)
- [ ] ISIL code investigation (Task 6)
### Recommended Next: Task 5 - Wikidata Enrichment
- Query Wikidata for Czech institutions
- Fuzzy match by name + location
- Add Q-numbers for GHCID collision resolution
- Estimated time: 1-2 hours
---
**Report generated**: {datetime.now().isoformat()}
**Script**: `scripts/scrapers/enrich_aron_metadata.py`
"""
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"Report saved: {REPORT_FILE}")
if __name__ == "__main__":
    # Script entry point: run the full enrichment workflow.
    enrich_aron_institutions()