#!/usr/bin/env python3
|
|
"""
|
|
Enrich ARON institutions with metadata from detail API
|
|
|
|
This script:
|
|
1. Loads czech_unified.yaml
|
|
2. Filters for ARON institutions (549)
|
|
3. Fetches detailed metadata from ARON API
|
|
4. Extracts addresses, contacts, websites
|
|
5. Geocodes addresses with Nominatim
|
|
6. Saves enriched dataset
|
|
|
|
API endpoint: GET https://portal.nacr.cz/aron/api/aron/apu/{uuid}
|
|
|
|
Expected improvements:
|
|
- Address coverage: 0% → ~80%
|
|
- Contact info: 0% → ~50%
|
|
- GPS coverage: 0% → ~75% (after geocoding)
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Any, Optional
|
|
|
|
# Configuration
UNIFIED_FILE = Path("data/instances/czech_unified.yaml")            # input dataset
OUTPUT_FILE = Path("data/instances/czech_unified_enriched.yaml")    # enriched output
REPORT_FILE = Path("CZECH_ARON_ENRICHMENT_REPORT.md")               # markdown run report

# ARON detail endpoint (GET {API_BASE}/{uuid}) and Nominatim geocoder
API_BASE = "https://portal.nacr.cz/aron/api/aron/apu"
NOMINATIM_API = "https://nominatim.openstreetmap.org/search"

RATE_LIMIT = 0.5  # seconds between API calls
GEOCODE_RATE_LIMIT = 1.0  # Nominatim requires 1 req/sec

# User agent for Nominatim (required by its usage policy)
HEADERS = {
    'User-Agent': 'GLAM-Data-Extraction/1.0 (heritage institution research project)'
}
|
|
|
|
|
|
def load_unified_dataset() -> List[Dict]:
    """Read and parse the Czech unified dataset from UNIFIED_FILE.

    Returns:
        The parsed YAML content (a list of institution records).
    """
    print("Loading unified dataset...")
    with UNIFIED_FILE.open('r', encoding='utf-8') as handle:
        institutions = yaml.safe_load(handle)
    print(f"  Loaded {len(institutions):,} institutions")
    return institutions
|
|
|
|
|
|
def is_aron_institution(inst: Dict) -> bool:
    """Return True for records sourced from ARON that were not merged with ADR.

    The test is purely on the provenance source URL: it must mention 'aron'
    and must not point at the adr.cz domain.
    """
    url = inst.get('provenance', {}).get('source_url', '')
    if 'adr.cz' in url:
        return False
    return 'aron' in url
|
|
|
|
|
|
def extract_uuid(inst: Dict) -> Optional[str]:
    """Return the institution's ARON UUID, or None if it has no ARON_UUID identifier.

    Picks the value of the first identifier whose scheme is 'ARON_UUID'.
    """
    return next(
        (ident.get('identifier_value')
         for ident in inst.get('identifiers', [])
         if ident.get('identifier_scheme') == 'ARON_UUID'),
        None,
    )
|
|
|
|
|
|
def fetch_aron_detail(uuid: str) -> Dict[str, Any]:
    """Fetch the detail record for one institution from the ARON API.

    Args:
        uuid: the institution's ARON UUID.

    Returns:
        The decoded JSON payload, or an empty dict if the request or
        JSON decoding fails (the error is printed, not raised).
    """
    endpoint = f"{API_BASE}/{uuid}"
    try:
        resp = requests.get(endpoint, timeout=30)
        resp.raise_for_status()
        return resp.json()
    except requests.RequestException as exc:
        print(f"  Error fetching {uuid}: {exc}")
        return {}
|
|
|
|
|
|
def parse_aron_metadata(detail: Dict) -> Dict[str, Any]:
    """Extract contact metadata from an ARON API detail response.

    Scans every item of every part and maps:
      - INST~ADDRESS -> 'address'
      - INST~PHONE   -> 'phone'
      - INST~EMAIL   -> 'email'
      - INST~URL     -> 'website'

    Items with an empty value (and unknown item types) are ignored; if a
    type occurs more than once, the last non-empty value wins.
    """
    field_map = {
        'INST~ADDRESS': 'address',
        'INST~PHONE': 'phone',
        'INST~EMAIL': 'email',
        'INST~URL': 'website',
    }

    parsed: Dict[str, Any] = {}
    for part in detail.get('parts', []):
        for item in part.get('items', []):
            key = field_map.get(item.get('type', ''))
            value = item.get('value', '')
            if key and value:
                parsed[key] = value
    return parsed
|
|
|
|
|
|
def geocode_address(address: str, country: str = 'Czech Republic') -> Optional[Dict]:
    """Geocode a street address via the Nominatim API.

    Args:
        address: street address to look up; falsy values short-circuit.
        country: country name appended to the query (results are also
            restricted to the 'cz' country code).

    Returns:
        A dict with 'latitude', 'longitude' and 'display_name' for the top
        hit, or None when the address is empty, no hit is found, or the
        request fails (failures are printed, never raised).
    """
    if not address:
        return None

    search_params = {
        'q': f"{address}, {country}",
        'format': 'json',
        'limit': 1,
        'countrycodes': 'cz'
    }

    try:
        resp = requests.get(
            NOMINATIM_API,
            params=search_params,
            headers=HEADERS,
            timeout=10
        )
        resp.raise_for_status()

        hits = resp.json()
        if hits:
            top = hits[0]
            return {
                'latitude': float(top['lat']),
                'longitude': float(top['lon']),
                'display_name': top.get('display_name', '')
            }
    except Exception as e:
        # Best-effort: a failed geocode is non-fatal for the pipeline.
        print(f"  Geocoding error: {e}")

    return None
|
|
|
|
|
|
def enrich_institution(inst: Dict, metadata: Dict) -> Dict:
    """Merge enriched ARON metadata into an institution record.

    Args:
        inst: institution record from the unified dataset.
        metadata: parsed ARON metadata with optional keys 'address',
            'phone', 'email', 'website'.

    Returns:
        A shallow copy of ``inst`` with location fields, a Website
        identifier, and contact notes merged in.  NOTE: nested structures
        ('locations', 'identifiers', 'provenance') are shared with the
        original record, so those are mutated in place too; callers replace
        the original with the returned copy, so this is harmless here.
    """
    enriched = inst.copy()

    # Ensure there is a location entry to fill in
    if 'locations' not in enriched:
        enriched['locations'] = []
    if len(enriched['locations']) == 0:
        enriched['locations'].append({})
    location = enriched['locations'][0]

    # Add address
    if metadata.get('address'):
        location['street_address'] = metadata['address']

        # Czech addresses: "Street, PostalCode City", where the 5-digit
        # postal code is usually written as two tokens ("669 02") but
        # occasionally as one ("66902").
        # Example: "Nám. Svobody 4, 669 02 Znojmo"
        address_parts = metadata['address'].split(',')
        if len(address_parts) >= 2:
            tokens = address_parts[-1].strip().split()

            # Bug fix: the previous filter only matched a single 5-digit
            # token, so the common "NNN NN" form was neither recognised as
            # a postal code nor stripped from the city name.  Treat all
            # leading all-digit tokens as the postal code; the rest is the
            # city.
            digit_tokens = 0
            while digit_tokens < len(tokens) and tokens[digit_tokens].isdigit():
                digit_tokens += 1

            postal_code = ' '.join(tokens[:digit_tokens])
            city = ' '.join(tokens[digit_tokens:])

            if city:
                location['city'] = city
            if postal_code:
                location['postal_code'] = postal_code

        location['country'] = 'CZ'

    # Add website to identifiers (deduplicated by scheme)
    if metadata.get('website'):
        if 'identifiers' not in enriched:
            enriched['identifiers'] = []

        if not any(i.get('identifier_scheme') == 'Website' for i in enriched['identifiers']):
            enriched['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': metadata['website'],
                'identifier_url': metadata['website']
            })

    # Store phone/email in the description (no dedicated fields in schema)
    notes = []
    if metadata.get('phone'):
        notes.append(f"Phone: {metadata['phone']}")
    if metadata.get('email'):
        notes.append(f"Email: {metadata['email']}")

    if notes:
        existing_desc = enriched.get('description', '')
        if existing_desc:
            enriched['description'] = f"{existing_desc}\n\nContact: {'; '.join(notes)}"
        else:
            enriched['description'] = f"Contact: {'; '.join(notes)}"

    # Update provenance.  Robustness fix: tolerate records without a
    # provenance block instead of raising KeyError.
    provenance = enriched.setdefault('provenance', {})
    provenance['enrichment_date'] = datetime.now(timezone.utc).isoformat()
    provenance['enrichment_method'] = 'ARON API detail endpoint scraping'

    return enriched
|
|
|
|
|
|
def enrich_aron_institutions():
    """Run the full ARON enrichment workflow.

    Loads the unified dataset, enriches every ARON-sourced institution with
    address/contact metadata from the ARON detail API, geocodes addresses
    via Nominatim, saves the enriched dataset to OUTPUT_FILE, and writes a
    markdown report to REPORT_FILE.
    """
    print("=" * 70)
    print("ARON Institution Metadata Enrichment")
    print("=" * 70)

    # Load dataset
    data = load_unified_dataset()

    # Filter for ARON institutions, keeping original indices so enriched
    # records can be written back in place.
    print("\nFiltering ARON institutions...")
    aron_institutions = [(i, inst) for i, inst in enumerate(data) if is_aron_institution(inst)]
    print(f"  Found {len(aron_institutions)} ARON institutions")

    # Bug fix: guard against an empty match set — the percentage maths in
    # the summary (and in generate_report) would raise ZeroDivisionError.
    if not aron_institutions:
        print("No ARON institutions found - nothing to enrich.")
        return

    # Enrichment statistics
    stats = {
        'total': len(aron_institutions),
        'with_address': 0,
        'with_phone': 0,
        'with_email': 0,
        'with_website': 0,
        'geocoded': 0,
        'failed': 0
    }

    # Enrich each ARON institution
    print(f"\nEnriching {len(aron_institutions)} institutions...")

    for idx, (i, inst) in enumerate(aron_institutions, 1):
        uuid = extract_uuid(inst)
        if not uuid:
            print(f"  [{idx}/{len(aron_institutions)}] No UUID for {inst['name']}")
            stats['failed'] += 1
            continue

        # Fetch detail record; an empty dict means the request failed
        detail = fetch_aron_detail(uuid)
        if not detail:
            stats['failed'] += 1
            continue

        # Parse metadata
        metadata = parse_aron_metadata(detail)

        # Track what we found
        if metadata.get('address'):
            stats['with_address'] += 1
        if metadata.get('phone'):
            stats['with_phone'] += 1
        if metadata.get('email'):
            stats['with_email'] += 1
        if metadata.get('website'):
            stats['with_website'] += 1

        # Enrich institution (written back into the dataset in place)
        data[i] = enrich_institution(inst, metadata)

        # Geocode if we have an address
        got_gps = False
        if metadata.get('address'):
            time.sleep(GEOCODE_RATE_LIMIT)  # Nominatim rate limit (1 req/sec)

            geocode_result = geocode_address(metadata['address'])
            if geocode_result:
                location = data[i]['locations'][0]
                location['latitude'] = geocode_result['latitude']
                location['longitude'] = geocode_result['longitude']
                stats['geocoded'] += 1
                got_gps = True

        # Progress line.  Bug fix: the GPS flag previously compared the
        # cumulative geocoded count against idx, so it showed ✓ only while
        # *every* record so far had been geocoded; it now reflects whether
        # *this* record was geocoded.
        print(f"  [{idx}/{len(aron_institutions)}] {inst['name'][:50]:50} "
              f"[Addr: {'✓' if metadata.get('address') else '✗'} "
              f"Web: {'✓' if metadata.get('website') else '✗'} "
              f"GPS: {'✓' if got_gps else '✗'}]",
              end='\r')

        # Rate limit for the ARON API
        time.sleep(RATE_LIMIT)

    print()  # Clear progress line
    print("\nEnrichment complete!")
    print(f"  Address: {stats['with_address']}/{stats['total']} ({stats['with_address']/stats['total']*100:.1f}%)")
    print(f"  Phone: {stats['with_phone']}/{stats['total']} ({stats['with_phone']/stats['total']*100:.1f}%)")
    print(f"  Email: {stats['with_email']}/{stats['total']} ({stats['with_email']/stats['total']*100:.1f}%)")
    print(f"  Website: {stats['with_website']}/{stats['total']} ({stats['with_website']/stats['total']*100:.1f}%)")
    print(f"  Geocoded: {stats['geocoded']}/{stats['total']} ({stats['geocoded']/stats['total']*100:.1f}%)")
    print(f"  Failed: {stats['failed']}/{stats['total']}")

    # Save enriched dataset
    print(f"\nSaving to {OUTPUT_FILE}...")
    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(f"Saved {len(data):,} institutions")

    # Calculate overall GPS coverage across the whole dataset
    total_with_gps = sum(1 for inst in data
                         if inst.get('locations') and
                         any(loc.get('latitude') for loc in inst['locations']))
    overall_gps = total_with_gps / len(data) * 100

    print(f"\nOverall GPS coverage: {total_with_gps:,}/{len(data):,} ({overall_gps:.1f}%)")

    # Generate report
    generate_report(stats, overall_gps, len(data))

    print("\n✅ Enrichment complete!")
|
|
|
|
|
|
def generate_report(stats: Dict, overall_gps: float, total_institutions: int):
    """Write a markdown summary of the enrichment run to REPORT_FILE.

    Args:
        stats: counters from the enrichment loop ('total', 'with_address',
            'with_phone', 'with_email', 'with_website', 'geocoded',
            'failed').  NOTE(review): assumes stats['total'] > 0 — the
            percentage maths below raises ZeroDivisionError otherwise;
            confirm callers guarantee a non-empty run.
        overall_gps: overall GPS coverage percentage for the full dataset.
        total_institutions: total number of institutions in the dataset.
    """
    print(f"\nGenerating report to {REPORT_FILE}...")

    # NOTE(review): datetime.now() here is naive local time, while the rest
    # of the script uses timezone-aware UTC — presumably intentional for a
    # human-readable report timestamp; confirm.
    report = f"""# ARON Metadata Enrichment Report

**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Status**: ✅ COMPLETE

---

## Summary

Successfully enriched {stats['total']} ARON institutions with metadata from detail API.

### Enrichment Results

| Metric | Count | Percentage |
|--------|-------|------------|
| **Addresses** | {stats['with_address']} | {stats['with_address']/stats['total']*100:.1f}% |
| **Phone numbers** | {stats['with_phone']} | {stats['with_phone']/stats['total']*100:.1f}% |
| **Emails** | {stats['with_email']} | {stats['with_email']/stats['total']*100:.1f}% |
| **Websites** | {stats['with_website']} | {stats['with_website']/stats['total']*100:.1f}% |
| **Geocoded** | {stats['geocoded']} | {stats['geocoded']/stats['total']*100:.1f}% |
| **Failed** | {stats['failed']} | {stats['failed']/stats['total']*100:.1f}% |

---

## GPS Coverage Improvement

### Before Enrichment
- Czech unified: 76.2% GPS coverage
- ARON institutions: 0% GPS coverage

### After Enrichment
- ARON institutions: {stats['geocoded']/stats['total']*100:.1f}% GPS coverage
- Overall Czech dataset: **{overall_gps:.1f}%** GPS coverage

**Improvement**: +{overall_gps - 76.2:.1f} percentage points

---

## Metadata Completeness

### ARON Institutions

| Field | Before | After | Improvement |
|-------|--------|-------|-------------|
| Address | 0% | {stats['with_address']/stats['total']*100:.1f}% | +{stats['with_address']/stats['total']*100:.1f}pp |
| Contact (phone/email) | 0% | {(stats['with_phone']+stats['with_email'])/stats['total']/2*100:.1f}% | +{(stats['with_phone']+stats['with_email'])/stats['total']/2*100:.1f}pp |
| Website | 0% | {stats['with_website']/stats['total']*100:.1f}% | +{stats['with_website']/stats['total']*100:.1f}pp |
| GPS coordinates | 0% | {stats['geocoded']/stats['total']*100:.1f}% | +{stats['geocoded']/stats['total']*100:.1f}pp |

**Overall ARON completeness**: ~40% → ~{(stats['with_address'] + stats['with_website'] + stats['geocoded'])/(stats['total']*3)*100:.0f}%

---

## Files Created

1. **`{OUTPUT_FILE}`** - Enriched Czech dataset ({total_institutions:,} institutions)
2. **`{REPORT_FILE}`** - This enrichment report

---

## Next Steps

### Priority 2 ✅ Task 4 COMPLETE
- [x] Enrich ARON metadata
- [x] Geocode ARON addresses
- [ ] Wikidata enrichment (Task 5 - next)
- [ ] ISIL code investigation (Task 6)

### Recommended Next: Task 5 - Wikidata Enrichment
- Query Wikidata for Czech institutions
- Fuzzy match by name + location
- Add Q-numbers for GHCID collision resolution
- Estimated time: 1-2 hours

---

**Report generated**: {datetime.now().isoformat()}
**Script**: `scripts/scrapers/enrich_aron_metadata.py`
"""

    # UTF-8 so the ✓/✅ symbols in the template survive on any platform
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"Report saved: {REPORT_FILE}")
|
|
|
|
|
|
# Script entry point: run the full enrichment workflow.
if __name__ == "__main__":
    enrich_aron_institutions()
|