#!/usr/bin/env python3
"""
Normalize Custodian Files to Canonical Schema v1.0.0

This script adds a canonical `location` object to all custodian files,
extracting coordinates and address data from various enrichment sources
using a priority-based cascade.

The canonical `location` object is THE authoritative source for coordinates
and location data. After normalization, downstream tools should ONLY use
`location.latitude` and `location.longitude`.

Coordinate Source Priority (highest to lowest):
1. manual_location_override.coordinates - Human-verified overrides
2. google_maps_enrichment.coordinates - Google Maps API
3. wikidata_enrichment.wikidata_coordinates - Wikidata
4. wikidata_enrichment.coordinates - Legacy Wikidata format
5. ghcid.location_resolution.source_coordinates - GHCID resolution
6. ghcid.location_resolution (direct lat/lon) - GHCID resolution alt
7. original_entry.locations[0] - CH-Annotator source
8. locations[0] (root level) - Root level locations array
9. location (root level) - Existing location object

Usage:
    python normalize_custodian_files.py --dry-run    # Preview changes
    python normalize_custodian_files.py              # Apply changes
    python normalize_custodian_files.py --file NL-ZH-SCH-A-GAS.yaml  # Single file
"""

import argparse
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Optional

from ruamel.yaml import YAML

# Use ruamel.yaml for round-trip preservation (keeps comments, anchors, formatting)
yaml = YAML()
yaml.preserve_quotes = True
yaml.width = 120

# Configuration
CUSTODIAN_DIR = Path("/Users/kempersc/apps/glam/data/custodian")

# Coordinate extraction priority (highest to lowest).
# Each entry is (dotted data path, source label used in provenance/stats).
COORDINATE_SOURCES = [
    ("manual_location_override.coordinates", "MANUAL_OVERRIDE"),
    ("google_maps_enrichment.coordinates", "GOOGLE_MAPS"),
    ("wikidata_enrichment.wikidata_coordinates", "WIKIDATA"),
    ("wikidata_enrichment.coordinates", "WIKIDATA_LEGACY"),
    ("ghcid.location_resolution.source_coordinates", "GHCID_RESOLUTION"),
    ("ghcid.location_resolution", "GHCID_RESOLUTION_DIRECT"),
    ("original_entry.locations[0]", "ORIGINAL_ENTRY"),
    ("locations[0]", "ROOT_LOCATIONS"),
    ("location", "EXISTING_LOCATION"),
]


def get_nested(data: dict, path: str) -> Optional[Any]:
    """Get a nested value from a dict using dot notation with array support.

    Only `[0]` (first element) indexing is supported; any missing key,
    non-dict intermediate, or empty list yields None.
    """
    parts = path.replace("[0]", ".0").split(".")
    current: Any = data
    for part in parts:
        if current is None:
            return None
        if part == "0":
            # `[0]` was rewritten to `.0` above: take the first list element.
            if isinstance(current, list) and len(current) > 0:
                current = current[0]
            else:
                return None
        elif isinstance(current, dict):
            current = current.get(part)
        else:
            return None
    return current


def _first_not_none(*values: Any) -> Optional[Any]:
    """Return the first argument that is not None.

    Unlike a chained `or`, this treats falsy-but-valid values such as
    0 / 0.0 (equator, prime meridian) as present.
    """
    for value in values:
        if value is not None:
            return value
    return None


def extract_coordinates(data: dict) -> tuple[Optional[float], Optional[float], Optional[dict]]:
    """
    Extract coordinates from data using priority-based cascade.

    Returns:
        Tuple of (latitude, longitude, provenance_dict) or (None, None, None)
        if not found. provenance_dict includes:
        - source_type: e.g., "google_maps_api", "wikidata_api", "manual_override"
        - source_path: The data path where coordinates were found
        - original_timestamp: When the source data was fetched (if available)
        - api_endpoint: The API endpoint used (if applicable)
        - entity_id: Wikidata ID, Google place_id, etc. (if applicable)
    """
    for path, source_name in COORDINATE_SOURCES:
        value = get_nested(data, path)
        if value is None:
            continue

        lat = None
        lon = None
        if isinstance(value, dict):
            # Check for latitude/longitude keys. BUGFIX: use a None-aware
            # pick instead of `or`-chaining so a coordinate of exactly 0
            # is not misread as "missing".
            lat = _first_not_none(value.get("latitude"), value.get("lat"))
            lon = _first_not_none(value.get("longitude"), value.get("lon"), value.get("lng"))

        if lat is not None and lon is not None:
            try:
                lat_float = float(lat)
                lon_float = float(lon)
            except (ValueError, TypeError):
                # Unparseable value at this priority level: fall through
                # to the next source rather than failing the file.
                continue
            # Validate coordinates are in valid WGS84 ranges.
            if -90 <= lat_float <= 90 and -180 <= lon_float <= 180:
                # Build rich provenance based on the winning source.
                provenance = build_coordinate_provenance(data, path, source_name)
                return lat_float, lon_float, provenance

    return None, None, None


def build_coordinate_provenance(data: dict, path: str, source_name: str) -> dict:
    """
    Build rich provenance metadata for coordinate extraction.

    Returns a dictionary with:
    - source_type: Normalized source type identifier
    - source_path: The data path where coordinates were found
    - original_timestamp: When the source data was originally fetched
    - api_endpoint: API endpoint (if applicable)
    - entity_id: External identifier (Wikidata Q-ID, Google place_id, etc.)
    """
    provenance: dict = {
        "source_type": source_name,
        "source_path": path,
    }

    if source_name == "GOOGLE_MAPS":
        gm = data.get("google_maps_enrichment", {})
        # Get timestamp and place_id directly from enrichment block first.
        provenance["original_timestamp"] = gm.get("fetch_timestamp")
        provenance["entity_id"] = gm.get("place_id")
        # Try to get api_endpoint from provenance.sources.google_maps.
        gm_sources = get_nested(data, "provenance.sources.google_maps")
        if isinstance(gm_sources, list) and len(gm_sources) > 0:
            gm_source = gm_sources[0]
            provenance["api_endpoint"] = gm_source.get("api_endpoint")
            # Backfill if not found in enrichment block.
            if not provenance.get("original_timestamp"):
                provenance["original_timestamp"] = gm_source.get("fetch_timestamp")
            if not provenance.get("entity_id"):
                provenance["entity_id"] = gm_source.get("place_id")

    elif source_name in ("WIKIDATA", "WIKIDATA_LEGACY"):
        wd = data.get("wikidata_enrichment", {})
        # Try api_metadata first (newer format).
        api_meta = wd.get("api_metadata", {})
        if api_meta:
            provenance["original_timestamp"] = api_meta.get("fetch_timestamp")
            provenance["api_endpoint"] = api_meta.get("api_endpoint")
        # Get entity_id from wikidata_entity_id or wikidata_id.
        provenance["entity_id"] = (
            wd.get("wikidata_entity_id")
            or wd.get("wikidata_id")
            or get_nested(data, "original_entry.wikidata_id")
        )
        # Fallback to provenance.sources.wikidata if api_metadata not found.
        if not provenance.get("original_timestamp"):
            wd_sources = get_nested(data, "provenance.sources.wikidata")
            if isinstance(wd_sources, list) and len(wd_sources) > 0:
                wd_source = wd_sources[0]
                provenance["original_timestamp"] = wd_source.get("fetch_timestamp")
                provenance["api_endpoint"] = wd_source.get("api_endpoint")
                if not provenance.get("entity_id"):
                    provenance["entity_id"] = wd_source.get("entity_id")

    elif source_name == "MANUAL_OVERRIDE":
        override = data.get("manual_location_override", {})
        provenance["original_timestamp"] = override.get("override_timestamp")
        provenance["override_reason"] = override.get("reason")
        provenance["override_by"] = override.get("verified_by")

    elif source_name in ("GHCID_RESOLUTION", "GHCID_RESOLUTION_DIRECT"):
        ghcid_loc = get_nested(data, "ghcid.location_resolution")
        if isinstance(ghcid_loc, dict):
            provenance["original_timestamp"] = ghcid_loc.get("resolution_timestamp")
            provenance["entity_id"] = ghcid_loc.get("geonames_id")
            provenance["resolution_method"] = ghcid_loc.get("method")

    elif source_name == "ORIGINAL_ENTRY":
        # Try to get source info from provenance.sources.original_entry.
        oe_sources = get_nested(data, "provenance.sources.original_entry")
        if isinstance(oe_sources, list) and len(oe_sources) > 0:
            oe_source = oe_sources[0]
            provenance["source_type"] = oe_source.get("source_type", source_name)
            provenance["data_tier"] = oe_source.get("data_tier")

    # Clean up None values.
    return {k: v for k, v in provenance.items() if v is not None}


def extract_address_components(data: dict) -> dict:
    """
    Extract address components from various sources.

    Returns:
        Dictionary with address components (city, region, country, etc.)
    """
    address: dict = {}

    # Try Google Maps first (most detailed).
    gm = data.get("google_maps_enrichment", {})
    if gm:
        address["formatted_address"] = gm.get("formatted_address")
        address["street_address"] = gm.get("short_address")
        # Extract from address_components (Google Geocoding API shape:
        # each component has `types`, `long_name`, `short_name`).
        components = gm.get("address_components", [])
        for comp in components:
            types = comp.get("types", [])
            if "locality" in types:
                address["city"] = comp.get("long_name")
            elif "administrative_area_level_1" in types:
                address["region"] = comp.get("long_name")
                address["region_code"] = comp.get("short_name")
            elif "country" in types:
                address["country"] = comp.get("short_name")
            elif "postal_code" in types:
                address["postal_code"] = comp.get("long_name")

    # Fallback to GHCID location resolution.
    ghcid_loc = get_nested(data, "ghcid.location_resolution")
    if ghcid_loc and isinstance(ghcid_loc, dict):
        if not address.get("city"):
            address["city"] = (
                ghcid_loc.get("city_name")
                or ghcid_loc.get("geonames_name")
                or ghcid_loc.get("google_maps_locality")
            )
        if not address.get("region"):
            address["region"] = ghcid_loc.get("region_name")
        if not address.get("region_code"):
            address["region_code"] = ghcid_loc.get("region_code")
        if not address.get("country"):
            address["country"] = ghcid_loc.get("country_code")
        if not address.get("geonames_id"):
            address["geonames_id"] = ghcid_loc.get("geonames_id")
        if not address.get("geonames_name"):
            address["geonames_name"] = ghcid_loc.get("geonames_name")
        if not address.get("feature_code"):
            address["feature_code"] = ghcid_loc.get("feature_code")

    # Fallback to original_entry.locations.
    orig_loc = get_nested(data, "original_entry.locations[0]")
    if orig_loc and isinstance(orig_loc, dict):
        if not address.get("city"):
            address["city"] = orig_loc.get("city")
        if not address.get("region"):
            address["region"] = orig_loc.get("region")
        if not address.get("country"):
            address["country"] = orig_loc.get("country")
        if not address.get("postal_code"):
            address["postal_code"] = orig_loc.get("postal_code")
        if not address.get("street_address"):
            address["street_address"] = orig_loc.get("street_address")

    # Fallback to root-level locations.
    root_loc = get_nested(data, "locations[0]")
    if root_loc and isinstance(root_loc, dict):
        if not address.get("city"):
            address["city"] = root_loc.get("city")
        if not address.get("region"):
            address["region"] = root_loc.get("region")
        if not address.get("country"):
            address["country"] = root_loc.get("country")

    # Fallback to original_entry plaatsnaam (NDE files).
    orig_entry = data.get("original_entry", {})
    if not address.get("city"):
        address["city"] = orig_entry.get("plaatsnaam_bezoekadres")

    # Clean up None values.
    return {k: v for k, v in address.items() if v is not None}


def create_canonical_location(data: dict) -> Optional[dict]:
    """
    Create a canonical location object from all available sources.

    Returns:
        Canonical location dictionary or None if no coordinates found.

    The location object includes:
    - latitude/longitude: The coordinates
    - coordinate_provenance: Rich provenance for coordinates
      (source, timestamp, entity_id, etc.)
    - city, region, country: Address components
    - geonames_id, geonames_name: GeoNames reference
    - normalization_timestamp: When this canonical location was created
    """
    lat, lon, coord_provenance = extract_coordinates(data)
    address = extract_address_components(data)

    # Build canonical location object.
    location: dict = {}

    # Coordinates (if available).
    if lat is not None and lon is not None:
        location["latitude"] = lat
        location["longitude"] = lon
        # Rich provenance for coordinates.
        if coord_provenance:
            location["coordinate_provenance"] = coord_provenance

    # Address components.
    if address.get("city"):
        location["city"] = address["city"]
    if address.get("region"):
        location["region"] = address["region"]
    if address.get("region_code"):
        location["region_code"] = address["region_code"]
    if address.get("country"):
        location["country"] = address["country"]
    if address.get("postal_code"):
        location["postal_code"] = address["postal_code"]
    if address.get("street_address"):
        location["street_address"] = address["street_address"]
    if address.get("formatted_address"):
        location["formatted_address"] = address["formatted_address"]

    # GeoNames reference.
    if address.get("geonames_id"):
        location["geonames_id"] = address["geonames_id"]
    if address.get("geonames_name"):
        location["geonames_name"] = address["geonames_name"]
    if address.get("feature_code"):
        location["feature_code"] = address["feature_code"]

    # Normalization timestamp (when this canonical location was created).
    if location:
        location["normalization_timestamp"] = datetime.now(timezone.utc).isoformat()

    return location if location else None


def normalize_file(filepath: Path, dry_run: bool = False) -> dict:
    """
    Normalize a single custodian file.

    Uses ruamel.yaml for round-trip preservation - preserves comments,
    anchors, aliases, and formatting from the original file.

    Returns:
        Dictionary with normalization results:
        - success: bool
        - changed: bool
        - has_coordinates: bool
        - coordinate_source: str or None (source_type from provenance)
        - error: str or None
    """
    result = {
        "success": False,
        "changed": False,
        "has_coordinates": False,
        "coordinate_source": None,
        "error": None,
    }

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.load(f)

        if not isinstance(data, dict):
            result["error"] = "File is not a valid YAML dictionary"
            return result

        # Create canonical location.
        canonical_location = create_canonical_location(data)

        if canonical_location:
            result["has_coordinates"] = "latitude" in canonical_location
            # Extract source_type from coordinate_provenance for stats.
            coord_prov = canonical_location.get("coordinate_provenance", {})
            result["coordinate_source"] = coord_prov.get("source_type")

            # Check if location already exists and is identical (compare key values).
            existing_location = data.get("location")
            if existing_location:
                existing_lat = existing_location.get("latitude")
                existing_lon = existing_location.get("longitude")
                new_lat = canonical_location.get("latitude")
                new_lon = canonical_location.get("longitude")
                if existing_lat == new_lat and existing_lon == new_lon:
                    # Coordinates match; if provenance is already present the
                    # file is fully normalized - report success, no change.
                    if existing_location.get("coordinate_provenance"):
                        result["success"] = True
                        result["changed"] = False
                        return result

            # Update or add the location block.
            data["location"] = canonical_location
            result["changed"] = True

            # Add normalization provenance note (handle both list and string cases).
            if "provenance" not in data:
                data["provenance"] = {}
            notes = data["provenance"].get("notes")
            normalization_note = f"Canonical location normalized on {datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')}"
            if notes is None:
                data["provenance"]["notes"] = [normalization_note]
            elif isinstance(notes, str):
                # Convert string to list and append.
                data["provenance"]["notes"] = [notes, normalization_note]
            elif isinstance(notes, list):
                # Check if we already have a normalization note (avoid duplicates).
                has_norm_note = any("Canonical location" in str(n) for n in notes)
                if not has_norm_note:
                    notes.append(normalization_note)

            if not dry_run:
                with open(filepath, 'w', encoding='utf-8') as f:
                    yaml.dump(data, f)

        # No canonical location means nothing to write, but the file was
        # still processed successfully.
        result["success"] = True
        return result

    except Exception as e:
        # Per-file failures are collected by the caller, never raised:
        # one corrupt file must not abort the batch run.
        result["error"] = str(e)
        return result


def main():
    """CLI entry point: parse args, normalize files, print a summary."""
    parser = argparse.ArgumentParser(
        description="Normalize custodian files to canonical schema v1.0.0"
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Preview changes without writing to files"
    )
    parser.add_argument(
        "--file", type=str,
        help="Process a single file (filename only, e.g., NL-ZH-SCH-A-GAS.yaml)"
    )
    parser.add_argument(
        "--limit", type=int, default=0,
        help="Limit processing to N files (0 = no limit)"
    )
    parser.add_argument(
        "--verbose", action="store_true",
        help="Show detailed output for each file"
    )
    args = parser.parse_args()

    if args.dry_run:
        print("DRY RUN - No files will be modified\n")

    # Get list of files to process.
    if args.file:
        files = [CUSTODIAN_DIR / args.file]
        if not files[0].exists():
            print(f"Error: File not found: {files[0]}")
            sys.exit(1)
    else:
        files = sorted(CUSTODIAN_DIR.glob("*.yaml"))
        if args.limit > 0:
            files = files[:args.limit]

    print(f"Processing {len(files)} files...\n")

    # Statistics
    stats = {
        "total": len(files),
        "success": 0,
        "changed": 0,
        "with_coordinates": 0,
        "without_coordinates": 0,
        "errors": 0,
        "by_source": {},
    }
    errors = []

    for i, filepath in enumerate(files):
        result = normalize_file(filepath, dry_run=args.dry_run)

        if result["success"]:
            stats["success"] += 1
            if result["changed"]:
                stats["changed"] += 1
            if result["has_coordinates"]:
                stats["with_coordinates"] += 1
                source = result["coordinate_source"]
                stats["by_source"][source] = stats["by_source"].get(source, 0) + 1
            else:
                stats["without_coordinates"] += 1
        else:
            stats["errors"] += 1
            errors.append((filepath.name, result["error"]))

        if args.verbose:
            status = "OK" if result["success"] else "ERROR"
            changed = " [CHANGED]" if result["changed"] else ""
            coords = f" coords={result['coordinate_source']}" if result["has_coordinates"] else " NO_COORDS"
            print(f"[{i+1}/{len(files)}] {filepath.name}: {status}{changed}{coords}")
        elif (i + 1) % 1000 == 0:
            print(f"Processed {i+1}/{len(files)} files...")

    # Print summary. BUGFIX: guard against division by zero when no files
    # matched (empty directory or over-restrictive --limit).
    total = stats["total"] or 1
    print("\n" + "=" * 60)
    print("NORMALIZATION SUMMARY")
    print("=" * 60)
    print(f"Total files: {stats['total']}")
    print(f"Successfully processed: {stats['success']}")
    print(f"Files changed: {stats['changed']}")
    print(f"With coordinates: {stats['with_coordinates']} ({100*stats['with_coordinates']/total:.1f}%)")
    print(f"Without coordinates: {stats['without_coordinates']} ({100*stats['without_coordinates']/total:.1f}%)")
    print(f"Errors: {stats['errors']}")

    if stats["by_source"]:
        print("\nCoordinate sources:")
        for source, count in sorted(stats["by_source"].items(), key=lambda x: -x[1]):
            print(f"  {source}: {count}")

    if errors:
        print(f"\nFirst {min(10, len(errors))} errors:")
        # BUGFIX: print the actual filename instead of the literal "(unknown)".
        for filename, error in errors[:10]:
            print(f"  {filename}: {error}")

    if args.dry_run:
        print("\n(DRY RUN - No files were modified)")


if __name__ == "__main__":
    main()