glam/scripts/fix_gmaps_false_matches.py
kempersc 85d9cee82f fix: mark 8 more Google Maps false matches detected via name mismatch
Additional Type I custodian files with obvious name mismatches between
KIEN registry entries and Google Maps results. These couldn't be
auto-detected via domain mismatch because they lack official websites.

Fixes:
- Dick Timmerman (person) → carpentry business
- Ria Bos (cigar maker) → money transfer agent
- Stichting Kracom (Krampuslauf) → Happy Caps retail
- Fed. Nederlandse Vertelorganisaties → NET Foundation
- Stichting dodenherdenking Alphen → wrong memorial
- Sao Joao Rotterdam → Heemraadsplein (location not org)
- sport en spel (heritage) → equipment rental
- Eiertikken Ommen → restaurant

Also adds detection and fix scripts for Google Maps false matches.
2026-01-08 13:26:53 +01:00

195 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Fix Google Maps false matches in Type I custodian files.
This script marks Google Maps enrichment as FALSE_MATCH when the domain
from Google Maps doesn't match the official domain from KIEN registry.
Per Rule 40: KIEN is TIER_1_AUTHORITATIVE for Type I custodians.
"""
import yaml
import sys
from pathlib import Path
from urllib.parse import urlparse
def extract_domain(url: str) -> str:
    """Extract the bare host name from *url* for domain comparison.

    Handles None/empty input and scheme-less URLs (``urlparse`` places the
    host of e.g. "example.com/page" in ``path``, not ``netloc``). The result
    is lower-cased, any path component is dropped, and a leading "www."
    prefix is stripped. Returns "" when no domain can be determined.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Scheme-less URLs land in .path; drop any trailing path segment so
        # "example.com/page" compares equal to "https://example.com".
        host = (parsed.netloc or parsed.path).split('/')[0].lower()
        # Strip only a *leading* "www." — the previous replace("www.", "")
        # also mangled domains that merely contained that substring.
        if host.startswith("www."):
            host = host[4:]
        return host
    except Exception:
        # Defensive catch-all: malformed input yields "" rather than raising,
        # so callers can treat "" uniformly as "no domain available".
        return ""
def fix_gmaps_false_match(file_path: Path, dry_run: bool = False) -> dict:
    """
    Fix a single file's Google Maps enrichment if it's a false match.

    Compares the domain of the Google Maps result against the custodian's
    official website domain (taken from ``contact.website`` or, failing
    that, the first WEBSITE entry in ``digital_platforms``). When the two
    domains differ, the enrichment block is replaced with a FALSE_MATCH
    record (the original enrichment is preserved inside it), Google-Maps
    -sourced coordinates are stripped from ``location``, and a correction
    entry is appended to ``provenance.corrections``.

    Returns dict with status info: always contains 'file' and 'action'
    ('fixed' or 'skipped'); skipped results carry a 'reason', fixed
    results carry 'gmaps_name', 'gmaps_domain' and 'official_domain'.
    With ``dry_run=True`` the file on disk is left untouched but the
    returned dict still reports 'fixed'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    result = {
        'file': file_path.name,
        'action': 'skipped',
        'reason': None
    }
    # Check if already marked as FALSE_MATCH — idempotency guard so the
    # script can be re-run safely over the same files.
    gmaps = data.get('google_maps_enrichment', {})
    if gmaps.get('status') == 'FALSE_MATCH':
        result['reason'] = 'Already marked as FALSE_MATCH'
        return result
    # Get domains to compare
    gmaps_website = gmaps.get('website', '')
    # Check contact website first
    official_website = data.get('contact', {}).get('website', '')
    # If no contact website, check digital_platforms for WEBSITE type
    if not official_website:
        for p in data.get('digital_platforms', []):
            if p.get('platform_type') == 'WEBSITE':
                official_website = p.get('platform_url', '')
                break
    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)
    # Without both domains there is nothing to compare; leave the file
    # alone (this is why name-mismatch-only cases need manual handling).
    if not gmaps_domain or not official_domain:
        result['reason'] = 'Missing domain info'
        return result
    # Check if domains match
    if gmaps_domain == official_domain:
        result['reason'] = 'Domains match'
        return result
    # Domains don't match - this is a false match
    gmaps_name = gmaps.get('name', 'Unknown')
    custodian_name = data.get('custodian_name', {}).get('claim_value', 'Unknown')
    # Create the FALSE_MATCH structure; the original enrichment is kept
    # under 'original_false_match' so the change is reversible.
    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            f'Google Maps returned "{gmaps_name}" (website: {gmaps_website}) '
            f'instead of "{custodian_name}" (official website: {official_website}). '
            f'Domain mismatch: {gmaps_domain} vs {official_domain}. '
            'Per Rule 40: KIEN is authoritative for Type I intangible heritage custodians.'
        ),
        'original_false_match': gmaps,
        # NOTE(review): hardcoded timestamp — confirm the year; the commit
        # message is dated 2026-01-08 while these records say 2025-01-08.
        'correction_timestamp': '2025-01-08T00:00:00Z',
        'correction_agent': 'opencode-claude-sonnet-4'
    }
    # Fix location if it has Google Maps coordinates
    location = data.get('location', {})
    coord_prov = location.get('coordinate_provenance', {})
    if coord_prov.get('source_type') == 'GOOGLE_MAPS':
        # Remove coordinates, keep city info.
        # NOTE(review): this rebuilds the location dict from a fixed key
        # list, so any keys not listed below are silently dropped —
        # confirm that is intended for all Type I files.
        data['location'] = {
            'city': location.get('city'),
            'region_code': location.get('region_code'),
            'country': location.get('country', 'NL'),
            'geonames_id': location.get('geonames_id'),
            'geonames_name': location.get('geonames_name'),
            'feature_code': location.get('feature_code'),
            'note': (
                'Coordinates removed due to Google Maps false match. '
                f'Original coordinates were from "{gmaps_name}".'
            ),
            # Original lat/lon preserved here for auditability.
            'coordinate_provenance_removed': {
                'reason': 'FALSE_MATCH',
                'original_latitude': location.get('latitude'),
                'original_longitude': location.get('longitude'),
            },
            'normalization_timestamp': '2025-01-08T00:00:00Z'
        }
    # Add provenance correction (creating the containers if absent).
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'corrections' not in data['provenance']:
        data['provenance']['corrections'] = []
    data['provenance']['corrections'].append({
        'correction_date': '2025-01-08T00:00:00Z',
        'correction_type': 'google_maps_false_match',
        'description': (
            f'Marked Google Maps enrichment as FALSE_MATCH. '
            f'GMaps returned "{gmaps_name}" ({gmaps_domain}) instead of '
            f'"{custodian_name}" ({official_domain}).'
        ),
        'corrected_by': 'opencode-claude-sonnet-4'
    })
    # Only write back when not a dry run; the result dict reports 'fixed'
    # either way so dry runs show what *would* change.
    if not dry_run:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100)
    result['action'] = 'fixed'
    result['gmaps_name'] = gmaps_name
    result['gmaps_domain'] = gmaps_domain
    result['official_domain'] = official_domain
    return result
def main():
    """Command-line entry point.

    Processes either the files named on the command line or every Type I
    custodian YAML file in the custodian directory, marking Google Maps
    enrichments as FALSE_MATCH wherever the Google Maps domain disagrees
    with the official website domain. Prints a per-file report and a
    fixed/skipped summary.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix Google Maps false matches')
    parser.add_argument('files', nargs='*',
                        help='Specific files to fix (or all Type I if none)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be fixed without changing files')
    # Previously hard-coded to a developer-machine path; kept as the default
    # for backward compatibility, but now overridable so the script runs on
    # other machines / CI checkouts.
    parser.add_argument('--custodian-dir',
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory containing custodian YAML files')
    args = parser.parse_args()
    custodian_dir = Path(args.custodian_dir)
    if args.files:
        # Relative names are resolved against the custodian dir; absolute
        # paths pass through unchanged (pathlib "/" semantics).
        files = [custodian_dir / f for f in args.files]
    else:
        # Type I custodian files follow the "*-I-*.yaml" naming convention.
        files = list(custodian_dir.glob("*-I-*.yaml"))
    prefix = '[DRY RUN] ' if args.dry_run else ''
    print(f"{prefix}Processing {len(files)} files...")
    print()
    fixed = []
    skipped = []
    for file_path in sorted(files):
        if not file_path.exists():
            print(f"⚠️ {file_path.name}: File not found")
            continue
        try:
            result = fix_gmaps_false_match(file_path, dry_run=args.dry_run)
            if result['action'] == 'fixed':
                fixed.append(result)
                print(f"{result['file']}")
                print(f" GMaps: {result['gmaps_name']} ({result['gmaps_domain']})")
                print(f" Official: {result['official_domain']}")
            else:
                skipped.append(result)
                # Skip reasons are only interesting when previewing.
                if args.dry_run:
                    print(f"⏭️ {result['file']}: {result['reason']}")
        except Exception as e:
            # Report and continue: one malformed YAML file should not
            # abort the whole batch.
            print(f"{file_path.name}: {e}")
    print()
    print(f"{prefix}Summary:")
    print(f" Fixed: {len(fixed)}")
    print(f" Skipped: {len(skipped)}")


if __name__ == '__main__':
    main()