Additional Type I custodian files with obvious name mismatches between KIEN registry entries and Google Maps results. These couldn't be auto-detected via domain mismatch because they lack official websites. Fixes: - Dick Timmerman (person) → carpentry business - Ria Bos (cigar maker) → money transfer agent - Stichting Kracom (Krampuslauf) → Happy Caps retail - Fed. Nederlandse Vertelorganisaties → NET Foundation - Stichting dodenherdenking Alphen → wrong memorial - Sao Joao Rotterdam → Heemraadsplein (location not org) - sport en spel (heritage) → equipment rental - Eiertikken Ommen → restaurant Also adds detection and fix scripts for Google Maps false matches.
224 lines · 7.2 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Detect Google Maps domain mismatches for Type I (Intangible Heritage) custodians.
|
|
|
|
Per Rule 40: KIEN Registry is authoritative for Type I custodians.
|
|
Google Maps frequently returns false matches for virtual/volunteer organizations.
|
|
|
|
This script:
|
|
1. Reads all NL-*-I-*.yaml files
|
|
2. Compares google_maps_enrichment.website with contact.website
|
|
3. Reports domain mismatches indicating likely false Google Maps matches
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
import yaml
|
|
|
|
# Add parent dir to path for imports
# Lets this script be run directly (e.g. `python scripts/detect_mismatches.py`)
# while still importing sibling project modules from the repository root.
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
|
|
def extract_domain(url: str | None) -> str | None:
|
|
"""Extract domain from URL, handling common edge cases."""
|
|
if not url:
|
|
return None
|
|
|
|
# Normalize URL
|
|
url = url.strip()
|
|
if not url.startswith(('http://', 'https://')):
|
|
url = 'https://' + url
|
|
|
|
try:
|
|
parsed = urlparse(url)
|
|
domain = parsed.netloc.lower()
|
|
# Remove www. prefix for comparison
|
|
if domain.startswith('www.'):
|
|
domain = domain[4:]
|
|
return domain
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def load_yaml_file(filepath: Path) -> dict | None:
    """Parse *filepath* as YAML.

    Any failure (missing file, bad encoding, invalid YAML) is reported to
    stdout and mapped to ``None`` so the caller's scan loop can continue.
    """
    try:
        text = filepath.read_text(encoding='utf-8')
        return yaml.safe_load(text)
    except Exception as e:
        print(f" Error loading {filepath}: {e}")
        return None
|
|
|
|
|
|
def _find_official_website(data: dict) -> str | None:
    """Return the custodian's official website, checking sources in priority order."""
    # 1. custodian_name.official_website
    name_block = data.get('custodian_name', {})
    if isinstance(name_block, dict) and name_block.get('official_website'):
        return name_block['official_website']

    # 2. original_entry.webadres_organisatie (KIEN source)
    entry = data.get('original_entry', {})
    if isinstance(entry, dict) and entry.get('webadres_organisatie'):
        return entry['webadres_organisatie']

    # 3. contact.website
    contact = data.get('contact', {})
    if isinstance(contact, dict) and contact.get('website'):
        return contact['website']

    # 4. First digital_platforms entry of type WEBSITE
    for platform in data.get('digital_platforms', []) or []:
        if isinstance(platform, dict) and platform.get('platform_type') == 'WEBSITE':
            return platform.get('platform_url')

    return None


def check_domain_mismatch(data: dict) -> dict | None:
    """
    Compare the Google-Maps-reported website against the official website.

    Returns a dict describing the finding — ``type`` is either
    ``'DOMAIN_MISMATCH'`` or ``'NO_OFFICIAL_WEBSITE'`` — or ``None`` when the
    domains agree, there is no Google Maps website, or the entry is already
    marked ``FALSE_MATCH``.
    """
    gmaps = data.get('google_maps_enrichment', {})
    gmaps_website = gmaps.get('website')
    gmaps_name = gmaps.get('name', '')

    # Entries already flagged as false matches need no re-checking.
    if gmaps.get('status', '') == 'FALSE_MATCH':
        return None

    official_website = _find_official_website(data)

    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)

    # Nothing to check without a Google Maps website.
    if not gmaps_domain:
        return None

    # A Google Maps hit we cannot verify against anything official.
    if not official_domain:
        return {
            'type': 'NO_OFFICIAL_WEBSITE',
            'gmaps_domain': gmaps_domain,
            'gmaps_name': gmaps_name,
            'gmaps_website': gmaps_website,
        }

    if gmaps_domain == official_domain:
        return None

    return {
        'type': 'DOMAIN_MISMATCH',
        'gmaps_domain': gmaps_domain,
        'official_domain': official_domain,
        'gmaps_name': gmaps_name,
        'gmaps_website': gmaps_website,
        'official_website': official_website,
    }
|
|
|
|
|
|
def main():
    """Scan all Type I custodian YAML files for Google Maps domain mismatches.

    Buckets every file into one of five categories (mismatch, no official
    website, already fixed, no Google Maps data, OK), prints a human-readable
    report, and returns a process exit code: 1 if any domain mismatches were
    found (files need manual review), else 0.
    """
    # Find all Type I files (NL-*-I-*.yaml under data/custodian, relative
    # to the repository root — this script lives one directory below it).
    custodian_dir = Path(__file__).parent.parent / 'data' / 'custodian'
    type_i_files = list(custodian_dir.glob('NL-*-I-*.yaml'))

    print(f"Scanning {len(type_i_files)} Type I custodian files...\n")

    # Per-category accumulators; mismatches/no_official hold detail dicts,
    # the rest just filenames.
    mismatches = []
    no_gmaps = []
    already_fixed = []
    no_official = []
    ok = []

    for filepath in sorted(type_i_files):
        data = load_yaml_file(filepath)
        if not data:
            # Unreadable or empty YAML — already reported by load_yaml_file.
            continue

        # Check for Google Maps enrichment
        gmaps = data.get('google_maps_enrichment', {})
        if not gmaps:
            no_gmaps.append(filepath.name)
            continue

        # Check if already marked as FALSE_MATCH (previously fixed by hand).
        if gmaps.get('status') == 'FALSE_MATCH':
            already_fixed.append(filepath.name)
            continue

        # Check for domain mismatch
        result = check_domain_mismatch(data)

        if result:
            if result['type'] == 'DOMAIN_MISMATCH':
                mismatches.append({
                    'file': filepath.name,
                    # Fall back to the filename stem when no emic_name exists.
                    'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem),
                    **result
                })
            elif result['type'] == 'NO_OFFICIAL_WEBSITE':
                no_official.append({
                    'file': filepath.name,
                    'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem),
                    **result
                })
        else:
            ok.append(filepath.name)

    # Print results: mismatches first (the actionable findings).
    print("=" * 80)
    print("DOMAIN MISMATCHES (Likely False Google Maps Matches)")
    print("=" * 80)

    if mismatches:
        for m in mismatches:
            print(f"\n📛 {m['file']}")
            print(f" Custodian: {m['custodian_name']}")
            print(f" GMaps domain: {m['gmaps_domain']} ({m['gmaps_website']})")
            print(f" Official domain: {m['official_domain']} ({m['official_website']})")
            print(f" GMaps name: {m['gmaps_name']}")
    else:
        print("\nNo domain mismatches found!")

    print("\n" + "=" * 80)
    print("NO OFFICIAL WEBSITE (Cannot verify Google Maps match)")
    print("=" * 80)

    if no_official:
        for item in no_official[:10]:  # Show first 10 to keep output readable
            print(f"\n⚠️ {item['file']}")
            print(f" Custodian: {item['custodian_name']}")
            print(f" GMaps: {item['gmaps_domain']} ({item['gmaps_website']})")
        if len(no_official) > 10:
            print(f"\n ... and {len(no_official) - 10} more")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"\n✅ Domain matches OK: {len(ok)}")
    print(f"📛 Domain mismatches: {len(mismatches)}")
    print(f"⚠️ No official website: {len(no_official)}")
    print(f"🔧 Already fixed (FALSE_MATCH): {len(already_fixed)}")
    print(f"📭 No Google Maps data: {len(no_gmaps)}")
    print(f"\nTotal files scanned: {len(type_i_files)}")

    # Return exit code based on findings (1 = review needed, 0 = clean).
    if mismatches:
        print(f"\n⚠️ {len(mismatches)} files need review for potential false Google Maps matches!")
        return 1
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s status code to the shell.
    raise SystemExit(main())
|