#!/usr/bin/env python3 """ Detect Google Maps domain mismatches for Type I (Intangible Heritage) custodians. Per Rule 40: KIEN Registry is authoritative for Type I custodians. Google Maps frequently returns false matches for virtual/volunteer organizations. This script: 1. Reads all NL-*-I-*.yaml files 2. Compares google_maps_enrichment.website with contact.website 3. Reports domain mismatches indicating likely false Google Maps matches """ import os import sys from pathlib import Path from urllib.parse import urlparse import yaml # Add parent dir to path for imports sys.path.insert(0, str(Path(__file__).parent.parent)) def extract_domain(url: str | None) -> str | None: """Extract domain from URL, handling common edge cases.""" if not url: return None # Normalize URL url = url.strip() if not url.startswith(('http://', 'https://')): url = 'https://' + url try: parsed = urlparse(url) domain = parsed.netloc.lower() # Remove www. prefix for comparison if domain.startswith('www.'): domain = domain[4:] return domain except Exception: return None def load_yaml_file(filepath: Path) -> dict | None: """Load a YAML file, handling errors gracefully.""" try: with open(filepath, 'r', encoding='utf-8') as f: return yaml.safe_load(f) except Exception as e: print(f" Error loading {filepath}: {e}") return None def check_domain_mismatch(data: dict) -> dict | None: """ Check if Google Maps website differs from official contact website. Returns mismatch details if found, None otherwise. """ # Get Google Maps website gmaps = data.get('google_maps_enrichment', {}) gmaps_website = gmaps.get('website') gmaps_name = gmaps.get('name', '') gmaps_status = gmaps.get('status', '') # Skip if already marked as FALSE_MATCH if gmaps_status == 'FALSE_MATCH': return None # Get official website from multiple possible sources (priority order) official_website = None # 1. Check custodian_name.official_website custodian_name = data.get('custodian_name', {}) if isinstance(custodian_name, dict): official_website = custodian_name.get('official_website') # 2. Check original_entry.webadres_organisatie (KIEN source) if not official_website: original_entry = data.get('original_entry', {}) if isinstance(original_entry, dict): official_website = original_entry.get('webadres_organisatie') # 3. Check contact.website if not official_website: contact = data.get('contact', {}) if isinstance(contact, dict): official_website = contact.get('website') # 4. Check digital_platforms for WEBSITE type if not official_website: platforms = data.get('digital_platforms', []) or [] for p in platforms: if isinstance(p, dict) and p.get('platform_type') == 'WEBSITE': official_website = p.get('platform_url') break # Extract domains gmaps_domain = extract_domain(gmaps_website) official_domain = extract_domain(official_website) # Skip if no Google Maps website if not gmaps_domain: return None # Skip if no official website to compare if not official_domain: return { 'type': 'NO_OFFICIAL_WEBSITE', 'gmaps_domain': gmaps_domain, 'gmaps_name': gmaps_name, 'gmaps_website': gmaps_website, } # Compare domains if gmaps_domain != official_domain: return { 'type': 'DOMAIN_MISMATCH', 'gmaps_domain': gmaps_domain, 'official_domain': official_domain, 'gmaps_name': gmaps_name, 'gmaps_website': gmaps_website, 'official_website': official_website, } return None def main(): """Main function to scan all Type I custodian files.""" # Find all Type I files custodian_dir = Path(__file__).parent.parent / 'data' / 'custodian' type_i_files = list(custodian_dir.glob('NL-*-I-*.yaml')) print(f"Scanning {len(type_i_files)} Type I custodian files...\n") mismatches = [] no_gmaps = [] already_fixed = [] no_official = [] ok = [] for filepath in sorted(type_i_files): data = load_yaml_file(filepath) if not data: continue # Check for Google Maps enrichment gmaps = data.get('google_maps_enrichment', {}) if not gmaps: no_gmaps.append(filepath.name) continue # Check if already marked as FALSE_MATCH if gmaps.get('status') == 'FALSE_MATCH': already_fixed.append(filepath.name) continue # Check for domain mismatch result = check_domain_mismatch(data) if result: if result['type'] == 'DOMAIN_MISMATCH': mismatches.append({ 'file': filepath.name, 'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem), **result }) elif result['type'] == 'NO_OFFICIAL_WEBSITE': no_official.append({ 'file': filepath.name, 'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem), **result }) else: ok.append(filepath.name) # Print results print("=" * 80) print("DOMAIN MISMATCHES (Likely False Google Maps Matches)") print("=" * 80) if mismatches: for m in mismatches: print(f"\n📛 {m['file']}") print(f" Custodian: {m['custodian_name']}") print(f" GMaps domain: {m['gmaps_domain']} ({m['gmaps_website']})") print(f" Official domain: {m['official_domain']} ({m['official_website']})") print(f" GMaps name: {m['gmaps_name']}") else: print("\nNo domain mismatches found!") print("\n" + "=" * 80) print("NO OFFICIAL WEBSITE (Cannot verify Google Maps match)") print("=" * 80) if no_official: for item in no_official[:10]: # Show first 10 print(f"\n⚠️ {item['file']}") print(f" Custodian: {item['custodian_name']}") print(f" GMaps: {item['gmaps_domain']} ({item['gmaps_website']})") if len(no_official) > 10: print(f"\n ... and {len(no_official) - 10} more") print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) print(f"\n✅ Domain matches OK: {len(ok)}") print(f"📛 Domain mismatches: {len(mismatches)}") print(f"⚠️ No official website: {len(no_official)}") print(f"🔧 Already fixed (FALSE_MATCH): {len(already_fixed)}") print(f"📭 No Google Maps data: {len(no_gmaps)}") print(f"\nTotal files scanned: {len(type_i_files)}") # Return exit code based on findings if mismatches: print(f"\n⚠️ {len(mismatches)} files need review for potential false Google Maps matches!") return 1 return 0 if __name__ == '__main__': sys.exit(main())