glam/scripts/detect_gmaps_mismatches.py
kempersc 85d9cee82f fix: mark 8 more Google Maps false matches detected via name mismatch
Additional Type I custodian files with obvious name mismatches between
KIEN registry entries and Google Maps results. These couldn't be
auto-detected via domain mismatch because they lack official websites.

Fixes:
- Dick Timmerman (person) → carpentry business
- Ria Bos (cigar maker) → money transfer agent
- Stichting Kracom (Krampuslauf) → Happy Caps retail
- Fed. Nederlandse Vertelorganisaties → NET Foundation
- Stichting dodenherdenking Alphen → wrong memorial
- Sao Joao Rotterdam → Heemraadsplein (location not org)
- sport en spel (heritage) → equipment rental
- Eiertikken Ommen → restaurant

Also adds detection and fix scripts for Google Maps false matches.
2026-01-08 13:26:53 +01:00

224 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Detect Google Maps domain mismatches for Type I (Intangible Heritage) custodians.
Per Rule 40: KIEN Registry is authoritative for Type I custodians.
Google Maps frequently returns false matches for virtual/volunteer organizations.
This script:
1. Reads all NL-*-I-*.yaml files
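2. Compares google_maps_enrichment.website with the official website, taken
   from the first available of custodian_name.official_website,
   original_entry.webadres_organisatie, contact.website, or a WEBSITE entry
   in digital_platforms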
3. Reports domain mismatches indicating likely false Google Maps matches
"""
import os
import sys
from pathlib import Path
from urllib.parse import urlparse
import yaml
# Put the directory two levels above this file (i.e. the parent of the
# directory that contains this script) at the front of sys.path so that
# modules located there can be imported when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))
def extract_domain(url: str | None) -> str | None:
"""Extract domain from URL, handling common edge cases."""
if not url:
return None
# Normalize URL
url = url.strip()
if not url.startswith(('http://', 'https://')):
url = 'https://' + url
try:
parsed = urlparse(url)
domain = parsed.netloc.lower()
# Remove www. prefix for comparison
if domain.startswith('www.'):
domain = domain[4:]
return domain
except Exception:
return None
def load_yaml_file(filepath: Path) -> dict | None:
"""Load a YAML file, handling errors gracefully."""
try:
with open(filepath, 'r', encoding='utf-8') as f:
return yaml.safe_load(f)
except Exception as e:
print(f" Error loading {filepath}: {e}")
return None
def check_domain_mismatch(data: dict) -> dict | None:
"""
Check if Google Maps website differs from official contact website.
Returns mismatch details if found, None otherwise.
"""
# Get Google Maps website
gmaps = data.get('google_maps_enrichment', {})
gmaps_website = gmaps.get('website')
gmaps_name = gmaps.get('name', '')
gmaps_status = gmaps.get('status', '')
# Skip if already marked as FALSE_MATCH
if gmaps_status == 'FALSE_MATCH':
return None
# Get official website from multiple possible sources (priority order)
official_website = None
# 1. Check custodian_name.official_website
custodian_name = data.get('custodian_name', {})
if isinstance(custodian_name, dict):
official_website = custodian_name.get('official_website')
# 2. Check original_entry.webadres_organisatie (KIEN source)
if not official_website:
original_entry = data.get('original_entry', {})
if isinstance(original_entry, dict):
official_website = original_entry.get('webadres_organisatie')
# 3. Check contact.website
if not official_website:
contact = data.get('contact', {})
if isinstance(contact, dict):
official_website = contact.get('website')
# 4. Check digital_platforms for WEBSITE type
if not official_website:
platforms = data.get('digital_platforms', []) or []
for p in platforms:
if isinstance(p, dict) and p.get('platform_type') == 'WEBSITE':
official_website = p.get('platform_url')
break
# Extract domains
gmaps_domain = extract_domain(gmaps_website)
official_domain = extract_domain(official_website)
# Skip if no Google Maps website
if not gmaps_domain:
return None
# Skip if no official website to compare
if not official_domain:
return {
'type': 'NO_OFFICIAL_WEBSITE',
'gmaps_domain': gmaps_domain,
'gmaps_name': gmaps_name,
'gmaps_website': gmaps_website,
}
# Compare domains
if gmaps_domain != official_domain:
return {
'type': 'DOMAIN_MISMATCH',
'gmaps_domain': gmaps_domain,
'official_domain': official_domain,
'gmaps_name': gmaps_name,
'gmaps_website': gmaps_website,
'official_website': official_website,
}
return None
def _custodian_display_name(data: dict, fallback: str) -> str:
    """Return custodian_name.emic_name, or *fallback* when it is unavailable."""
    custodian_name = data.get('custodian_name')
    # custodian_name is not guaranteed to be a mapping.
    if isinstance(custodian_name, dict):
        return custodian_name.get('emic_name') or fallback
    return fallback


def main():
    """Scan all Type I custodian files and report Google Maps domain mismatches.

    Files matching ``NL-*-I-*.yaml`` in ``data/custodian`` (relative to the
    directory two levels above this script) are loaded and classified, then a
    report and a summary are printed to stdout.

    Returns:
        0 when no domain mismatch was found, 1 when at least one file needs
        review, 2 when the custodian data directory does not exist.
    """
    custodian_dir = Path(__file__).parent.parent / 'data' / 'custodian'
    if not custodian_dir.is_dir():
        # Globbing a missing directory yields nothing, which would otherwise
        # be reported as a clean run.
        print(f"Custodian directory not found: {custodian_dir}", file=sys.stderr)
        return 2

    type_i_files = sorted(custodian_dir.glob('NL-*-I-*.yaml'))
    print(f"Scanning {len(type_i_files)} Type I custodian files...\n")

    mismatches = []
    no_gmaps = []
    already_fixed = []
    no_official = []
    ok = []
    skipped = []  # unreadable, empty, or not a mapping

    for filepath in type_i_files:
        data = load_yaml_file(filepath)
        if not isinstance(data, dict) or not data:
            skipped.append(filepath.name)
            continue

        gmaps = data.get('google_maps_enrichment')
        # A missing, empty, or non-mapping enrichment counts as "no data".
        if not isinstance(gmaps, dict) or not gmaps:
            no_gmaps.append(filepath.name)
            continue

        if gmaps.get('status') == 'FALSE_MATCH':
            already_fixed.append(filepath.name)
            continue

        result = check_domain_mismatch(data)
        if not result:
            # Also reached when Google Maps lists no website to compare.
            ok.append(filepath.name)
            continue

        entry = {
            'file': filepath.name,
            'custodian_name': _custodian_display_name(data, filepath.stem),
            **result,
        }
        if result['type'] == 'DOMAIN_MISMATCH':
            mismatches.append(entry)
        elif result['type'] == 'NO_OFFICIAL_WEBSITE':
            no_official.append(entry)

    # Print results
    print("=" * 80)
    print("DOMAIN MISMATCHES (Likely False Google Maps Matches)")
    print("=" * 80)
    if mismatches:
        for m in mismatches:
            print(f"\n📛 {m['file']}")
            print(f" Custodian: {m['custodian_name']}")
            print(f" GMaps domain: {m['gmaps_domain']} ({m['gmaps_website']})")
            print(f" Official domain: {m['official_domain']} ({m['official_website']})")
            print(f" GMaps name: {m['gmaps_name']}")
    else:
        print("\nNo domain mismatches found!")

    print("\n" + "=" * 80)
    print("NO OFFICIAL WEBSITE (Cannot verify Google Maps match)")
    print("=" * 80)
    preview_limit = 10  # keep the unverifiable list short; the summary has the total
    if no_official:
        for item in no_official[:preview_limit]:
            print(f"\n⚠️ {item['file']}")
            print(f" Custodian: {item['custodian_name']}")
            print(f" GMaps: {item['gmaps_domain']} ({item['gmaps_website']})")
        if len(no_official) > preview_limit:
            print(f"\n ... and {len(no_official) - preview_limit} more")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"\n✅ Domain matches OK: {len(ok)}")
    print(f"📛 Domain mismatches: {len(mismatches)}")
    print(f"⚠️ No official website: {len(no_official)}")
    print(f"🔧 Already fixed (FALSE_MATCH): {len(already_fixed)}")
    print(f"📭 No Google Maps data: {len(no_gmaps)}")
    if skipped:
        # Only shown when relevant so the per-category counts add up.
        print(f"Skipped (unreadable or empty): {len(skipped)}")
    print(f"\nTotal files scanned: {len(type_i_files)}")

    # Return exit code based on findings
    if mismatches:
        print(f"\n⚠️ {len(mismatches)} files need review for potential false Google Maps matches!")
        return 1
    return 0
# Script entry point: run the scan and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())