Additional Type I custodian files with obvious name mismatches between KIEN registry entries and Google Maps results. These couldn't be auto-detected via domain mismatch because they lack official websites. Fixes: - Dick Timmerman (person) → carpentry business - Ria Bos (cigar maker) → money transfer agent - Stichting Kracom (Krampuslauf) → Happy Caps retail - Fed. Nederlandse Vertelorganisaties → NET Foundation - Stichting dodenherdenking Alphen → wrong memorial - Sao Joao Rotterdam → Heemraadsplein (location not org) - sport en spel (heritage) → equipment rental - Eiertikken Ommen → restaurant Also adds detection and fix scripts for Google Maps false matches.
224 lines · 7.2 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Detect Google Maps domain mismatches for Type I (Intangible Heritage) custodians.
|
|
|
|
Per Rule 40: KIEN Registry is authoritative for Type I custodians.
|
|
Google Maps frequently returns false matches for virtual/volunteer organizations.
|
|
|
|
This script:
|
|
1. Reads all NL-*-I-*.yaml files
|
|
2. Compares google_maps_enrichment.website with contact.website
|
|
3. Reports domain mismatches indicating likely false Google Maps matches
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
import yaml
|
|
|
|
# Add parent dir to path for imports
# Lets this script be run directly (e.g. `python scripts/detect_mismatches.py`)
# while still importing sibling project modules from the repository root.
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
|
|
def extract_domain(url: str | None) -> str | None:
|
|
"""Extract domain from URL, handling common edge cases."""
|
|
if not url:
|
|
return None
|
|
|
|
# Normalize URL
|
|
url = url.strip()
|
|
if not url.startswith(('http://', 'https://')):
|
|
url = 'https://' + url
|
|
|
|
try:
|
|
parsed = urlparse(url)
|
|
domain = parsed.netloc.lower()
|
|
# Remove www. prefix for comparison
|
|
if domain.startswith('www.'):
|
|
domain = domain[4:]
|
|
return domain
|
|
except Exception:
|
|
return None
|
|
|
|
|
|
def load_yaml_file(filepath: Path) -> dict | None:
    """Parse *filepath* as YAML.

    Any failure (missing file, bad encoding, invalid YAML) is reported to
    stdout and mapped to ``None`` so the caller's scan loop can continue.
    """
    try:
        text = filepath.read_text(encoding='utf-8')
        return yaml.safe_load(text)
    except Exception as e:
        print(f" Error loading {filepath}: {e}")
        return None
|
|
|
|
|
|
def _find_official_website(data: dict) -> str | None:
    """Return the custodian's official website, checking sources in priority order."""
    # 1. custodian_name.official_website
    name_block = data.get('custodian_name', {})
    if isinstance(name_block, dict) and name_block.get('official_website'):
        return name_block['official_website']

    # 2. original_entry.webadres_organisatie (KIEN source)
    entry = data.get('original_entry', {})
    if isinstance(entry, dict) and entry.get('webadres_organisatie'):
        return entry['webadres_organisatie']

    # 3. contact.website
    contact = data.get('contact', {})
    if isinstance(contact, dict) and contact.get('website'):
        return contact['website']

    # 4. First digital_platforms entry of type WEBSITE
    for platform in data.get('digital_platforms', []) or []:
        if isinstance(platform, dict) and platform.get('platform_type') == 'WEBSITE':
            return platform.get('platform_url')

    return None


def check_domain_mismatch(data: dict) -> dict | None:
    """
    Compare the Google-Maps-reported website against the official website.

    Returns a dict describing the finding — ``type`` is either
    ``'DOMAIN_MISMATCH'`` or ``'NO_OFFICIAL_WEBSITE'`` — or ``None`` when the
    domains agree, there is no Google Maps website, or the entry is already
    marked ``FALSE_MATCH``.
    """
    gmaps = data.get('google_maps_enrichment', {})
    gmaps_website = gmaps.get('website')
    gmaps_name = gmaps.get('name', '')

    # Entries already flagged as false matches need no re-checking.
    if gmaps.get('status', '') == 'FALSE_MATCH':
        return None

    official_website = _find_official_website(data)

    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)

    # Nothing to check without a Google Maps website.
    if not gmaps_domain:
        return None

    # A Google Maps hit we cannot verify against anything official.
    if not official_domain:
        return {
            'type': 'NO_OFFICIAL_WEBSITE',
            'gmaps_domain': gmaps_domain,
            'gmaps_name': gmaps_name,
            'gmaps_website': gmaps_website,
        }

    if gmaps_domain == official_domain:
        return None

    return {
        'type': 'DOMAIN_MISMATCH',
        'gmaps_domain': gmaps_domain,
        'official_domain': official_domain,
        'gmaps_name': gmaps_name,
        'gmaps_website': gmaps_website,
        'official_website': official_website,
    }
|
|
|
|
|
|
def main():
    """Scan all Type I custodian YAML files for Google Maps domain mismatches.

    Buckets every file into one of five categories (mismatch, no official
    website, already fixed, no Google Maps data, OK), prints a human-readable
    report, and returns a process exit code: 1 if any domain mismatches were
    found (files need manual review), else 0.
    """
    # Find all Type I files (NL-*-I-*.yaml under data/custodian, relative
    # to the repository root — this script lives one directory below it).
    custodian_dir = Path(__file__).parent.parent / 'data' / 'custodian'
    type_i_files = list(custodian_dir.glob('NL-*-I-*.yaml'))

    print(f"Scanning {len(type_i_files)} Type I custodian files...\n")

    # Per-category accumulators; mismatches/no_official hold detail dicts,
    # the rest just filenames.
    mismatches = []
    no_gmaps = []
    already_fixed = []
    no_official = []
    ok = []

    for filepath in sorted(type_i_files):
        data = load_yaml_file(filepath)
        if not data:
            # Unreadable or empty YAML — already reported by load_yaml_file.
            continue

        # Check for Google Maps enrichment
        gmaps = data.get('google_maps_enrichment', {})
        if not gmaps:
            no_gmaps.append(filepath.name)
            continue

        # Check if already marked as FALSE_MATCH (previously fixed by hand).
        if gmaps.get('status') == 'FALSE_MATCH':
            already_fixed.append(filepath.name)
            continue

        # Check for domain mismatch
        result = check_domain_mismatch(data)

        if result:
            if result['type'] == 'DOMAIN_MISMATCH':
                mismatches.append({
                    'file': filepath.name,
                    # Fall back to the filename stem when no emic_name exists.
                    'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem),
                    **result
                })
            elif result['type'] == 'NO_OFFICIAL_WEBSITE':
                no_official.append({
                    'file': filepath.name,
                    'custodian_name': data.get('custodian_name', {}).get('emic_name', filepath.stem),
                    **result
                })
        else:
            ok.append(filepath.name)

    # Print results: mismatches first (the actionable findings).
    print("=" * 80)
    print("DOMAIN MISMATCHES (Likely False Google Maps Matches)")
    print("=" * 80)

    if mismatches:
        for m in mismatches:
            print(f"\n📛 {m['file']}")
            print(f" Custodian: {m['custodian_name']}")
            print(f" GMaps domain: {m['gmaps_domain']} ({m['gmaps_website']})")
            print(f" Official domain: {m['official_domain']} ({m['official_website']})")
            print(f" GMaps name: {m['gmaps_name']}")
    else:
        print("\nNo domain mismatches found!")

    print("\n" + "=" * 80)
    print("NO OFFICIAL WEBSITE (Cannot verify Google Maps match)")
    print("=" * 80)

    if no_official:
        for item in no_official[:10]:  # Show first 10 to keep output readable
            print(f"\n⚠️ {item['file']}")
            print(f" Custodian: {item['custodian_name']}")
            print(f" GMaps: {item['gmaps_domain']} ({item['gmaps_website']})")
        if len(no_official) > 10:
            print(f"\n ... and {len(no_official) - 10} more")

    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"\n✅ Domain matches OK: {len(ok)}")
    print(f"📛 Domain mismatches: {len(mismatches)}")
    print(f"⚠️ No official website: {len(no_official)}")
    print(f"🔧 Already fixed (FALSE_MATCH): {len(already_fixed)}")
    print(f"📭 No Google Maps data: {len(no_gmaps)}")
    print(f"\nTotal files scanned: {len(type_i_files)}")

    # Return exit code based on findings (1 = review needed, 0 = clean).
    if mismatches:
        print(f"\n⚠️ {len(mismatches)} files need review for potential false Google Maps matches!")
        return 1
    return 0
|
|
|
|
|
|
if __name__ == '__main__':
    # Propagate main()'s status code to the shell.
    raise SystemExit(main())
|