glam/scripts/fix_gmaps_false_matches.py
kempersc 85d9cee82f fix: mark 8 more Google Maps false matches detected via name mismatch
Additional Type I custodian files with obvious name mismatches between
KIEN registry entries and Google Maps results. These couldn't be
auto-detected via domain mismatch because they lack official websites.

Fixes:
- Dick Timmerman (person) → carpentry business
- Ria Bos (cigar maker) → money transfer agent
- Stichting Kracom (Krampuslauf) → Happy Caps retail
- Fed. Nederlandse Vertelorganisaties → NET Foundation
- Stichting dodenherdenking Alphen → wrong memorial
- Sao Joao Rotterdam → Heemraadsplein (location not org)
- sport en spel (heritage) → equipment rental
- Eiertikken Ommen → restaurant

Also adds detection and fix scripts for Google Maps false matches.
2026-01-08 13:26:53 +01:00

195 lines
6.7 KiB
Python

#!/usr/bin/env python3
"""
Fix Google Maps false matches in Type I custodian files.
This script marks Google Maps enrichment as FALSE_MATCH when the domain
from Google Maps doesn't match the official domain from KIEN registry.
Per Rule 40: KIEN is TIER_1_AUTHORITATIVE for Type I custodians.
"""
import yaml
import sys
from pathlib import Path
from urllib.parse import urlparse
def extract_domain(url: str) -> str:
    """Extract the bare host name from *url* for domain comparison.

    Handles None/empty input and scheme-less URLs (``urlparse`` places the
    host of e.g. "example.com/page" in ``path``, not ``netloc``). The result
    is lower-cased, any path component is dropped, and a leading "www."
    prefix is stripped. Returns "" when no domain can be determined.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Scheme-less URLs land in .path; drop any trailing path segment so
        # "example.com/page" compares equal to "https://example.com".
        host = (parsed.netloc or parsed.path).split('/')[0].lower()
        # Strip only a *leading* "www." — the previous replace("www.", "")
        # also mangled domains that merely contained that substring.
        if host.startswith("www."):
            host = host[4:]
        return host
    except Exception:
        # Defensive catch-all: malformed input yields "" rather than raising,
        # so callers can treat "" uniformly as "no domain available".
        return ""
def fix_gmaps_false_match(file_path: Path, dry_run: bool = False) -> dict:
    """
    Fix a single file's Google Maps enrichment if it's a false match.

    Compares the domain of the Google Maps result against the custodian's
    official website domain (taken from ``contact.website`` or, failing
    that, the first WEBSITE entry in ``digital_platforms``). When the two
    domains differ, the enrichment block is replaced with a FALSE_MATCH
    record (the original enrichment is preserved inside it), Google-Maps
    -sourced coordinates are stripped from ``location``, and a correction
    entry is appended to ``provenance.corrections``.

    Returns dict with status info: always contains 'file' and 'action'
    ('fixed' or 'skipped'); skipped results carry a 'reason', fixed
    results carry 'gmaps_name', 'gmaps_domain' and 'official_domain'.
    With ``dry_run=True`` the file on disk is left untouched but the
    returned dict still reports 'fixed'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    result = {
        'file': file_path.name,
        'action': 'skipped',
        'reason': None
    }
    # Check if already marked as FALSE_MATCH — idempotency guard so the
    # script can be re-run safely over the same files.
    gmaps = data.get('google_maps_enrichment', {})
    if gmaps.get('status') == 'FALSE_MATCH':
        result['reason'] = 'Already marked as FALSE_MATCH'
        return result
    # Get domains to compare
    gmaps_website = gmaps.get('website', '')
    # Check contact website first
    official_website = data.get('contact', {}).get('website', '')
    # If no contact website, check digital_platforms for WEBSITE type
    if not official_website:
        for p in data.get('digital_platforms', []):
            if p.get('platform_type') == 'WEBSITE':
                official_website = p.get('platform_url', '')
                break
    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)
    # Without both domains there is nothing to compare; leave the file
    # alone (this is why name-mismatch-only cases need manual handling).
    if not gmaps_domain or not official_domain:
        result['reason'] = 'Missing domain info'
        return result
    # Check if domains match
    if gmaps_domain == official_domain:
        result['reason'] = 'Domains match'
        return result
    # Domains don't match - this is a false match
    gmaps_name = gmaps.get('name', 'Unknown')
    custodian_name = data.get('custodian_name', {}).get('claim_value', 'Unknown')
    # Create the FALSE_MATCH structure; the original enrichment is kept
    # under 'original_false_match' so the change is reversible.
    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            f'Google Maps returned "{gmaps_name}" (website: {gmaps_website}) '
            f'instead of "{custodian_name}" (official website: {official_website}). '
            f'Domain mismatch: {gmaps_domain} vs {official_domain}. '
            'Per Rule 40: KIEN is authoritative for Type I intangible heritage custodians.'
        ),
        'original_false_match': gmaps,
        # NOTE(review): hardcoded timestamp — confirm the year; the commit
        # message is dated 2026-01-08 while these records say 2025-01-08.
        'correction_timestamp': '2025-01-08T00:00:00Z',
        'correction_agent': 'opencode-claude-sonnet-4'
    }
    # Fix location if it has Google Maps coordinates
    location = data.get('location', {})
    coord_prov = location.get('coordinate_provenance', {})
    if coord_prov.get('source_type') == 'GOOGLE_MAPS':
        # Remove coordinates, keep city info.
        # NOTE(review): this rebuilds the location dict from a fixed key
        # list, so any keys not listed below are silently dropped —
        # confirm that is intended for all Type I files.
        data['location'] = {
            'city': location.get('city'),
            'region_code': location.get('region_code'),
            'country': location.get('country', 'NL'),
            'geonames_id': location.get('geonames_id'),
            'geonames_name': location.get('geonames_name'),
            'feature_code': location.get('feature_code'),
            'note': (
                'Coordinates removed due to Google Maps false match. '
                f'Original coordinates were from "{gmaps_name}".'
            ),
            # Original lat/lon preserved here for auditability.
            'coordinate_provenance_removed': {
                'reason': 'FALSE_MATCH',
                'original_latitude': location.get('latitude'),
                'original_longitude': location.get('longitude'),
            },
            'normalization_timestamp': '2025-01-08T00:00:00Z'
        }
    # Add provenance correction (creating the containers if absent).
    if 'provenance' not in data:
        data['provenance'] = {}
    if 'corrections' not in data['provenance']:
        data['provenance']['corrections'] = []
    data['provenance']['corrections'].append({
        'correction_date': '2025-01-08T00:00:00Z',
        'correction_type': 'google_maps_false_match',
        'description': (
            f'Marked Google Maps enrichment as FALSE_MATCH. '
            f'GMaps returned "{gmaps_name}" ({gmaps_domain}) instead of '
            f'"{custodian_name}" ({official_domain}).'
        ),
        'corrected_by': 'opencode-claude-sonnet-4'
    })
    # Only write back when not a dry run; the result dict reports 'fixed'
    # either way so dry runs show what *would* change.
    if not dry_run:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100)
    result['action'] = 'fixed'
    result['gmaps_name'] = gmaps_name
    result['gmaps_domain'] = gmaps_domain
    result['official_domain'] = official_domain
    return result
def main():
    """Command-line entry point.

    Processes either the files named on the command line or every Type I
    custodian YAML file in the custodian directory, marking Google Maps
    enrichments as FALSE_MATCH wherever the Google Maps domain disagrees
    with the official website domain. Prints a per-file report and a
    fixed/skipped summary.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix Google Maps false matches')
    parser.add_argument('files', nargs='*',
                        help='Specific files to fix (or all Type I if none)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be fixed without changing files')
    # Previously hard-coded to a developer-machine path; kept as the default
    # for backward compatibility, but now overridable so the script runs on
    # other machines / CI checkouts.
    parser.add_argument('--custodian-dir',
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory containing custodian YAML files')
    args = parser.parse_args()
    custodian_dir = Path(args.custodian_dir)
    if args.files:
        # Relative names are resolved against the custodian dir; absolute
        # paths pass through unchanged (pathlib "/" semantics).
        files = [custodian_dir / f for f in args.files]
    else:
        # Type I custodian files follow the "*-I-*.yaml" naming convention.
        files = list(custodian_dir.glob("*-I-*.yaml"))
    prefix = '[DRY RUN] ' if args.dry_run else ''
    print(f"{prefix}Processing {len(files)} files...")
    print()
    fixed = []
    skipped = []
    for file_path in sorted(files):
        if not file_path.exists():
            print(f"⚠️ {file_path.name}: File not found")
            continue
        try:
            result = fix_gmaps_false_match(file_path, dry_run=args.dry_run)
            if result['action'] == 'fixed':
                fixed.append(result)
                print(f"{result['file']}")
                print(f" GMaps: {result['gmaps_name']} ({result['gmaps_domain']})")
                print(f" Official: {result['official_domain']}")
            else:
                skipped.append(result)
                # Skip reasons are only interesting when previewing.
                if args.dry_run:
                    print(f"⏭️ {result['file']}: {result['reason']}")
        except Exception as e:
            # Report and continue: one malformed YAML file should not
            # abort the whole batch.
            print(f"{file_path.name}: {e}")
    print()
    print(f"{prefix}Summary:")
    print(f" Fixed: {len(fixed)}")
    print(f" Skipped: {len(skipped)}")


if __name__ == '__main__':
    main()