#!/usr/bin/env python3
"""Fix Google Maps false matches in Type I custodian files.

This script marks Google Maps enrichment as FALSE_MATCH when the domain
from Google Maps doesn't match the official domain from the KIEN registry.

Per Rule 40: KIEN is TIER_1_AUTHORITATIVE for Type I custodians.
"""

import argparse
import sys
from pathlib import Path
from urllib.parse import urlparse

# Fixed provenance metadata stamped onto every correction this script writes.
CORRECTION_TIMESTAMP = '2025-01-08T00:00:00Z'
CORRECTION_AGENT = 'opencode-claude-sonnet-4'
# Default location of the custodian YAML files (overridable via --dir).
DEFAULT_CUSTODIAN_DIR = "/Users/kempersc/apps/glam/data/custodian"


def extract_domain(url: str) -> str:
    """Extract a normalized domain from ``url``.

    Handles None and empty strings, lowercases the result, strips a
    *leading* "www." prefix, and drops any path component so scheme-less
    URLs like "example.com/about" yield just the host.

    Returns "" when no domain can be determined.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Scheme-less URLs land in .path; keep only the host portion.
        domain = (parsed.netloc or parsed.path.split("/")[0]).lower()
    except (ValueError, AttributeError):
        # urlparse only raises for pathological input; treat as "no domain".
        return ""
    # Strip only a leading "www." -- a plain replace() would also mangle
    # domains that merely contain the substring (e.g. "mywww.site.com").
    if domain.startswith("www."):
        domain = domain[4:]
    return domain


def fix_gmaps_false_match(file_path: Path, dry_run: bool = False) -> dict:
    """Fix a single file's Google Maps enrichment if it's a false match.

    Compares the Google Maps website domain with the official domain
    (contact website, falling back to the first WEBSITE digital platform).
    On mismatch: replaces the enrichment with a FALSE_MATCH record,
    removes Google-Maps-sourced coordinates (keeping city-level info and
    the original lat/lon for audit), and appends a provenance correction.

    Returns a dict with 'file', 'action' ('fixed'/'skipped') and details.
    """
    # Imported here so extract_domain stays importable without PyYAML.
    import yaml

    with open(file_path, 'r', encoding='utf-8') as f:
        # An empty YAML file parses to None; normalize to an empty mapping.
        data = yaml.safe_load(f) or {}

    result = {'file': file_path.name, 'action': 'skipped', 'reason': None}

    # `or {}` also guards explicit YAML nulls (key present, value null),
    # where dict.get(key, {}) would return None and crash on .get below.
    gmaps = data.get('google_maps_enrichment') or {}
    if gmaps.get('status') == 'FALSE_MATCH':
        result['reason'] = 'Already marked as FALSE_MATCH'
        return result

    gmaps_website = gmaps.get('website', '')

    # Prefer the contact website; fall back to the first WEBSITE platform.
    official_website = (data.get('contact') or {}).get('website', '')
    if not official_website:
        for platform in data.get('digital_platforms') or []:
            if platform.get('platform_type') == 'WEBSITE':
                official_website = platform.get('platform_url', '')
                break

    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)

    if not gmaps_domain or not official_domain:
        result['reason'] = 'Missing domain info'
        return result

    if gmaps_domain == official_domain:
        result['reason'] = 'Domains match'
        return result

    # Domains disagree: this is a false match.
    gmaps_name = gmaps.get('name', 'Unknown')
    name_claim = data.get('custodian_name') or {}
    # custodian_name is normally a claim dict; tolerate a bare string too.
    if isinstance(name_claim, dict):
        custodian_name = name_claim.get('claim_value', 'Unknown')
    else:
        custodian_name = str(name_claim)

    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            f'Google Maps returned "{gmaps_name}" (website: {gmaps_website}) '
            f'instead of "{custodian_name}" (official website: {official_website}). '
            f'Domain mismatch: {gmaps_domain} vs {official_domain}. '
            'Per Rule 40: KIEN is authoritative for Type I intangible heritage custodians.'
        ),
        'original_false_match': gmaps,
        'correction_timestamp': CORRECTION_TIMESTAMP,
        'correction_agent': CORRECTION_AGENT,
    }

    # Coordinates sourced from the false Google Maps hit are unreliable:
    # drop lat/lon but keep city-level info and record the originals.
    location = data.get('location') or {}
    coord_prov = location.get('coordinate_provenance') or {}
    if coord_prov.get('source_type') == 'GOOGLE_MAPS':
        data['location'] = {
            'city': location.get('city'),
            'region_code': location.get('region_code'),
            'country': location.get('country', 'NL'),
            'geonames_id': location.get('geonames_id'),
            'geonames_name': location.get('geonames_name'),
            'feature_code': location.get('feature_code'),
            'note': (
                'Coordinates removed due to Google Maps false match. '
                f'Original coordinates were from "{gmaps_name}".'
            ),
            'coordinate_provenance_removed': {
                'reason': 'FALSE_MATCH',
                'original_latitude': location.get('latitude'),
                'original_longitude': location.get('longitude'),
            },
            'normalization_timestamp': CORRECTION_TIMESTAMP,
        }

    # Append an audit-trail entry, creating (or repairing null/odd-typed)
    # containers as needed.
    provenance = data.get('provenance')
    if not isinstance(provenance, dict):
        provenance = {}
        data['provenance'] = provenance
    corrections = provenance.get('corrections')
    if not isinstance(corrections, list):
        corrections = []
        provenance['corrections'] = corrections
    corrections.append({
        'correction_date': CORRECTION_TIMESTAMP,
        'correction_type': 'google_maps_false_match',
        'description': (
            f'Marked Google Maps enrichment as FALSE_MATCH. '
            f'GMaps returned "{gmaps_name}" ({gmaps_domain}) instead of '
            f'"{custodian_name}" ({official_domain}).'
        ),
        'corrected_by': CORRECTION_AGENT,
    })

    if not dry_run:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False,
                      allow_unicode=True, sort_keys=False, width=100)

    result['action'] = 'fixed'
    result['gmaps_name'] = gmaps_name
    result['gmaps_domain'] = gmaps_domain
    result['official_domain'] = official_domain
    return result


def main():
    """CLI entry point: fix the named files, or every Type I file in --dir."""
    parser = argparse.ArgumentParser(description='Fix Google Maps false matches')
    parser.add_argument('files', nargs='*',
                        help='Specific files to fix (or all Type I if none)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be fixed without changing files')
    parser.add_argument('--dir', default=DEFAULT_CUSTODIAN_DIR,
                        help='Directory containing custodian YAML files')
    args = parser.parse_args()

    custodian_dir = Path(args.dir)
    if args.files:
        files = [custodian_dir / f for f in args.files]
    else:
        # Type I custodian files follow the *-I-*.yaml naming pattern.
        files = list(custodian_dir.glob("*-I-*.yaml"))

    prefix = '[DRY RUN] ' if args.dry_run else ''
    print(f"{prefix}Processing {len(files)} files...")
    print()

    fixed = []
    skipped = []
    for file_path in sorted(files):
        if not file_path.exists():
            print(f"⚠️ {file_path.name}: File not found")
            continue
        try:
            result = fix_gmaps_false_match(file_path, dry_run=args.dry_run)
        except Exception as e:
            # Best-effort batch run: report the broken file and keep going.
            print(f"❌ {file_path.name}: {e}")
            continue
        if result['action'] == 'fixed':
            fixed.append(result)
            print(f"✅ {result['file']}")
            print(f"   GMaps: {result['gmaps_name']} ({result['gmaps_domain']})")
            print(f"   Official: {result['official_domain']}")
        else:
            skipped.append(result)
            if args.dry_run:
                print(f"⏭️ {result['file']}: {result['reason']}")

    print()
    print(f"{prefix}Summary:")
    print(f"   Fixed: {len(fixed)}")
    print(f"   Skipped: {len(skipped)}")


if __name__ == '__main__':
    main()