glam/scripts/fix_moza_false_matches.py
kempersc 85d9cee82f fix: mark 8 more Google Maps false matches detected via name mismatch
Additional Type I custodian files with obvious name mismatches between
KIEN registry entries and Google Maps results. These couldn't be
auto-detected via domain mismatch because they lack official websites.

Fixes:
- Dick Timmerman (person) → carpentry business
- Ria Bos (cigar maker) → money transfer agent
- Stichting Kracom (Krampuslauf) → Happy Caps retail
- Fed. Nederlandse Vertelorganisaties → NET Foundation
- Stichting dodenherdenking Alphen → wrong memorial
- Sao Joao Rotterdam → Heemraadsplein (location not org)
- sport en spel (heritage) → equipment rental
- Eiertikken Ommen → restaurant

Also adds detection and fix scripts for Google Maps false matches.
2026-01-08 13:26:53 +01:00

154 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
Fix false matches in NL-GE-TIE-I-M.yaml (Stichting MOZA)
Issues to fix:
1. Google Maps FALSE MATCH: MOZA Makelaardij (real estate) vs Stichting MOZA (heritage)
2. YouTube FALSE MATCH: Wolfgang Amadeus Mozart - Topic vs Stichting MOZA
3. Location coordinates: Wrong Wapenveld coords from Google Maps
"""
import yaml
from pathlib import Path
from datetime import datetime
# Custom representer to handle multi-line strings nicely
def str_representer(dumper, data):
    """Represent a ``str`` as a YAML string scalar.

    Strings that contain a newline are emitted in folded style (``>``);
    every other string is left to the dumper's default scalar style.

    Args:
        dumper: The dumper object; only its ``represent_scalar`` is used.
        data: The string value being serialised.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    # Only pass a style when the text spans several lines.
    scalar_options = {'style': '>'} if '\n' in data else {}
    return dumper.represent_scalar('tag:yaml.org,2002:str', data, **scalar_options)
# Register the representer for `str` so the yaml.dump call in this script
# emits multi-line strings in folded style.
yaml.add_representer(str, str_representer)
# Default target: the Stichting MOZA custodian record this script was written for.
_DEFAULT_MOZA_FILE = Path("/Users/kempersc/apps/glam/data/custodian/NL-GE-TIE-I-M.yaml")

# NOTE(review): hard-coded, not read from the clock. The commit that introduced
# this script is dated 2026-01-08, so the year here may be a typo - confirm
# before reusing this value elsewhere.
_CORRECTION_TIMESTAMP = '2025-01-08T00:00:00Z'
_CORRECTION_AGENT = 'opencode-claude-sonnet-4'

# Written by the enrichment fixes below; its presence means the section was
# already corrected and must not be wrapped a second time.
_ENRICHMENT_FIXED_MARKER = 'original_false_match'
# Written by the location fix below; same purpose for the location section.
_LOCATION_FIXED_MARKER = 'coordinate_provenance_removed'


def _mark_google_maps_false_match(data):
    """Replace the Google Maps enrichment with a FALSE_MATCH record; return True if changed."""
    original_gmaps = data.get('google_maps_enrichment') or {}
    if _ENRICHMENT_FIXED_MARKER in original_gmaps:
        return False
    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            'Google Maps returned "MOZA Makelaardij" (real estate agency at moza.nl in Wapenveld) '
            'instead of "Stichting MOZA" (Molukse heritage foundation at moza.nu in Tiel). '
            'These are completely different organizations - one is a real estate agency, '
            'the other is an intangible heritage custodian for Moluccan neighborhood culture.'
        ),
        'original_false_match': original_gmaps,
        'correction_timestamp': _CORRECTION_TIMESTAMP,
        'correction_agent': _CORRECTION_AGENT,
    }
    return True


def _mark_youtube_false_match(data):
    """Replace the YouTube enrichment with a FALSE_MATCH record; return True if changed."""
    original_youtube = data.get('youtube_enrichment') or {}
    if _ENRICHMENT_FIXED_MARKER in original_youtube:
        return False
    # The video list is dropped; only its length is recorded in the note.
    videos_removed = original_youtube.get('videos') or []
    kept_fields = (
        'fetch_timestamp',
        'api_endpoint',
        'api_version',
        'channel_id',
        'channel_url',
        'title',
        'subscriber_count',
        'video_count',
        'view_count',
        'status',
    )
    data['youtube_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            'YouTube search returned "Wolfgang Amadeus Mozart - Topic" (classical music auto-generated '
            'channel with 190K subscribers, 5709 videos of Mozart compositions) instead of a channel '
            'for "Stichting MOZA" (Molukse heritage foundation). The search matched "MOZA" to "Mozart" '
            'due to name similarity. No YouTube channel found for Stichting MOZA.'
        ),
        'original_false_match': {field: original_youtube.get(field) for field in kept_fields},
        'videos_removed_note': (
            f'Removed {len(videos_removed)} Mozart classical music videos from original data as they have '
            'no relevance to Stichting MOZA. Videos were from YouTube auto-generated Mozart Topic channel.'
        ),
        'correction_timestamp': _CORRECTION_TIMESTAMP,
        'correction_agent': _CORRECTION_AGENT,
    }
    return True


def _strip_false_coordinates(data):
    """Replace ``location`` with the Tiel record, keeping the old coordinates as an audit trail.

    Returns True if the section was rewritten, False when there is no
    ``location`` section or it was already rewritten by an earlier run.
    """
    if 'location' not in data:
        return False
    original_location = data['location'] or {}
    if _LOCATION_FIXED_MARKER in original_location:
        # A second rewrite would replace the preserved coordinates with None.
        return False
    coordinate_provenance = original_location.get('coordinate_provenance') or {}
    data['location'] = {
        'city': 'Tiel',
        'region_code': 'GE',
        'country': 'NL',
        'geonames_id': 2746331,
        'geonames_name': 'Tiel',
        'feature_code': 'PPL',
        'note': (
            'Coordinates removed due to Google Maps false match. '
            'Location derived from KIEN registry (Tiel). '
            'Original false coordinates were from MOZA Makelaardij in Wapenveld.'
        ),
        'coordinate_provenance_removed': {
            'reason': 'FALSE_MATCH - coordinates were from wrong organization',
            'original_latitude': original_location.get('latitude'),
            'original_longitude': original_location.get('longitude'),
            'original_source': coordinate_provenance.get('source_type'),
            'original_entity_id': coordinate_provenance.get('entity_id'),
        },
        'normalization_timestamp': _CORRECTION_TIMESTAMP,
    }
    return True


def _correction_entry(correction_type, description):
    """Build one ``provenance.corrections`` entry with the shared date and agent."""
    return {
        'correction_date': _CORRECTION_TIMESTAMP,
        'correction_type': correction_type,
        'description': description,
        'corrected_by': _CORRECTION_AGENT,
    }


def _append_corrections(data, entries):
    """Append ``entries`` to ``data['provenance']['corrections']``, creating both as needed."""
    provenance = data.get('provenance')
    if provenance is None:
        provenance = data['provenance'] = {}
    corrections = provenance.get('corrections')
    if corrections is None:
        corrections = provenance['corrections'] = []
    corrections.extend(entries)


def fix_moza_file(file_path=None):
    """Mark the false enrichment matches in the Stichting MOZA custodian file.

    Three sections of the YAML record are corrected:

    1. ``google_maps_enrichment`` is replaced by a FALSE_MATCH record that
       keeps the original data under ``original_false_match``.
    2. ``youtube_enrichment`` is replaced by a FALSE_MATCH record that keeps
       the channel metadata but drops the video list.
    3. ``location`` (when present) is replaced by the Tiel record, with the
       removed coordinates preserved under ``coordinate_provenance_removed``.

    A ``provenance.corrections`` entry is appended for every section that was
    actually changed. Sections already carrying a correction marker are left
    alone, so running the script again neither nests the records nor
    duplicates provenance entries; if nothing changes the file is not
    rewritten.

    Args:
        file_path: Path of the YAML file to fix. Defaults to the MOZA
            custodian file at its original hard-coded location.

    Raises:
        ValueError: If the file does not contain a YAML mapping.
    """
    target = _DEFAULT_MOZA_FILE if file_path is None else Path(file_path)

    with open(target, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # safe_load returns None for an empty file and other types for non-mapping documents.
        raise ValueError(f"{target} does not contain a YAML mapping; nothing to fix")

    # Apply the fixes in the original order so new keys land in the same place.
    gmaps_fixed = _mark_google_maps_false_match(data)
    youtube_fixed = _mark_youtube_false_match(data)
    location_fixed = _strip_false_coordinates(data)

    corrections = []
    if gmaps_fixed:
        corrections.append(_correction_entry(
            'google_maps_false_match',
            'Marked Google Maps enrichment as FALSE_MATCH. Google Maps returned "MOZA Makelaardij" '
            '(real estate agency at moza.nl) instead of "Stichting MOZA" (heritage foundation at moza.nu). '
            'Per Rule 40: KIEN is authoritative for Type I custodians.',
        ))
    if youtube_fixed:
        corrections.append(_correction_entry(
            'youtube_false_match',
            'Marked YouTube enrichment as FALSE_MATCH. YouTube search returned "Wolfgang Amadeus Mozart - Topic" '
            'channel instead of Stichting MOZA. This is a name similarity false match (MOZA → Mozart).',
        ))
    if location_fixed:
        corrections.append(_correction_entry(
            'location_coordinates_removed',
            'Removed incorrect coordinates from location section. Coordinates were from Google Maps '
            'false match (MOZA Makelaardij in Wapenveld). Correct location is Tiel per KIEN registry.',
        ))

    if not corrections:
        print(f"No changes needed for {target} (already corrected)")
        return

    _append_corrections(data, corrections)

    # Serialise fully before touching the file: opening with 'w' first would
    # leave it truncated if the dump raised part-way through.
    rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100)
    target.write_text(rendered, encoding='utf-8')

    print(f"Fixed {target}")
    if gmaps_fixed:
        print("- Marked Google Maps as FALSE_MATCH")
    if youtube_fixed:
        print("- Marked YouTube as FALSE_MATCH (removed Mozart videos)")
    if location_fixed:
        print("- Fixed location coordinates")
    print("- Added provenance corrections")
# Apply the corrections only when executed as a script, not when imported.
if __name__ == '__main__':
    fix_moza_file()