#!/usr/bin/env python3 """ Fix false matches in NL-GE-TIE-I-M.yaml (Stichting MOZA) Issues to fix: 1. Google Maps FALSE MATCH: MOZA Makelaardij (real estate) vs Stichting MOZA (heritage) 2. YouTube FALSE MATCH: Wolfgang Amadeus Mozart - Topic vs Stichting MOZA 3. Location coordinates: Wrong Wapenveld coords from Google Maps """ import yaml from pathlib import Path from datetime import datetime # Custom representer to handle multi-line strings nicely def str_representer(dumper, data): if '\n' in data: return dumper.represent_scalar('tag:yaml.org,2002:str', data, style='>') return dumper.represent_scalar('tag:yaml.org,2002:str', data) yaml.add_representer(str, str_representer) def fix_moza_file(): file_path = Path("/Users/kempersc/apps/glam/data/custodian/NL-GE-TIE-I-M.yaml") with open(file_path, 'r', encoding='utf-8') as f: data = yaml.safe_load(f) # 1. Fix Google Maps enrichment - mark as FALSE_MATCH original_gmaps = data.get('google_maps_enrichment', {}) data['google_maps_enrichment'] = { 'status': 'FALSE_MATCH', 'false_match_reason': ( 'Google Maps returned "MOZA Makelaardij" (real estate agency at moza.nl in Wapenveld) ' 'instead of "Stichting MOZA" (Molukse heritage foundation at moza.nu in Tiel). ' 'These are completely different organizations - one is a real estate agency, ' 'the other is an intangible heritage custodian for Moluccan neighborhood culture.' ), 'original_false_match': original_gmaps, 'correction_timestamp': '2025-01-08T00:00:00Z', 'correction_agent': 'opencode-claude-sonnet-4' } # 2. Fix YouTube enrichment - mark as FALSE_MATCH original_youtube = data.get('youtube_enrichment', {}) # Remove the massive videos list from original - just keep metadata videos_removed = [] if 'videos' in original_youtube: videos_removed = original_youtube.pop('videos') data['youtube_enrichment'] = { 'status': 'FALSE_MATCH', 'false_match_reason': ( 'YouTube search returned "Wolfgang Amadeus Mozart - Topic" (classical music auto-generated ' 'channel with 190K subscribers, 5709 videos of Mozart compositions) instead of a channel ' 'for "Stichting MOZA" (Molukse heritage foundation). The search matched "MOZA" to "Mozart" ' 'due to name similarity. No YouTube channel found for Stichting MOZA.' ), 'original_false_match': { 'fetch_timestamp': original_youtube.get('fetch_timestamp'), 'api_endpoint': original_youtube.get('api_endpoint'), 'api_version': original_youtube.get('api_version'), 'channel_id': original_youtube.get('channel_id'), 'channel_url': original_youtube.get('channel_url'), 'title': original_youtube.get('title'), 'subscriber_count': original_youtube.get('subscriber_count'), 'video_count': original_youtube.get('video_count'), 'view_count': original_youtube.get('view_count'), 'status': original_youtube.get('status'), }, 'videos_removed_note': ( f'Removed {len(videos_removed)} Mozart classical music videos from original data as they have ' 'no relevance to Stichting MOZA. Videos were from YouTube auto-generated Mozart Topic channel.' ), 'correction_timestamp': '2025-01-08T00:00:00Z', 'correction_agent': 'opencode-claude-sonnet-4' } # 3. Fix location - remove wrong Google Maps coordinates, use KIEN data # The correct location is in 'locations' array (from KIEN), but 'location' has wrong coords if 'location' in data: original_location = data['location'] # Keep only the correct parts, remove Google Maps coords data['location'] = { 'city': 'Tiel', 'region_code': 'GE', 'country': 'NL', 'geonames_id': 2746331, 'geonames_name': 'Tiel', 'feature_code': 'PPL', 'note': ( 'Coordinates removed due to Google Maps false match. ' 'Location derived from KIEN registry (Tiel). ' 'Original false coordinates were from MOZA Makelaardij in Wapenveld.' ), 'coordinate_provenance_removed': { 'reason': 'FALSE_MATCH - coordinates were from wrong organization', 'original_latitude': original_location.get('latitude'), 'original_longitude': original_location.get('longitude'), 'original_source': original_location.get('coordinate_provenance', {}).get('source_type'), 'original_entity_id': original_location.get('coordinate_provenance', {}).get('entity_id'), }, 'normalization_timestamp': '2025-01-08T00:00:00Z' } # 4. Add provenance note about corrections if 'provenance' not in data: data['provenance'] = {} if 'corrections' not in data['provenance']: data['provenance']['corrections'] = [] data['provenance']['corrections'].append({ 'correction_date': '2025-01-08T00:00:00Z', 'correction_type': 'google_maps_false_match', 'description': ( 'Marked Google Maps enrichment as FALSE_MATCH. Google Maps returned "MOZA Makelaardij" ' '(real estate agency at moza.nl) instead of "Stichting MOZA" (heritage foundation at moza.nu). ' 'Per Rule 40: KIEN is authoritative for Type I custodians.' ), 'corrected_by': 'opencode-claude-sonnet-4' }) data['provenance']['corrections'].append({ 'correction_date': '2025-01-08T00:00:00Z', 'correction_type': 'youtube_false_match', 'description': ( 'Marked YouTube enrichment as FALSE_MATCH. YouTube search returned "Wolfgang Amadeus Mozart - Topic" ' 'channel instead of Stichting MOZA. This is a name similarity false match (MOZA → Mozart).' ), 'corrected_by': 'opencode-claude-sonnet-4' }) data['provenance']['corrections'].append({ 'correction_date': '2025-01-08T00:00:00Z', 'correction_type': 'location_coordinates_removed', 'description': ( 'Removed incorrect coordinates from location section. Coordinates were from Google Maps ' 'false match (MOZA Makelaardij in Wapenveld). Correct location is Tiel per KIEN registry.' ), 'corrected_by': 'opencode-claude-sonnet-4' }) # Write back with open(file_path, 'w', encoding='utf-8') as f: yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False, width=100) print(f"Fixed {file_path}") print("- Marked Google Maps as FALSE_MATCH") print("- Marked YouTube as FALSE_MATCH (removed Mozart videos)") print("- Fixed location coordinates") print("- Added provenance corrections") if __name__ == '__main__': fix_moza_file()