Fix additional Type I custodian files with obvious name mismatches between KIEN registry entries and Google Maps results. These could not be auto-detected via domain mismatch because the custodians lack official websites.

Fixes:
- Dick Timmerman (person) → carpentry business
- Ria Bos (cigar maker) → money transfer agent
- Stichting Kracom (Krampuslauf) → Happy Caps retail
- Fed. Nederlandse Vertelorganisaties → NET Foundation
- Stichting dodenherdenking Alphen → wrong memorial
- Sao Joao Rotterdam → Heemraadsplein (a location, not an organization)
- sport en spel (heritage) → equipment rental
- Eiertikken Ommen → restaurant

Also adds detection and fix scripts for Google Maps false matches.
154 lines
6.9 KiB
Python
154 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix false matches in NL-GE-TIE-I-M.yaml (Stichting MOZA)
|
|
|
|
Issues to fix:
|
|
1. Google Maps FALSE MATCH: MOZA Makelaardij (real estate) vs Stichting MOZA (heritage)
|
|
2. YouTube FALSE MATCH: Wolfgang Amadeus Mozart - Topic vs Stichting MOZA
|
|
3. Location coordinates: Wrong Wapenveld coords from Google Maps
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Custom representer to handle multi-line strings nicely
|
|
def str_representer(dumper, data):
    """Represent a ``str`` as a YAML scalar, folding it when it spans lines.

    Args:
        dumper: Object exposing ``represent_scalar(tag, value, style=...)``.
        data: The string being serialised.

    Returns:
        Whatever ``dumper.represent_scalar`` returns for the string.
    """
    str_tag = 'tag:yaml.org,2002:str'
    # Single-line strings keep the dumper's default scalar style.
    if '\n' not in data:
        return dumper.represent_scalar(str_tag, data)
    # Anything containing a newline is emitted in folded ('>') block style.
    return dumper.represent_scalar(str_tag, data, style='>')
|
|
|
|
# Register the representer so every ``str`` dumped through PyYAML's default
# Dumper (the one ``yaml.dump`` uses) goes through ``str_representer``.
yaml.add_representer(str, str_representer)
|
|
|
|
# Default target file and the metadata stamped on every correction record.
_DEFAULT_MOZA_FILE = Path("/Users/kempersc/apps/glam/data/custodian/NL-GE-TIE-I-M.yaml")
_CORRECTION_TIMESTAMP = '2025-01-08T00:00:00Z'
_CORRECTION_AGENT = 'opencode-claude-sonnet-4'

# Fields of the original YouTube record that are preserved for the audit trail.
_YOUTUBE_METADATA_KEYS = (
    'fetch_timestamp',
    'api_endpoint',
    'api_version',
    'channel_id',
    'channel_url',
    'title',
    'subscriber_count',
    'video_count',
    'view_count',
    'status',
)


def _is_marked_false_match(section):
    """Return True when *section* is a FALSE_MATCH record written by an earlier run."""
    return (
        isinstance(section, dict)
        and section.get('status') == 'FALSE_MATCH'
        and 'original_false_match' in section
    )


def _mark_google_maps_false_match(data):
    """Replace the Google Maps enrichment with a FALSE_MATCH record; return True if changed."""
    original_gmaps = data.get('google_maps_enrichment', {})
    if _is_marked_false_match(original_gmaps):
        # Re-wrapping would nest the previous correction inside itself.
        return False

    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            'Google Maps returned "MOZA Makelaardij" (real estate agency at moza.nl in Wapenveld) '
            'instead of "Stichting MOZA" (Molukse heritage foundation at moza.nu in Tiel). '
            'These are completely different organizations - one is a real estate agency, '
            'the other is an intangible heritage custodian for Moluccan neighborhood culture.'
        ),
        'original_false_match': original_gmaps,
        'correction_timestamp': _CORRECTION_TIMESTAMP,
        'correction_agent': _CORRECTION_AGENT,
    }
    return True


def _mark_youtube_false_match(data):
    """Replace the YouTube enrichment with a FALSE_MATCH record; return True if changed."""
    original_youtube = data.get('youtube_enrichment') or {}
    if _is_marked_false_match(original_youtube):
        # A second pass would overwrite the preserved channel metadata with None values.
        return False

    # The video list itself is dropped; only its size is recorded in the note.
    videos_removed = original_youtube.get('videos') or []

    data['youtube_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            'YouTube search returned "Wolfgang Amadeus Mozart - Topic" (classical music auto-generated '
            'channel with 190K subscribers, 5709 videos of Mozart compositions) instead of a channel '
            'for "Stichting MOZA" (Molukse heritage foundation). The search matched "MOZA" to "Mozart" '
            'due to name similarity. No YouTube channel found for Stichting MOZA.'
        ),
        'original_false_match': {
            key: original_youtube.get(key) for key in _YOUTUBE_METADATA_KEYS
        },
        'videos_removed_note': (
            f'Removed {len(videos_removed)} Mozart classical music videos from original data as they have '
            'no relevance to Stichting MOZA. Videos were from YouTube auto-generated Mozart Topic channel.'
        ),
        'correction_timestamp': _CORRECTION_TIMESTAMP,
        'correction_agent': _CORRECTION_AGENT,
    }
    return True


def _strip_false_coordinates(data):
    """Replace ``location`` with the Tiel record, keeping the old coordinates for audit.

    Returns True if the location was rewritten, False when there is no
    ``location`` key or it was already rewritten by an earlier run.
    """
    if 'location' not in data:
        return False

    original_location = data['location'] or {}
    if 'coordinate_provenance_removed' in original_location:
        # Already corrected; rewriting would replace the saved coordinates with None.
        return False

    coordinate_provenance = original_location.get('coordinate_provenance') or {}
    data['location'] = {
        'city': 'Tiel',
        'region_code': 'GE',
        'country': 'NL',
        'geonames_id': 2746331,
        'geonames_name': 'Tiel',
        'feature_code': 'PPL',
        'note': (
            'Coordinates removed due to Google Maps false match. '
            'Location derived from KIEN registry (Tiel). '
            'Original false coordinates were from MOZA Makelaardij in Wapenveld.'
        ),
        'coordinate_provenance_removed': {
            'reason': 'FALSE_MATCH - coordinates were from wrong organization',
            'original_latitude': original_location.get('latitude'),
            'original_longitude': original_location.get('longitude'),
            'original_source': coordinate_provenance.get('source_type'),
            'original_entity_id': coordinate_provenance.get('entity_id'),
        },
        'normalization_timestamp': _CORRECTION_TIMESTAMP,
    }
    return True


def _record_correction(data, correction_type, description):
    """Append one entry to ``data['provenance']['corrections']``, creating it if needed."""
    provenance = data.setdefault('provenance', {})
    provenance.setdefault('corrections', []).append({
        'correction_date': _CORRECTION_TIMESTAMP,
        'correction_type': correction_type,
        'description': description,
        'corrected_by': _CORRECTION_AGENT,
    })


def fix_moza_file(file_path=None):
    """Correct the false Google Maps / YouTube matches in the Stichting MOZA file.

    The YAML file is loaded, the Google Maps and YouTube enrichments are
    replaced by FALSE_MATCH records, the ``location`` section is replaced by
    the Tiel record, a provenance entry is appended for every change actually
    made, and the file is written back in place.

    The function is safe to re-run: sections that already carry a correction
    from an earlier run are left untouched, and the file is not rewritten when
    nothing changed.

    Args:
        file_path: Path of the custodian YAML file. Defaults to the
            NL-GE-TIE-I-M.yaml file this script was written for.

    Raises:
        ValueError: If the file does not contain a YAML mapping.
        OSError: If the file cannot be read or written.
    """
    file_path = _DEFAULT_MOZA_FILE if file_path is None else Path(file_path)

    with open(file_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    if not isinstance(data, dict):
        raise ValueError(f"{file_path} does not contain a YAML mapping")

    # Apply all section fixes first so any newly created keys keep a stable order,
    # then record provenance only for the fixes that really happened.
    gmaps_fixed = _mark_google_maps_false_match(data)
    youtube_fixed = _mark_youtube_false_match(data)
    location_fixed = _strip_false_coordinates(data)

    if not (gmaps_fixed or youtube_fixed or location_fixed):
        print(f"No changes needed for {file_path} (corrections already applied)")
        return

    if gmaps_fixed:
        _record_correction(
            data,
            'google_maps_false_match',
            'Marked Google Maps enrichment as FALSE_MATCH. Google Maps returned "MOZA Makelaardij" '
            '(real estate agency at moza.nl) instead of "Stichting MOZA" (heritage foundation at moza.nu). '
            'Per Rule 40: KIEN is authoritative for Type I custodians.',
        )
    if youtube_fixed:
        _record_correction(
            data,
            'youtube_false_match',
            'Marked YouTube enrichment as FALSE_MATCH. YouTube search returned "Wolfgang Amadeus Mozart - Topic" '
            'channel instead of Stichting MOZA. This is a name similarity false match (MOZA → Mozart).',
        )
    if location_fixed:
        _record_correction(
            data,
            'location_coordinates_removed',
            'Removed incorrect coordinates from location section. Coordinates were from Google Maps '
            'false match (MOZA Makelaardij in Wapenveld). Correct location is Tiel per KIEN registry.',
        )

    # Serialise before opening the file for writing: a dump failure must not
    # leave a truncated data file behind.
    serialised = yaml.dump(
        data,
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=100,
    )
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(serialised)

    print(f"Fixed {file_path}")
    if gmaps_fixed:
        print("- Marked Google Maps as FALSE_MATCH")
    if youtube_fixed:
        print("- Marked YouTube as FALSE_MATCH (removed Mozart videos)")
    if location_fixed:
        print("- Fixed location coordinates")
    print("- Added provenance corrections")
|
|
|
|
# Run the correction only when executed as a script, not when imported.
if __name__ == "__main__":
    fix_moza_file()
|