Additional Type I custodian files with obvious name mismatches between KIEN registry entries and Google Maps results. These couldn't be auto-detected via domain mismatch because they lack official websites. Fixes: - Dick Timmerman (person) → carpentry business - Ria Bos (cigar maker) → money transfer agent - Stichting Kracom (Krampuslauf) → Happy Caps retail - Fed. Nederlandse Vertelorganisaties → NET Foundation - Stichting dodenherdenking Alphen → wrong memorial - Sao Joao Rotterdam → Heemraadsplein (location not org) - sport en spel (heritage) → equipment rental - Eiertikken Ommen → restaurant Also adds detection and fix scripts for Google Maps false matches.
195 lines
6.7 KiB
Python
195 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fix Google Maps false matches in Type I custodian files.
|
|
|
|
This script marks Google Maps enrichment as FALSE_MATCH when the domain
|
|
from Google Maps doesn't match the official domain from KIEN registry.
|
|
|
|
Per Rule 40: KIEN is TIER_1_AUTHORITATIVE for Type I custodians.
|
|
"""
|
|
|
|
import yaml
|
|
import sys
|
|
from pathlib import Path
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
def extract_domain(url: str) -> str:
    """Extract a normalized, comparable domain from *url*.

    Handles None and empty strings by returning "". Accepts both full URLs
    ("https://www.foo.nl/x") and bare domains ("foo.nl"): urlparse puts a
    scheme-less value in ``.path`` rather than ``.netloc``, so we fall back
    to that. The result is lowercased and a leading "www." prefix removed so
    domains can be compared directly.

    Returns:
        The normalized domain, or "" when no domain can be extracted.
    """
    if not url:
        return ""
    try:
        parsed = urlparse(url)
        # Bare domains like "foo.nl" have an empty netloc; they land in .path.
        domain = (parsed.netloc or parsed.path).lower()
        # Strip only a *leading* "www." — the previous replace("www.", "")
        # removed the substring anywhere, mangling domains such as
        # "mywww.site.nl" into "mysite.nl" and skewing the comparison.
        if domain.startswith("www."):
            domain = domain[4:]
        return domain
    except Exception:
        # urlparse rarely raises, but stay defensive: treat unparsable
        # input the same as missing input.
        return ""
|
|
|
|
|
|
def fix_gmaps_false_match(file_path: Path, dry_run: bool = False) -> dict:
    """
    Fix a single file's Google Maps enrichment if it's a false match.

    A false match is detected by comparing the domain of the website that
    Google Maps returned against the custodian's official website domain
    (the contact website first, falling back to a WEBSITE-type entry in
    digital_platforms). On a mismatch the enrichment is replaced with a
    FALSE_MATCH record (the original enrichment is preserved under
    'original_false_match'), Google-Maps-sourced coordinates are removed
    from the location, and a correction entry is appended to provenance.

    Args:
        file_path: Path to the custodian YAML file to inspect/rewrite.
        dry_run: If True, analyze but do not write the file back.

    Returns:
        dict with 'file', 'action' ('fixed' or 'skipped') and 'reason';
        on a fix it also carries 'gmaps_name', 'gmaps_domain' and
        'official_domain'.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # "or {}" guards against empty files, where safe_load returns None.
        data = yaml.safe_load(f) or {}

    result = {
        'file': file_path.name,
        'action': 'skipped',
        'reason': None
    }

    # Check if already marked as FALSE_MATCH. "or {}" (rather than a .get
    # default) also covers keys that are present but explicitly null in the
    # YAML, where .get(key, {}) would still return None.
    gmaps = data.get('google_maps_enrichment') or {}
    if gmaps.get('status') == 'FALSE_MATCH':
        result['reason'] = 'Already marked as FALSE_MATCH'
        return result

    # Get domains to compare
    gmaps_website = gmaps.get('website', '')

    # Check contact website first
    official_website = (data.get('contact') or {}).get('website', '')

    # If no contact website, check digital_platforms for WEBSITE type
    if not official_website:
        for p in data.get('digital_platforms') or []:
            if p.get('platform_type') == 'WEBSITE':
                official_website = p.get('platform_url', '')
                break

    gmaps_domain = extract_domain(gmaps_website)
    official_domain = extract_domain(official_website)

    # Without both domains we cannot judge the match either way.
    if not gmaps_domain or not official_domain:
        result['reason'] = 'Missing domain info'
        return result

    # Check if domains match
    if gmaps_domain == official_domain:
        result['reason'] = 'Domains match'
        return result

    # Domains don't match - this is a false match
    gmaps_name = gmaps.get('name', 'Unknown')
    custodian_name = (data.get('custodian_name') or {}).get('claim_value', 'Unknown')

    # Create the FALSE_MATCH structure; the original enrichment is kept
    # under 'original_false_match' for auditability.
    data['google_maps_enrichment'] = {
        'status': 'FALSE_MATCH',
        'false_match_reason': (
            f'Google Maps returned "{gmaps_name}" (website: {gmaps_website}) '
            f'instead of "{custodian_name}" (official website: {official_website}). '
            f'Domain mismatch: {gmaps_domain} vs {official_domain}. '
            'Per Rule 40: KIEN is authoritative for Type I intangible heritage custodians.'
        ),
        'original_false_match': gmaps,
        'correction_timestamp': '2025-01-08T00:00:00Z',
        'correction_agent': 'opencode-claude-sonnet-4'
    }

    # Fix location if its coordinates came from the (false) Google Maps match.
    location = data.get('location') or {}
    coord_prov = location.get('coordinate_provenance') or {}
    if coord_prov.get('source_type') == 'GOOGLE_MAPS':
        # Remove coordinates, keep city/GeoNames info; record the removed
        # values so the change is reversible.
        data['location'] = {
            'city': location.get('city'),
            'region_code': location.get('region_code'),
            'country': location.get('country', 'NL'),
            'geonames_id': location.get('geonames_id'),
            'geonames_name': location.get('geonames_name'),
            'feature_code': location.get('feature_code'),
            'note': (
                'Coordinates removed due to Google Maps false match. '
                f'Original coordinates were from "{gmaps_name}".'
            ),
            'coordinate_provenance_removed': {
                'reason': 'FALSE_MATCH',
                'original_latitude': location.get('latitude'),
                'original_longitude': location.get('longitude'),
            },
            'normalization_timestamp': '2025-01-08T00:00:00Z'
        }

    # Add provenance correction. isinstance checks guard against the keys
    # being present but null (or a wrong type) in the YAML, which the
    # previous membership tests did not handle.
    provenance = data.get('provenance')
    if not isinstance(provenance, dict):
        provenance = {}
        data['provenance'] = provenance
    if not isinstance(provenance.get('corrections'), list):
        provenance['corrections'] = []

    provenance['corrections'].append({
        'correction_date': '2025-01-08T00:00:00Z',
        'correction_type': 'google_maps_false_match',
        'description': (
            f'Marked Google Maps enrichment as FALSE_MATCH. '
            f'GMaps returned "{gmaps_name}" ({gmaps_domain}) instead of '
            f'"{custodian_name}" ({official_domain}).'
        ),
        'corrected_by': 'opencode-claude-sonnet-4'
    })

    if not dry_run:
        with open(file_path, 'w', encoding='utf-8') as f:
            yaml.dump(data, f, default_flow_style=False, allow_unicode=True,
                      sort_keys=False, width=100)

    result['action'] = 'fixed'
    result['gmaps_name'] = gmaps_name
    result['gmaps_domain'] = gmaps_domain
    result['official_domain'] = official_domain

    return result
|
|
|
|
|
|
def main():
    """CLI entry point: fix Google Maps false matches in custodian files.

    Processes the files given on the command line, or all Type I custodian
    files (``*-I-*.yaml``) in the custodian directory when none are given,
    and prints a per-file result plus a summary.
    """
    import argparse
    parser = argparse.ArgumentParser(description='Fix Google Maps false matches')
    parser.add_argument('files', nargs='*',
                        help='Specific files to fix (or all Type I if none)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Show what would be fixed without changing files')
    # Previously a hard-coded, user-specific absolute path; now overridable
    # (the old path remains the default for backward compatibility).
    parser.add_argument('--custodian-dir',
                        default='/Users/kempersc/apps/glam/data/custodian',
                        help='Directory containing custodian YAML files')
    args = parser.parse_args()

    custodian_dir = Path(args.custodian_dir)

    if args.files:
        files = [custodian_dir / f for f in args.files]
    else:
        # Find all Type I files
        files = list(custodian_dir.glob("*-I-*.yaml"))

    print(f"{'[DRY RUN] ' if args.dry_run else ''}Processing {len(files)} files...")
    print()

    fixed = []
    skipped = []

    for file_path in sorted(files):
        if not file_path.exists():
            print(f"⚠️  {file_path.name}: File not found")
            continue

        try:
            result = fix_gmaps_false_match(file_path, dry_run=args.dry_run)

            if result['action'] == 'fixed':
                fixed.append(result)
                print(f"✅ {result['file']}")
                print(f"   GMaps: {result['gmaps_name']} ({result['gmaps_domain']})")
                print(f"   Official: {result['official_domain']}")
            else:
                skipped.append(result)
                # Skips are only itemized in dry-run mode to keep real runs terse.
                if args.dry_run:
                    print(f"⏭️  {result['file']}: {result['reason']}")
        except Exception as e:
            # Best-effort batch processing: report and continue with the rest.
            print(f"❌ {file_path.name}: {e}")

    print()
    print(f"{'[DRY RUN] ' if args.dry_run else ''}Summary:")
    print(f"  Fixed: {len(fixed)}")
    print(f"  Skipped: {len(skipped)}")


if __name__ == '__main__':
    main()
|