#!/usr/bin/env python3
"""
Analyze geographic data quality for Mexican heritage institutions.

Identifies missing cities, suspicious coordinates, and non-Mexican
institutions that appear in the dataset.
"""
import yaml
from collections import defaultdict
from typing import Dict, List, Tuple

# Known US-based institutions that might appear in the Mexican dataset.
US_INSTITUTIONS = [
    'Library of Congress',
    'Getty Research Institute',
    'FAMSI',  # Foundation for Advancement of Mesoamerican Studies (USA-based)
]

# Zacatecas state centroid — a known problematic fallback coordinate that
# indicates the record was geocoded to the state rather than the institution.
ZACATECAS_CENTROID_LAT = 23.0916177
ZACATECAS_CENTROID_LON = -102.9333954
CENTROID_TOLERANCE = 0.01


def analyze_mexican_geography(filepath: str) -> Dict:
    """Analyze geographic data for Mexican institutions.

    Args:
        filepath: Path to a YAML file containing a list of institution
            records, each with optional ``locations`` entries.

    Returns:
        A dict of findings: counts, lists of problem records, and
        region/city distributions (plain dicts, safe to serialize).
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    region_distribution = defaultdict(int)
    city_distribution = defaultdict(int)
    results = {
        'total_institutions': len(data),
        'mexican_institutions': [],
        'missing_city': [],
        'suspicious_coordinates': [],
        'non_mexican_in_dataset': [],
        'region_distribution': region_distribution,
        'city_distribution': city_distribution,
    }

    for inst in data:
        if not inst.get('locations'):
            continue

        for loc in inst['locations']:
            country = loc.get('country')

            if country == 'MX':
                results['mexican_institutions'].append(inst['name'])

                # Record institutions lacking a city, with enough provenance
                # context to trace the original extraction.
                if not loc.get('city'):
                    provenance = inst.get('provenance', {})
                    results['missing_city'].append({
                        'name': inst['name'],
                        'id': inst.get('id'),
                        'region': loc.get('region', 'N/A'),
                        'latitude': loc.get('latitude'),
                        'longitude': loc.get('longitude'),
                        'extraction_date': provenance.get('extraction_date'),
                        'conversation_id': provenance.get('conversation_id'),
                    })

                region = loc.get('region', 'Unknown')
                region_distribution[region] += 1

                city = loc.get('city', 'No City')
                city_distribution[city] += 1

                # Flag coordinates that match a state-level centroid
                # (simplified check against the known Zacatecas centroid).
                # Use explicit None checks: 0.0 is a valid coordinate value.
                lat = loc.get('latitude')
                lon = loc.get('longitude')
                if lat is not None and lon is not None:
                    if (abs(lat - ZACATECAS_CENTROID_LAT) < CENTROID_TOLERANCE
                            and abs(lon - ZACATECAS_CENTROID_LON) < CENTROID_TOLERANCE):
                        results['suspicious_coordinates'].append({
                            'name': inst['name'],
                            'region': loc.get('region'),
                            'lat': lat,
                            'lon': lon,
                            'note': 'Matches Zacatecas state centroid',
                        })

            elif country != 'MX':
                # Check if the institution name suggests it's US-based but
                # related to Mexico; break after the first match so a name
                # matching multiple patterns is only recorded once.
                for us_inst in US_INSTITUTIONS:
                    if us_inst.lower() in inst['name'].lower():
                        results['non_mexican_in_dataset'].append({
                            'name': inst['name'],
                            'country': country,
                            'city': loc.get('city', 'N/A'),
                            'region': loc.get('region', 'N/A'),
                            'note': 'US-based institution with Mexican content',
                        })
                        break

    # Convert defaultdicts to plain dicts: yaml.dump would otherwise emit
    # non-portable !!python/object/apply tags that safe_load cannot read.
    results['region_distribution'] = dict(region_distribution)
    results['city_distribution'] = dict(city_distribution)
    return results


def print_report(results: Dict):
    """Print a human-readable analysis report to stdout."""
    print("=" * 80)
    print("MEXICAN HERITAGE INSTITUTIONS - GEOGRAPHIC DATA ANALYSIS")
    print("=" * 80)
    print()
    print(f"Total institutions in dataset: {results['total_institutions']:,}")
    print(f"Mexican institutions (country: MX): {len(results['mexican_institutions'])}")
    print()

    print("-" * 80)
    print("ISSUE 1: Missing City Field")
    print("-" * 80)
    print(f"Institutions without city field: {len(results['missing_city'])}")
    print()
    if results['missing_city']:
        # Group by region for a compact summary.
        by_region = defaultdict(list)
        for item in results['missing_city']:
            by_region[item['region']].append(item)

        for region, institutions in sorted(by_region.items()):
            print(f"\n{region} ({len(institutions)} institutions):")
            for inst in institutions[:5]:  # Show first 5
                print(f"  - {inst['name']}")
                # Explicit None checks: 0.0 is a valid coordinate.
                if inst['latitude'] is not None and inst['longitude'] is not None:
                    print(f"    Coords: {inst['latitude']}, {inst['longitude']}")
            if len(institutions) > 5:
                print(f"  ... and {len(institutions) - 5} more")

    print()
    print("-" * 80)
    print("ISSUE 2: Suspicious Coordinates")
    print("-" * 80)
    print(f"Institutions with suspicious coordinates: {len(results['suspicious_coordinates'])}")
    print()
    if results['suspicious_coordinates']:
        for item in results['suspicious_coordinates']:
            print(f"  - {item['name']}")
            print(f"    Region: {item['region']}, Coords: {item['lat']}, {item['lon']}")
            print(f"    Note: {item['note']}")

    print()
    print("-" * 80)
    print("ISSUE 3: Non-Mexican Institutions in Dataset")
    print("-" * 80)
    print(f"US-based institutions with Mexican content: {len(results['non_mexican_in_dataset'])}")
    print()
    if results['non_mexican_in_dataset']:
        for item in results['non_mexican_in_dataset']:
            print(f"  - {item['name']}")
            print(f"    Location: {item['city']}, {item['region']}, {item['country']}")
            print(f"    Note: {item['note']}")

    print()
    print("-" * 80)
    print("CITY DISTRIBUTION (Top 15)")
    print("-" * 80)
    city_dist = sorted(results['city_distribution'].items(),
                       key=lambda x: x[1], reverse=True)
    for city, count in city_dist[:15]:
        print(f"  {city:30s} : {count:3d}")

    print()
    print("-" * 80)
    print("REGION DISTRIBUTION")
    print("-" * 80)
    region_dist = sorted(results['region_distribution'].items(),
                         key=lambda x: x[1], reverse=True)
    for region, count in region_dist:
        print(f"  {region:30s} : {count:3d}")

    print()
    print("=" * 80)


def main():
    filepath = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'
    print("Analyzing Mexican geographic data...")
    results = analyze_mexican_geography(filepath)
    print_report(results)

    # Save detailed results to file (results now holds only plain dicts,
    # so the YAML output is portable and round-trips through safe_load).
    output_file = 'data/mexican_geography_analysis.yaml'
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(results, f, allow_unicode=True, sort_keys=False)
    print(f"\nDetailed results saved to: {output_file}")


if __name__ == '__main__':
    main()