glam/scripts/analyze_mexican_geography.py
2025-11-19 23:25:22 +01:00

192 lines
7.1 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Analyze geographic data quality for Mexican heritage institutions.
Identifies missing cities, suspicious coordinates, and non-Mexican institutions.
"""
import yaml
from collections import defaultdict
from typing import Dict, List, Tuple
def analyze_mexican_geography(filepath: str,
                              state_centroids: List[Tuple[float, float, str]] = None
                              ) -> Dict:
    """Analyze geographic data quality for Mexican heritage institutions.

    Args:
        filepath: Path to a YAML file holding a list of institution records.
            Each record may carry 'name', 'id', 'provenance' and a list of
            'locations' (dicts with 'country', 'city', 'region',
            'latitude', 'longitude').
        state_centroids: Optional list of (lat, lon, note) triples treated as
            suspicious coordinates (state-level centroids rather than real
            addresses). Defaults to the Zacatecas state centroid.

    Returns:
        Dict with the total record count, lists of Mexican institutions,
        records missing a city, suspicious-coordinate and non-Mexican
        records, plus region/city frequency distributions.
    """
    if state_centroids is None:
        # Known problematic value: geocoders fall back to the Zacatecas
        # state centroid when no street-level address is available.
        state_centroids = [(23.0916177, -102.9333954,
                            'Matches Zacatecas state centroid')]

    with open(filepath, 'r', encoding='utf-8') as f:
        # An empty YAML document parses to None; normalize to an empty list.
        data = yaml.safe_load(f) or []

    results = {
        'total_institutions': len(data),
        'mexican_institutions': [],
        'missing_city': [],
        'suspicious_coordinates': [],
        'non_mexican_in_dataset': [],
        'region_distribution': defaultdict(int),
        'city_distribution': defaultdict(int)
    }

    # Known US-based institutions that might appear in the Mexican dataset.
    us_institutions = [
        'Library of Congress',
        'Getty Research Institute',
        'FAMSI',  # Foundation for Advancement of Mesoamerican Studies (USA-based)
    ]

    for inst in data:
        if not inst.get('locations'):
            continue
        name = inst['name']
        # `or {}` also covers an explicit `provenance: null` in the YAML,
        # which .get('provenance', {}) alone would pass through as None.
        provenance = inst.get('provenance') or {}
        for loc in inst['locations']:
            country = loc.get('country')
            if country == 'MX':
                results['mexican_institutions'].append(name)

                # Issue 1: record entries that lack a city field.
                if not loc.get('city'):
                    results['missing_city'].append({
                        'name': name,
                        'id': inst.get('id'),
                        'region': loc.get('region', 'N/A'),
                        'latitude': loc.get('latitude'),
                        'longitude': loc.get('longitude'),
                        'extraction_date': provenance.get('extraction_date'),
                        'conversation_id': provenance.get('conversation_id')
                    })

                # Region / city frequency distributions.
                results['region_distribution'][loc.get('region', 'Unknown')] += 1
                results['city_distribution'][loc.get('city', 'No City')] += 1

                # Issue 2: coordinates matching a state centroid.
                lat = loc.get('latitude')
                lon = loc.get('longitude')
                # `is not None`: 0.0 is a legitimate coordinate value and
                # must not be skipped by a truthiness test.
                if lat is not None and lon is not None:
                    for c_lat, c_lon, note in state_centroids:
                        if abs(lat - c_lat) < 0.01 and abs(lon - c_lon) < 0.01:
                            results['suspicious_coordinates'].append({
                                'name': name,
                                'region': loc.get('region'),
                                'lat': lat,
                                'lon': lon,
                                'note': note
                            })
                            break  # one flag per location is enough
            else:
                # Issue 3: non-Mexican location whose institution name
                # matches a known US-based organization.
                for us_inst in us_institutions:
                    if us_inst.lower() in name.lower():
                        results['non_mexican_in_dataset'].append({
                            'name': name,
                            'country': country,
                            'city': loc.get('city', 'N/A'),
                            'region': loc.get('region', 'N/A'),
                            'note': 'US-based institution with Mexican content'
                        })
                        break  # avoid duplicate records if several names match
    return results
def print_report(results: Dict):
"""Print analysis report."""
print("=" * 80)
print("MEXICAN HERITAGE INSTITUTIONS - GEOGRAPHIC DATA ANALYSIS")
print("=" * 80)
print()
print(f"Total institutions in dataset: {results['total_institutions']:,}")
print(f"Mexican institutions (country: MX): {len(results['mexican_institutions'])}")
print()
print("-" * 80)
print("ISSUE 1: Missing City Field")
print("-" * 80)
print(f"Institutions without city field: {len(results['missing_city'])}")
print()
if results['missing_city']:
# Group by region
by_region = defaultdict(list)
for item in results['missing_city']:
by_region[item['region']].append(item)
for region, institutions in sorted(by_region.items()):
print(f"\n{region} ({len(institutions)} institutions):")
for inst in institutions[:5]: # Show first 5
print(f" - {inst['name']}")
if inst['latitude'] and inst['longitude']:
print(f" Coords: {inst['latitude']}, {inst['longitude']}")
if len(institutions) > 5:
print(f" ... and {len(institutions) - 5} more")
print()
print("-" * 80)
print("ISSUE 2: Suspicious Coordinates")
print("-" * 80)
print(f"Institutions with suspicious coordinates: {len(results['suspicious_coordinates'])}")
print()
if results['suspicious_coordinates']:
for item in results['suspicious_coordinates']:
print(f" - {item['name']}")
print(f" Region: {item['region']}, Coords: {item['lat']}, {item['lon']}")
print(f" Note: {item['note']}")
print()
print("-" * 80)
print("ISSUE 3: Non-Mexican Institutions in Dataset")
print("-" * 80)
print(f"US-based institutions with Mexican content: {len(results['non_mexican_in_dataset'])}")
print()
if results['non_mexican_in_dataset']:
for item in results['non_mexican_in_dataset']:
print(f" - {item['name']}")
print(f" Location: {item['city']}, {item['region']}, {item['country']}")
print(f" Note: {item['note']}")
print()
print("-" * 80)
print("CITY DISTRIBUTION (Top 15)")
print("-" * 80)
city_dist = sorted(results['city_distribution'].items(), key=lambda x: x[1], reverse=True)
for city, count in city_dist[:15]:
print(f" {city:30s} : {count:3d}")
print()
print("-" * 80)
print("REGION DISTRIBUTION")
print("-" * 80)
region_dist = sorted(results['region_distribution'].items(), key=lambda x: x[1], reverse=True)
for region, count in region_dist:
print(f" {region:30s} : {count:3d}")
print()
print("=" * 80)
def main() -> None:
    """Analyze the Mexican dataset and save a YAML report.

    Usage: analyze_mexican_geography.py [input_yaml [output_yaml]]
    Both arguments are optional; the defaults preserve the original
    hard-coded paths, so existing invocations are unchanged.
    """
    import sys  # local import keeps the module's top-level imports untouched

    filepath = (sys.argv[1] if len(sys.argv) > 1
                else 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml')
    output_file = (sys.argv[2] if len(sys.argv) > 2
                   else 'data/mexican_geography_analysis.yaml')

    print("Analyzing Mexican geographic data...")
    results = analyze_mexican_geography(filepath)
    print_report(results)

    # yaml.dump serializes defaultdict with python-specific
    # `!!python/object/apply:collections.defaultdict` tags; convert the
    # distributions to plain dicts so the report stays portable YAML.
    serializable = {
        key: dict(value) if isinstance(value, defaultdict) else value
        for key, value in results.items()
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(serializable, f, allow_unicode=True, sort_keys=False)
    print(f"\nDetailed results saved to: {output_file}")


if __name__ == '__main__':
    main()