192 lines
7.1 KiB
Python
Executable file
192 lines
7.1 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Analyze geographic data quality for Mexican heritage institutions.
|
|
Identifies missing cities, suspicious coordinates, and non-Mexican institutions.
|
|
"""
|
|
|
|
import yaml
|
|
from collections import defaultdict
|
|
from typing import Dict, List, Tuple
|
|
|
|
def analyze_mexican_geography(filepath: str) -> Dict:
    """Analyze geographic data for Mexican institutions.

    Loads a YAML document containing a list of institution records and
    inspects every entry of each record's ``locations`` list.

    Args:
        filepath: Path to the YAML file. It is read as UTF-8 and parsed with
            ``yaml.safe_load``.

    Returns:
        A dict with the following keys:

        * ``total_institutions`` - number of records in the file.
        * ``mexican_institutions`` - names of institutions with at least one
          location whose ``country`` is ``'MX'`` (one entry per institution).
        * ``missing_city`` - one dict per MX location without a ``city``
          value (name, id, region, latitude, longitude, extraction_date,
          conversation_id).
        * ``suspicious_coordinates`` - one dict per MX location whose
          coordinates lie within 0.01 degrees of the hard-coded Zacatecas
          state centroid.
        * ``non_mexican_in_dataset`` - one dict per non-MX location whose
          institution name contains one of the known US institution names.
        * ``region_distribution`` / ``city_distribution`` -
          ``defaultdict(int)`` counters over MX locations.

    Raises:
        OSError: If ``filepath`` cannot be opened.
        KeyError: If a record that has locations lacks a ``name`` key.
    """
    # Known problematic coordinates (Zacatecas state centroid). Locations
    # that sit exactly on a state centroid were most likely geocoded at
    # state level rather than at their real address.
    zacatecas_lat = 23.0916177
    zacatecas_lon = -102.9333954
    centroid_tolerance = 0.01

    # Known US-based institutions that might appear in Mexican dataset
    us_institutions = [
        'Library of Congress',
        'Getty Research Institute',
        'FAMSI',  # Foundation for Advancement of Mesoamerican Studies (USA-based)
    ]

    with open(filepath, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty document; treat that as
        # "no institutions" instead of failing on len(None).
        data = yaml.safe_load(f) or []

    results = {
        'total_institutions': len(data),
        'mexican_institutions': [],
        'missing_city': [],
        'suspicious_coordinates': [],
        'non_mexican_in_dataset': [],
        'region_distribution': defaultdict(int),
        'city_distribution': defaultdict(int),
    }

    for inst in data:
        locations = inst.get('locations')
        if not locations:
            continue

        name = inst['name']
        # `or {}` also covers an explicit `provenance: null` in the YAML,
        # which a plain .get('provenance', {}) would pass through as None.
        provenance = inst.get('provenance') or {}
        listed_as_mexican = False

        for loc in locations:
            country = loc.get('country')

            if country != 'MX':
                # Check if institution name suggests it's US-based but
                # related to Mexico. any() records the location at most once
                # even when several markers match the same name.
                lowered = name.lower()
                if any(marker.lower() in lowered for marker in us_institutions):
                    results['non_mexican_in_dataset'].append({
                        'name': name,
                        'country': country,
                        'city': loc.get('city') or 'N/A',
                        'region': loc.get('region') or 'N/A',
                        'note': 'US-based institution with Mexican content',
                    })
                continue

            # Record each institution once, even if it has several MX
            # locations, so the list length is an institution count.
            if not listed_as_mexican:
                results['mexican_institutions'].append(name)
                listed_as_mexican = True

            # Values may be present but null/empty in the YAML, so fall back
            # with `or` rather than relying on dict.get's default (which only
            # applies when the key is absent).
            city = loc.get('city')
            region = loc.get('region')
            lat = loc.get('latitude')
            lon = loc.get('longitude')

            # Check for missing city
            if not city:
                results['missing_city'].append({
                    'name': name,
                    'id': inst.get('id'),
                    'region': region or 'N/A',
                    'latitude': lat,
                    'longitude': lon,
                    'extraction_date': provenance.get('extraction_date'),
                    'conversation_id': provenance.get('conversation_id'),
                })

            # Track region and city distribution
            results['region_distribution'][region or 'Unknown'] += 1
            results['city_distribution'][city or 'No City'] += 1

            # Check for suspicious coordinates (state-level centroids).
            # Only numeric values are compared; a truthiness test would
            # confuse 0 with "missing" and let strings reach the subtraction.
            has_numeric_coords = (
                isinstance(lat, (int, float)) and isinstance(lon, (int, float))
            )
            if (
                has_numeric_coords
                and abs(lat - zacatecas_lat) < centroid_tolerance
                and abs(lon - zacatecas_lon) < centroid_tolerance
            ):
                results['suspicious_coordinates'].append({
                    'name': name,
                    'region': loc.get('region'),
                    'lat': lat,
                    'lon': lon,
                    'note': 'Matches Zacatecas state centroid',
                })

    return results
|
|
|
|
def print_report(results: Dict):
    """Print analysis report.

    Writes a human-readable summary of ``results`` to standard output.

    Args:
        results: Mapping with the keys ``total_institutions`` (int),
            ``mexican_institutions`` (list), ``missing_city`` (list of dicts
            with ``name``, ``region``, ``latitude``, ``longitude``),
            ``suspicious_coordinates`` (list of dicts with ``name``,
            ``region``, ``lat``, ``lon``, ``note``),
            ``non_mexican_in_dataset`` (list of dicts with ``name``,
            ``city``, ``region``, ``country``, ``note``),
            ``city_distribution`` and ``region_distribution`` (mappings from
            label to integer count).

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print(heavy_rule)
    print("MEXICAN HERITAGE INSTITUTIONS - GEOGRAPHIC DATA ANALYSIS")
    print(heavy_rule)
    print()

    print(f"Total institutions in dataset: {results['total_institutions']:,}")
    print(f"Mexican institutions (country: MX): {len(results['mexican_institutions'])}")
    print()

    print(light_rule)
    print("ISSUE 1: Missing City Field")
    print(light_rule)
    print(f"Institutions without city field: {len(results['missing_city'])}")
    print()

    if results['missing_city']:
        # Group by region
        by_region = defaultdict(list)
        for item in results['missing_city']:
            by_region[item['region']].append(item)

        # Sort on the string form of the region: a region may be None, and
        # None cannot be ordered against str.
        for region, institutions in sorted(
            by_region.items(), key=lambda pair: str(pair[0])
        ):
            print(f"\n{region} ({len(institutions)} institutions):")
            for inst in institutions[:5]:  # Show first 5
                print(f" - {inst['name']}")
                # Compare against None so a legitimate 0 coordinate is shown.
                if inst['latitude'] is not None and inst['longitude'] is not None:
                    print(f" Coords: {inst['latitude']}, {inst['longitude']}")
            if len(institutions) > 5:
                print(f" ... and {len(institutions) - 5} more")

    print()
    print(light_rule)
    print("ISSUE 2: Suspicious Coordinates")
    print(light_rule)
    print(f"Institutions with suspicious coordinates: {len(results['suspicious_coordinates'])}")
    print()

    if results['suspicious_coordinates']:
        for item in results['suspicious_coordinates']:
            print(f" - {item['name']}")
            print(f" Region: {item['region']}, Coords: {item['lat']}, {item['lon']}")
            print(f" Note: {item['note']}")

    print()
    print(light_rule)
    print("ISSUE 3: Non-Mexican Institutions in Dataset")
    print(light_rule)
    print(f"US-based institutions with Mexican content: {len(results['non_mexican_in_dataset'])}")
    print()

    if results['non_mexican_in_dataset']:
        for item in results['non_mexican_in_dataset']:
            print(f" - {item['name']}")
            print(f" Location: {item['city']}, {item['region']}, {item['country']}")
            print(f" Note: {item['note']}")

    print()
    print(light_rule)
    print("CITY DISTRIBUTION (Top 15)")
    print(light_rule)

    city_dist = sorted(results['city_distribution'].items(), key=lambda x: x[1], reverse=True)
    for city, count in city_dist[:15]:
        # str(): a None key does not accept the '30s' format spec.
        print(f" {str(city):30s} : {count:3d}")

    print()
    print(light_rule)
    print("REGION DISTRIBUTION")
    print(light_rule)

    region_dist = sorted(results['region_distribution'].items(), key=lambda x: x[1], reverse=True)
    for region, count in region_dist:
        print(f" {str(region):30s} : {count:3d}")

    print()
    print(heavy_rule)
|
|
|
|
def main():
    """Run the geographic analysis, print the report, and save the results.

    Reads the hard-coded Mexican dataset path, prints the report to standard
    output, and writes the detailed results to
    ``data/mexican_geography_analysis.yaml``.

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written.
    """
    filepath = 'data/instances/all/globalglam-20251113-mexico-deduplicated.yaml'

    print("Analyzing Mexican geographic data...")
    results = analyze_mexican_geography(filepath)

    print_report(results)

    # Save detailed results to file
    output_file = 'data/mexican_geography_analysis.yaml'

    # yaml.dump serializes a defaultdict with a Python-specific
    # `!!python/object/apply:collections.defaultdict` tag, which makes the
    # output unreadable for yaml.safe_load and other YAML consumers.
    # Convert the counters to plain dicts first.
    serializable = {
        key: dict(value) if isinstance(value, defaultdict) else value
        for key, value in results.items()
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(serializable, f, allow_unicode=True, sort_keys=False)

    print(f"\nDetailed results saved to: {output_file}")
|
|
|
|
# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|