#!/usr/bin/env python3
"""
Manual validation of Batch 12 library matches.

Reviews 4 fuzzy matches from batch12_library_query_results.json
Validates location accuracy and eliminates false positives.
"""

import json
from pathlib import Path

# Default locations, relative to the working directory the script runs from.
_DEFAULT_INPUT_FILE = Path("scripts/batch12_library_query_results.json")
_DEFAULT_OUTPUT_FILE = Path("scripts/batch12_final_validation.json")

_RULE = "=" * 80

# Hand-reviewed verdicts, in review order. Each entry pairs the console
# explanation (Wikidata city, verdict headline, detailed reason) with the
# record that is written to the output JSON under "rejected_matches".
_REJECTIONS = (
    {
        "wikidata_city": "La Reina",
        "verdict": "Location mismatch",
        "detail": "Chañaral (Atacama Region) ≠ La Reina (Santiago Metro)",
        "record": {
            "library_name": "Biblioteca Pública Federico Varela",
            "city": "Chañaral",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Chañaral (Atacama) ≠ La Reina (Santiago)",
        },
    },
    {
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "detail": "Illapel/Los Vilos (Coquimbo Region) ≠ La Reina (Santiago)",
        "record": {
            "library_name": "Biblioteca Pública de Illapel",
            "city": "Los Vilos",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Los Vilos (Coquimbo) ≠ La Reina (Santiago)",
        },
    },
    {
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "detail": "Talagante ≠ La Reina (both Santiago Metro, but different communes)",
        "record": {
            "library_name": "Biblioteca Pública N°56",
            "city": "Talagante",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Talagante ≠ La Reina",
        },
    },
    {
        "wikidata_city": "Lebu",
        "verdict": "Location mismatch",
        "detail": "Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
        "record": {
            "library_name": "Biblioteca Pública N° 244",
            "city": "Pichilemu",
            "q_number": "Q134891536",
            "wikidata_name": "Biblioteca Pública Municipal de Lebu",
            "reason": "Location mismatch: Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
        },
    },
)


def _print_rejection(index, entry):
    """Print the console explanation for one rejected match."""
    record = entry["record"]
    print(f"{index}. {record['library_name']} ({record['city']})")
    print(
        f" → {record['q_number']}: {record['wikidata_name']} "
        f"({entry['wikidata_city']})"
    )
    print(f" ❌ REJECT: {entry['verdict']}")
    print(f" Reason: {entry['detail']}")
    print(" Generic 'Biblioteca Pública' name caused false match")
    print()


def _false_positive_rate(rejected_count, match_count):
    """Return the rejected share as a percentage string, or "N/A" if no matches."""
    if match_count == 0:
        # The original expression divided unconditionally and crashed with
        # ZeroDivisionError on an empty "matches" list.
        return "N/A"
    return f"{rejected_count / match_count * 100:.1f}%"


def validate_batch12(input_file=_DEFAULT_INPUT_FILE, output_file=_DEFAULT_OUTPUT_FILE):
    """Manual validation with reasoning.

    Loads the automated query results, prints the hand-written verdict for
    each of the four reviewed matches, prints a summary with findings and a
    recommendation, and writes the validation results as JSON.

    Args:
        input_file: Path of the query-results JSON. It must contain a
            top-level "matches" list; only its length is used.
        output_file: Path the validation JSON is written to.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        KeyError: If the input JSON has no "matches" key.
    """
    input_file = Path(input_file)
    output_file = Path(output_file)

    print(_RULE)
    print("BATCH 12 LIBRARY MATCHES - MANUAL VALIDATION")
    print(_RULE)
    print()

    # Load query results
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    matches = data['matches']
    print(f"Reviewing {len(matches)} matches from automated query:")
    print()

    # Manual validation: every reviewed match was rejected, so "validated"
    # stays empty; it is kept so the output schema carries both lists.
    validated = []
    rejected = []
    for index, entry in enumerate(_REJECTIONS, start=1):
        _print_rejection(index, entry)
        rejected.append(dict(entry["record"]))

    # Summary
    print(_RULE)
    print("VALIDATION SUMMARY")
    print(_RULE)
    print(f"Validated: {len(validated)}")
    print(f"Rejected: {len(rejected)}")
    print()

    # Analysis
    print("KEY FINDING:")
    print(" All 4 matches are FALSE POSITIVES caused by generic 'Biblioteca Pública' names")
    print(" Q135435755 matched 3 different libraries due to fuzzy name similarity")
    print(" Location information was ignored or insufficient for validation")
    print()
    print("ISSUE:")
    print(" Chilean public libraries often have generic names:")
    print(" - Biblioteca Pública N°56")
    print(" - Biblioteca Pública N° 244")
    print(" - Biblioteca Pública [City Name]")
    print(" These create high false positive rates in fuzzy matching")
    print()
    print("RECOMMENDATION:")
    print(" ❌ DO NOT apply any Batch 12 library matches")
    print(" 🔍 Alternative strategy needed:")
    print(" 1. Manual Wikidata search for specific named libraries")
    print(" 2. Focus on well-documented museums instead")
    print(" 3. Create Wikidata entries for missing libraries")
    print()

    # Save validation results
    output = {
        "batch": 12,
        "validation_date": "2025-11-09",
        "institution_type": "LIBRARY",
        "validated_matches": validated,
        "rejected_matches": rejected,
        "summary": {
            "validated": len(validated),
            "rejected": len(rejected),
            "false_positive_rate": _false_positive_rate(len(rejected), len(matches)),
            "recommendation": "REJECT ALL - Generic library names cause false positives",
        },
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"✅ Validation results saved to: {output_file}")


if __name__ == "__main__":
    validate_batch12()