glam/scripts/finalize_batch12.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

148 lines
5.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Manual validation of Batch 12 library matches.
Reviews 4 fuzzy matches from batch12_library_query_results.json
Validates location accuracy and eliminates false positives.
"""
import json
from pathlib import Path
# Default locations, relative to the current working directory.
_BATCH12_QUERY_RESULTS = Path("scripts/batch12_library_query_results.json")
_BATCH12_VALIDATION_OUTPUT = Path("scripts/batch12_final_validation.json")

# Fields of a review that are persisted to the validation JSON, in output order.
# The remaining fields (wikidata_city, verdict, explanation) are console-only.
_REJECTION_RECORD_KEYS = ("library_name", "city", "q_number", "wikidata_name", "reason")

# Outcome of the manual review: every automated candidate was rejected.
_BATCH12_REVIEWS = (
    {
        "library_name": "Biblioteca Pública Federico Varela",
        "city": "Chañaral",
        "q_number": "Q135435755",
        "wikidata_name": "Biblioteca Pública de La Reina",
        "wikidata_city": "La Reina",
        "verdict": "Location mismatch",
        "explanation": "Chañaral (Atacama Region) ≠ La Reina (Santiago Metro)",
        "reason": "Location mismatch: Chañaral (Atacama) ≠ La Reina (Santiago)",
    },
    {
        "library_name": "Biblioteca Pública de Illapel",
        "city": "Los Vilos",
        "q_number": "Q135435755",
        "wikidata_name": "Biblioteca Pública de La Reina",
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "explanation": "Illapel/Los Vilos (Coquimbo Region) ≠ La Reina (Santiago)",
        "reason": "Location mismatch: Los Vilos (Coquimbo) ≠ La Reina (Santiago)",
    },
    {
        "library_name": "Biblioteca Pública N°56",
        "city": "Talagante",
        "q_number": "Q135435755",
        "wikidata_name": "Biblioteca Pública de La Reina",
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "explanation": "Talagante ≠ La Reina (both Santiago Metro, but different communes)",
        "reason": "Location mismatch: Talagante ≠ La Reina",
    },
    {
        "library_name": "Biblioteca Pública N° 244",
        "city": "Pichilemu",
        "q_number": "Q134891536",
        "wikidata_name": "Biblioteca Pública Municipal de Lebu",
        "wikidata_city": "Lebu",
        "verdict": "Location mismatch",
        "explanation": "Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
        "reason": "Location mismatch: Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
    },
)


def _false_positive_rate(rejected_count, match_count):
    """Format rejected/match as a percentage string, or "N/A" without matches."""
    if not match_count:
        # An empty query result would otherwise raise ZeroDivisionError.
        return "N/A"
    return f"{rejected_count / match_count * 100:.1f}%"


def validate_batch12(input_file=None, output_file=None):
    """Print the manual review of Batch 12 library matches and save the verdicts.

    Loads the automated query results, prints the reasoning for each manually
    reviewed candidate, prints a summary with recommendations, and writes the
    validation result as JSON.

    Args:
        input_file: Path of the query-results JSON; it must contain a
            "matches" list. Defaults to
            scripts/batch12_library_query_results.json.
        output_file: Path the validation JSON is written to. Defaults to
            scripts/batch12_final_validation.json.

    Returns:
        The dict that was written to ``output_file``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input file is not valid JSON.
        KeyError: If the input JSON has no "matches" key.
    """
    if input_file is None:
        input_file = _BATCH12_QUERY_RESULTS
    if output_file is None:
        output_file = _BATCH12_VALIDATION_OUTPUT

    banner = "=" * 80
    print(banner)
    print("BATCH 12 LIBRARY MATCHES - MANUAL VALIDATION")
    print(banner)
    print()

    # Load query results
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    matches = data['matches']
    print(f"Reviewing {len(matches)} matches from automated query:")
    if len(matches) != len(_BATCH12_REVIEWS):
        # The verdicts below are hard-coded; flag a query file they do not fit.
        print(
            f"⚠️ WARNING: query file has {len(matches)} matches but "
            f"{len(_BATCH12_REVIEWS)} were manually reviewed"
        )
    print()

    # Manual validation: no candidate survived review, so `validated` stays empty.
    validated = []
    rejected = []
    for number, review in enumerate(_BATCH12_REVIEWS, start=1):
        print(f"{number}. {review['library_name']} ({review['city']})")
        print(
            f" → {review['q_number']}: {review['wikidata_name']} "
            f"({review['wikidata_city']})"
        )
        print(f" ❌ REJECT: {review['verdict']}")
        print(f" Reason: {review['explanation']}")
        print(" Generic 'Biblioteca Pública' name caused false match")
        print()
        rejected.append({key: review[key] for key in _REJECTION_RECORD_KEYS})

    # Summary
    print(banner)
    print("VALIDATION SUMMARY")
    print(banner)
    print(f"Validated: {len(validated)}")
    print(f"Rejected: {len(rejected)}")
    print()

    # Analysis
    print("KEY FINDING:")
    print(" All 4 matches are FALSE POSITIVES caused by generic 'Biblioteca Pública' names")
    print(" Q135435755 matched 3 different libraries due to fuzzy name similarity")
    print(" Location information was ignored or insufficient for validation")
    print()
    print("ISSUE:")
    print(" Chilean public libraries often have generic names:")
    print(" - Biblioteca Pública N°56")
    print(" - Biblioteca Pública N° 244")
    print(" - Biblioteca Pública [City Name]")
    print(" These create high false positive rates in fuzzy matching")
    print()
    print("RECOMMENDATION:")
    print(" ❌ DO NOT apply any Batch 12 library matches")
    print(" 🔍 Alternative strategy needed:")
    print(" 1. Manual Wikidata search for specific named libraries")
    print(" 2. Focus on well-documented museums instead")
    print(" 3. Create Wikidata entries for missing libraries")
    print()

    # Save validation results
    output = {
        "batch": 12,
        # Date the manual review was performed, not the date of this run.
        "validation_date": "2025-11-09",
        "institution_type": "LIBRARY",
        "validated_matches": validated,
        "rejected_matches": rejected,
        "summary": {
            "validated": len(validated),
            "rejected": len(rejected),
            "false_positive_rate": _false_positive_rate(len(rejected), len(matches)),
            "recommendation": "REJECT ALL - Generic library names cause false positives",
        },
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"✅ Validation results saved to: {output_file}")
    return output
# Run the validation only when executed as a script, so importing this
# module does not touch the JSON files.
if __name__ == "__main__":
    validate_batch12()