- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
148 lines
5.6 KiB
Python
Executable file
148 lines
5.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
Manual validation of Batch 12 library matches.

Reviews the 4 fuzzy matches from batch12_library_query_results.json,
validates location accuracy, and eliminates false positives.
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Manually reviewed Batch 12 fuzzy matches, in review order.  Each entry holds
# the record that is saved to the validation file ("record") plus the extra
# display-only text used when printing the reasoning for that match.
_BATCH12_REVIEWS = (
    {
        "wikidata_city": "La Reina",
        "verdict": "Location mismatch",
        "explanation": "Chañaral (Atacama Region) ≠ La Reina (Santiago Metro)",
        "record": {
            "library_name": "Biblioteca Pública Federico Varela",
            "city": "Chañaral",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Chañaral (Atacama) ≠ La Reina (Santiago)",
        },
    },
    {
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "explanation": "Illapel/Los Vilos (Coquimbo Region) ≠ La Reina (Santiago)",
        "record": {
            "library_name": "Biblioteca Pública de Illapel",
            "city": "Los Vilos",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Los Vilos (Coquimbo) ≠ La Reina (Santiago)",
        },
    },
    {
        "wikidata_city": "La Reina",
        "verdict": "Same false positive",
        "explanation": "Talagante ≠ La Reina (both Santiago Metro, but different communes)",
        "record": {
            "library_name": "Biblioteca Pública N°56",
            "city": "Talagante",
            "q_number": "Q135435755",
            "wikidata_name": "Biblioteca Pública de La Reina",
            "reason": "Location mismatch: Talagante ≠ La Reina",
        },
    },
    {
        "wikidata_city": "Lebu",
        "verdict": "Location mismatch",
        "explanation": "Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
        "record": {
            "library_name": "Biblioteca Pública N° 244",
            "city": "Pichilemu",
            "q_number": "Q134891536",
            "wikidata_name": "Biblioteca Pública Municipal de Lebu",
            "reason": "Location mismatch: Pichilemu (O'Higgins) ≠ Lebu (Bío Bío)",
        },
    },
)


def validate_batch12(
    input_file=Path("scripts/batch12_library_query_results.json"),
    output_file=Path("scripts/batch12_final_validation.json"),
):
    """Print and save the manual validation of the Batch 12 library matches.

    The automated query results are loaded only to report how many matches
    were produced; the accept/reject decisions themselves are the hard-coded
    manual review in ``_BATCH12_REVIEWS`` (all four are rejections).

    Args:
        input_file: Path to the JSON query results; must contain a
            ``"matches"`` list. Defaults to the original script location.
        output_file: Path the validation JSON is written to. Defaults to the
            original script location.

    Returns:
        The dict that was written to ``output_file``.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        KeyError: If the loaded JSON has no ``"matches"`` key.
    """
    banner = "=" * 80

    print(banner)
    print("BATCH 12 LIBRARY MATCHES - MANUAL VALIDATION")
    print(banner)
    print()

    # Load query results
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    matches = data['matches']

    print(f"Reviewing {len(matches)} matches from automated query:")
    print()

    # Manual validation: no match survived review, so `validated` stays empty.
    validated = []
    rejected = []

    for number, review in enumerate(_BATCH12_REVIEWS, start=1):
        record = review["record"]
        print(f"{number}. {record['library_name']} ({record['city']})")
        print(f" → {record['q_number']}: {record['wikidata_name']} ({review['wikidata_city']})")
        print(f" ❌ REJECT: {review['verdict']}")
        print(f" Reason: {review['explanation']}")
        print(" Generic 'Biblioteca Pública' name caused false match")
        print()
        # Copy so the saved output never aliases the module-level table.
        rejected.append(dict(record))

    # Summary
    print(banner)
    print("VALIDATION SUMMARY")
    print(banner)
    print(f"Validated: {len(validated)}")
    print(f"Rejected: {len(rejected)}")
    print()

    # Analysis
    print("KEY FINDING:")
    print(" All 4 matches are FALSE POSITIVES caused by generic 'Biblioteca Pública' names")
    print(" Q135435755 matched 3 different libraries due to fuzzy name similarity")
    print(" Location information was ignored or insufficient for validation")
    print()

    print("ISSUE:")
    print(" Chilean public libraries often have generic names:")
    print(" - Biblioteca Pública N°56")
    print(" - Biblioteca Pública N° 244")
    print(" - Biblioteca Pública [City Name]")
    print(" These create high false positive rates in fuzzy matching")
    print()

    print("RECOMMENDATION:")
    print(" ❌ DO NOT apply any Batch 12 library matches")
    print(" 🔍 Alternative strategy needed:")
    print(" 1. Manual Wikidata search for specific named libraries")
    print(" 2. Focus on well-documented museums instead")
    print(" 3. Create Wikidata entries for missing libraries")
    print()

    # The rate is relative to the number of automated matches in the input
    # file; an empty match list would otherwise raise ZeroDivisionError.
    if matches:
        false_positive_rate = f"{len(rejected) / len(matches) * 100:.1f}%"
    else:
        false_positive_rate = "N/A"

    # Save validation results
    output = {
        "batch": 12,
        "validation_date": "2025-11-09",
        "institution_type": "LIBRARY",
        "validated_matches": validated,
        "rejected_matches": rejected,
        "summary": {
            "validated": len(validated),
            "rejected": len(rejected),
            "false_positive_rate": false_positive_rate,
            "recommendation": "REJECT ALL - Generic library names cause false positives",
        },
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"✅ Validation results saved to: {output_file}")

    return output
|
|
|
|
# Script entry point: run the validation only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    validate_batch12()
|