glam/scripts/finalize_batch11.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

218 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Finalize Batch 11 Matches - Manual Validation
=============================================
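After manual review, this script records the Batch 11 Wikidata match results
(validated, rejected, needing research, needing verification), prints a
summary report, and saves the results as JSON.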
"""
import json
# Matches accepted after manual review.
# Each entry pairs a museum (name + city) with a Wikidata item (Q-number +
# label) and records the confidence level and the reason for acceptance.
VALIDATED_MATCHES = [
    {
        "museum_name": "Museo Histórico-Arqueológico",
        "city": "Quillota",
        "q_number": "Q12184920",
        "wikidata_name": "Museo Histórico - Arqueológico de Quillota",
        "confidence": "HIGH",
        "reason": "Perfect name and location match",
    },
    {
        "museum_name": "Museo Mapuche de Purén",
        "city": "Capitán Pastene",
        "q_number": "Q86282614",
        "wikidata_name": "Museo Mapuche de Purén",
        "confidence": "HIGH",
        "reason": "Exact name match, Capitán Pastene is in Purén commune",
    },
    {
        "museum_name": "Museo Pleistocénico",
        "city": "Osorno",
        "q_number": "Q112044601",
        "wikidata_name": "Museo del Pleistoceno de Osorno",
        "confidence": "HIGH",
        "reason": "Perfect name and location match",
    },
    {
        "museum_name": "Red de Museos Aysén",
        "city": "Coyhaique",
        "q_number": "Q53877849",
        "wikidata_name": "Museo Regional de Aysén",
        "confidence": "HIGH",
        "reason": "Regional museum network, Coyhaique location matches",
    },
    {
        "museum_name": "Museo Territorial Yagan Usi",
        "city": "Cabo de Hornos",
        "q_number": "Q6775581",
        "wikidata_name": "Museo Territorial Yagán Usi - Martín González Calderón",
        "confidence": "HIGH",
        "reason": "Exact name match, Puerto Williams is capital of Cabo de Hornos",
    },
]
# Candidate matches rejected during review (kept for documentation).
# Each entry names the museum, the Wikidata item that was proposed for it,
# and the reason the proposal was discarded.
REJECTED_MATCHES = [
    {
        "museum_name": "Museo de Tocopilla",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Virtual LGBT museum, completely different subject",
    },
    {
        "museum_name": "Museo Rodulfo Philippi",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo del Libro del Mar",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo de Historia Local Los Perales",
        "q_number": "Q6171788",
        "wikidata_name": "Zoológico de Quilpué",
        "reason": "Zoo, not a museum",
    },
    {
        "museum_name": "Museo de las Iglesias",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo Histórico Municipal",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
]
# Museums that still need manual research because no Wikidata entry was found.
# Each entry holds the museum name, its city and a free-text status note.
NEEDS_RESEARCH = [
    {
        "museum_name": "Museo de Tocopilla",
        "city": "María Elena / Tocopilla",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo Rodulfo Philippi",
        "city": "Chañaral",
        "status": "No Wikidata entry found (possibly confused with Rudolf Philippi museums elsewhere)",
    },
    {
        "museum_name": "Museo del Libro del Mar",
        "city": "San Antonio",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo de Historia Local Los Perales",
        "city": "Quilpué",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo de las Iglesias",
        "city": "Castro, Chiloé",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo Histórico Municipal",
        "city": "Puerto Natales, Última Esperanza",
        "status": "No Wikidata entry found",
    },
]
# Candidate matches that require further investigation.
# Each entry records the museum, the proposed Wikidata item, the issue found
# and a recommendation; only the first entry carries a "location" key.
NEEDS_VERIFICATION = [
    {
        "museum_name": "Museo Histórico y Cultural",
        "city": "Cauquenes",
        "q_number": "Q86281191",
        "wikidata_name": "Museo Histórico Cultural Antuhuenu",
        "location": "Nacimiento",
        "issue": "Location mismatch - Nacimiento is in Bío Bío, not Cauquenes in Maule",
        "recommendation": "REJECT - Different museum",
    },
    {
        "museum_name": "Museo Rudolph Philippi",
        "city": "Valdivia",
        "q_number": "Q86283174",
        "wikidata_name": "Mira Valdivia",
        "issue": "Name doesn't match - unclear if 'Mira Valdivia' is Rudolph Philippi museum",
        "recommendation": "REJECT - Needs manual Wikidata search for 'Museo Rudolph Philippi Valdivia'",
    },
]
def _percent(count: int, total: int) -> str:
    """Format ``count / total`` as a percentage string with one decimal place."""
    return f"{count / total * 100:.1f}%"


def main(
    current_coverage: int = 55,
    total_museums: int = 90,
    output_path: str = 'scripts/batch11_final_validation.json',
) -> None:
    """Print the Batch 11 validation report and save it as JSON.

    Lists every entry of the module-level ``VALIDATED_MATCHES``,
    ``REJECTED_MATCHES``, ``NEEDS_RESEARCH`` and ``NEEDS_VERIFICATION``
    lists, prints a coverage summary, and writes the four lists plus the
    summary to ``output_path``.

    Called with no arguments, the printed report is the same as before; the
    JSON summary additionally contains a ``needs_verification`` count so
    that all four lists are counted.

    Args:
        current_coverage: Coverage count before this batch's validated
            matches are added.
        total_museums: Denominator used for every coverage figure.
        output_path: Destination of the JSON report. A relative path is
            resolved against the current working directory.

    Raises:
        ValueError: If ``total_museums`` is not positive.
        OSError: If ``output_path`` cannot be opened for writing.
    """
    if total_museums <= 0:
        raise ValueError(f"total_museums must be positive, got {total_museums}")

    banner = "=" * 80

    print(banner)
    print("BATCH 11 FINAL VALIDATION")
    print(banner)

    print(f"\n✅ VALIDATED MATCHES: {len(VALIDATED_MATCHES)}")
    for match in VALIDATED_MATCHES:
        print(f"\n{match['museum_name']}")
        print(f"{match['wikidata_name']} ({match['q_number']})")
        print(f" Confidence: {match['confidence']}")
        print(f" Reason: {match['reason']}")

    print(f"\n\n❌ REJECTED MATCHES: {len(REJECTED_MATCHES)}")
    for match in REJECTED_MATCHES:
        print(f"\n{match['museum_name']}")
        print(f"{match['wikidata_name']} ({match['q_number']})")
        print(f" Reason: {match['reason']}")

    print(f"\n\n🔍 NEEDS MANUAL RESEARCH: {len(NEEDS_RESEARCH)}")
    for item in NEEDS_RESEARCH:
        print(f"{item['museum_name']} ({item['city']})")

    print(f"\n\n⚠️ NEEDS VERIFICATION: {len(NEEDS_VERIFICATION)}")
    for item in NEEDS_VERIFICATION:
        print(f"\n{item['museum_name']}")
        print(f"{item['wikidata_name']} ({item['q_number']})")
        print(f" Issue: {item['issue']}")
        print(f" Recommendation: {item['recommendation']}")

    # Summary: every figure is derived from the inputs, so the printed
    # percentages cannot drift from the counts they describe.
    validated_additions = len(VALIDATED_MATCHES)
    new_coverage = current_coverage + validated_additions
    current_label = f"{current_coverage}/{total_museums}"
    new_label = (
        f"{new_coverage}/{total_museums} "
        f"({_percent(new_coverage, total_museums)})"
    )

    print("\n" + banner)
    print("BATCH 11 SUMMARY")
    print(banner)
    print(
        f"Current coverage: {current_label} "
        f"({_percent(current_coverage, total_museums)})"
    )
    print(f"Validated matches: {validated_additions}")
    print(f"New coverage: {new_label}")
    print(f"Museums without Wikidata: {len(NEEDS_RESEARCH)}")

    # The milestone compares the unrounded ratio, so a value displayed as
    # "66.7%" does not count as having reached 67%.
    if new_coverage / total_museums >= 0.67:
        print("\n✅ Reached 67% coverage milestone!")

    # Save final results
    output = {
        'batch': 11,
        'validation_date': '2025-11-09',
        'validated_matches': VALIDATED_MATCHES,
        'rejected_matches': REJECTED_MATCHES,
        'needs_research': NEEDS_RESEARCH,
        'needs_verification': NEEDS_VERIFICATION,
        'summary': {
            'validated': len(VALIDATED_MATCHES),
            'rejected': len(REJECTED_MATCHES),
            'needs_research': len(NEEDS_RESEARCH),
            'needs_verification': len(NEEDS_VERIFICATION),
            'current_coverage': current_label,
            'new_coverage': new_label,
        },
    }

    # ensure_ascii=False keeps accented names readable in the saved file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Final validation saved to: {output_path}")
# Script entry point: generate the report only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()