- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
218 lines
7.1 KiB
Python
218 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Finalize Batch 11 Matches - Manual Validation
|
|
=============================================
|
|
|
|
After review, these are the VALIDATED matches from Batch 11
|
|
"""
|
|
|
|
import json
|
|
|
|
# Manually validated matches after review.
# Each entry pairs a Batch 11 museum ("museum_name", "city") with the Wikidata
# item recorded for it ("q_number", "wikidata_name"), together with the
# reviewer's "confidence" rating and a short "reason".
VALIDATED_MATCHES = [
    {
        "museum_name": "Museo Histórico-Arqueológico",
        "city": "Quillota",
        "q_number": "Q12184920",
        "wikidata_name": "Museo Histórico - Arqueológico de Quillota",
        "confidence": "HIGH",
        "reason": "Perfect name and location match",
    },
    {
        "museum_name": "Museo Mapuche de Purén",
        "city": "Capitán Pastene",
        "q_number": "Q86282614",
        "wikidata_name": "Museo Mapuche de Purén",
        "confidence": "HIGH",
        "reason": "Exact name match, Capitán Pastene is in Purén commune",
    },
    {
        "museum_name": "Museo Pleistocénico",
        "city": "Osorno",
        "q_number": "Q112044601",
        "wikidata_name": "Museo del Pleistoceno de Osorno",
        "confidence": "HIGH",
        "reason": "Perfect name and location match",
    },
    {
        "museum_name": "Red de Museos Aysén",
        "city": "Coyhaique",
        "q_number": "Q53877849",
        "wikidata_name": "Museo Regional de Aysén",
        "confidence": "HIGH",
        "reason": "Regional museum network, Coyhaique location matches",
    },
    {
        "museum_name": "Museo Territorial Yagan Usi",
        "city": "Cabo de Hornos",
        "q_number": "Q6775581",
        "wikidata_name": "Museo Territorial Yagán Usi - Martín González Calderón",
        "confidence": "HIGH",
        "reason": "Exact name match, Puerto Williams is capital of Cabo de Hornos",
    },
]
|
|
|
|
# Rejected matches (kept for documentation).
# Each entry records the museum, the Wikidata candidate that was proposed for
# it ("q_number", "wikidata_name") and the "reason" it was discarded.
REJECTED_MATCHES = [
    {
        "museum_name": "Museo de Tocopilla",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Virtual LGBT museum, completely different subject",
    },
    {
        "museum_name": "Museo Rodulfo Philippi",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo del Libro del Mar",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo de Historia Local Los Perales",
        "q_number": "Q6171788",
        "wikidata_name": "Zoológico de Quilpué",
        "reason": "Zoo, not a museum",
    },
    {
        "museum_name": "Museo de las Iglesias",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
    {
        "museum_name": "Museo Histórico Municipal",
        "q_number": "Q112135646",
        "wikidata_name": "Museo Di",
        "reason": "Same false positive",
    },
]
|
|
|
|
# Needs manual research (no Wikidata entry found).
# Each entry names the museum and its "city", with a free-text "status" note.
NEEDS_RESEARCH = [
    {
        "museum_name": "Museo de Tocopilla",
        "city": "María Elena / Tocopilla",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo Rodulfo Philippi",
        "city": "Chañaral",
        "status": "No Wikidata entry found (possibly confused with Rudolf Philippi museums elsewhere)",
    },
    {
        "museum_name": "Museo del Libro del Mar",
        "city": "San Antonio",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo de Historia Local Los Perales",
        "city": "Quilpué",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo de las Iglesias",
        "city": "Castro, Chiloé",
        "status": "No Wikidata entry found",
    },
    {
        "museum_name": "Museo Histórico Municipal",
        "city": "Puerto Natales, Última Esperanza",
        "status": "No Wikidata entry found",
    },
]
|
|
|
|
# Items requiring further investigation.
# Each entry carries the candidate Wikidata item, the "issue" found with it
# and the reviewer's "recommendation". Only the first entry has a "location"
# key (the place recorded for the Wikidata candidate), so read it with .get().
NEEDS_VERIFICATION = [
    {
        "museum_name": "Museo Histórico y Cultural",
        "city": "Cauquenes",
        "q_number": "Q86281191",
        "wikidata_name": "Museo Histórico Cultural Antuhuenu",
        "location": "Nacimiento",
        "issue": "Location mismatch - Nacimiento is in Bío Bío, not Cauquenes in Maule",
        "recommendation": "REJECT - Different museum",
    },
    {
        "museum_name": "Museo Rudolph Philippi",
        "city": "Valdivia",
        "q_number": "Q86283174",
        "wikidata_name": "Mira Valdivia",
        "issue": "Name doesn't match - unclear if 'Mira Valdivia' is Rudolph Philippi museum",
        "recommendation": "REJECT - Needs manual Wikidata search for 'Museo Rudolph Philippi Valdivia'",
    },
]
|
|
|
|
def main():
    """Print the Batch 11 validation report and save it as JSON.

    Lists the four module-level collections (validated, rejected, needs
    research, needs verification) on stdout, prints a coverage summary, and
    writes everything to ``scripts/batch11_final_validation.json``. The path
    is relative, so it resolves against the current working directory.
    """
    # Baseline figures for this batch. Percentages are derived from them
    # instead of being typed in by hand, so they cannot drift apart.
    total_museums = 90
    current_coverage = 55
    validated_additions = len(VALIDATED_MATCHES)
    new_coverage = current_coverage + validated_additions
    current_pct = current_coverage / total_museums * 100
    new_pct = new_coverage / total_museums * 100
    output_path = 'scripts/batch11_final_validation.json'

    print("=" * 80)
    print("BATCH 11 FINAL VALIDATION")
    print("=" * 80)

    print(f"\n✅ VALIDATED MATCHES: {len(VALIDATED_MATCHES)}")
    for match in VALIDATED_MATCHES:
        print(f"\n{match['museum_name']}")
        print(f" → {match['wikidata_name']} ({match['q_number']})")
        print(f" Confidence: {match['confidence']}")
        print(f" Reason: {match['reason']}")

    print(f"\n\n❌ REJECTED MATCHES: {len(REJECTED_MATCHES)}")
    for match in REJECTED_MATCHES:
        print(f"\n{match['museum_name']}")
        print(f" ✗ {match['wikidata_name']} ({match['q_number']})")
        print(f" Reason: {match['reason']}")

    print(f"\n\n🔍 NEEDS MANUAL RESEARCH: {len(NEEDS_RESEARCH)}")
    for item in NEEDS_RESEARCH:
        print(f" • {item['museum_name']} ({item['city']})")

    print(f"\n\n⚠️ NEEDS VERIFICATION: {len(NEEDS_VERIFICATION)}")
    for item in NEEDS_VERIFICATION:
        print(f"\n{item['museum_name']}")
        print(f" → {item['wikidata_name']} ({item['q_number']})")
        print(f" Issue: {item['issue']}")
        print(f" Recommendation: {item['recommendation']}")

    # Summary
    print("\n" + "=" * 80)
    print("BATCH 11 SUMMARY")
    print("=" * 80)
    print(f"Current coverage: {current_coverage}/{total_museums} ({current_pct:.1f}%)")
    print(f"Validated matches: {validated_additions}")
    print(f"New coverage: {new_coverage}/{total_museums} ({new_pct:.1f}%)")
    print(f"Museums without Wikidata: {len(NEEDS_RESEARCH)}")

    # The milestone is tested on the exact ratio, not the rounded percentage
    # shown above.
    if new_coverage / total_museums >= 0.67:
        print("\n✅ Reached 67% coverage milestone!")

    # Save final results
    output = {
        'batch': 11,
        'validation_date': '2025-11-09',
        'validated_matches': VALIDATED_MATCHES,
        'rejected_matches': REJECTED_MATCHES,
        'needs_research': NEEDS_RESEARCH,
        'needs_verification': NEEDS_VERIFICATION,
        'summary': {
            'validated': len(VALIDATED_MATCHES),
            'rejected': len(REJECTED_MATCHES),
            'needs_research': len(NEEDS_RESEARCH),
            'current_coverage': f"{current_coverage}/{total_museums}",
            'new_coverage': f"{new_coverage}/{total_museums} ({new_pct:.1f}%)",
        },
    }

    # ensure_ascii=False keeps the accented museum names readable in the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Final validation saved to: {output_path}")
|
|
# Run the report only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|