glam/scripts/analyze_web_validation_results.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

510 lines
23 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Analyze web validation results from Exa searches.
This script processes the Exa search results for v4 Dutch institutions
and generates a comprehensive validation report with verdicts and evidence.
"""
import json
from pathlib import Path
from datetime import datetime, timezone
def analyze_validation_results() -> list:
    """Return the hand-curated web validation verdicts for the v4 NL institutions.

    The records are a manual analysis of Exa search results and are embedded
    here as literals; nothing is fetched at call time.

    Returns:
        A list of 12 dicts, one per institution, each with the keys
        ``index``, ``name``, ``city``, ``country``, ``institution_type``,
        ``query``, ``verdict`` (``'VALID'``, ``'INVALID'`` or ``'UNCERTAIN'``),
        ``confidence`` (float), ``reason``, ``evidence`` (list of dicts with
        ``url``, ``title`` and ``note``) and ``validated_at`` (ISO 8601 UTC
        timestamp string).
    """
    # Take the timestamp once so every record of a single run carries the same
    # value instead of twelve stamps that differ by a few microseconds.
    validated_at = datetime.now(timezone.utc).isoformat()

    # Manual analysis of Exa search results
    validation_results = [
        {
            'index': 1,
            'name': 'Libraries, Archives, and Museum',
            'city': 'Overijssel',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Libraries, Archives, and Museum Overijssel Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Found Historisch Centrum Overijssel which manages archives, libraries, and museum collections',
            'evidence': [
                {
                    'url': 'https://collectieoverijssel.nl/',
                    'title': 'Collectie Overijssel » Met het heden je verleden in',
                    'note': 'Overijssel collections portal with archives, libraries, images, and building files',
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/List_of_archives_in_the_Netherlands',
                    'title': 'List of archives in the Netherlands - Wikipedia',
                    'note': 'Lists "Historisch Centrum Overijssel, Zwolle" as provincial archive',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 2,
            'name': 'Archives Limburg',
            'city': 'Dutch Limburg',
            'country': 'NL',
            'institution_type': 'ARCHIVE',
            'query': 'Archives Limburg Dutch Limburg Netherlands archive',
            'verdict': 'VALID',
            'confidence': 0.95,
            'reason': 'Found Historisch Centrum Limburg (HCL) - official provincial archive',
            'evidence': [
                {
                    'url': 'https://historischcentrumlimburg.nl/',
                    'title': 'Historisch Centrum Limburg',
                    'note': 'Official archive with locations in Maastricht and Heerlen, managing provincial archives',
                },
                {
                    'url': 'https://www.traceyourdutchroots.com/prov/lb.html',
                    'title': 'Genealogy in Limburg - Trace your Dutch roots',
                    'note': 'Confirms existence of Limburg archives for genealogical research',
                },
                {
                    'url': 'https://www.limburg.be/ontspannen-beleven/erfgoed/provinciaal-archief-limburg',
                    'title': 'Provinciaal Archief Limburg | limburg.be',
                    'note': 'Belgian Limburg also has provincial archive (different jurisdiction)',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 3,
            'name': 'Van Abbemuseum and Het Noordbrabants Museum',
            'city': 'North Brabant',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Van Abbemuseum and Het Noordbrabants Museum North Brabant Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.98,
            'reason': 'Two distinct, well-known museums in North Brabant (Eindhoven and Den Bosch)',
            'evidence': [
                {
                    'url': 'https://en.wikipedia.org/wiki/Van_Abbemuseum',
                    'title': 'Van Abbemuseum - Wikipedia',
                    'note': 'Modern art museum in Eindhoven with 2,700 works, founded 1936',
                },
                {
                    'url': 'https://whichmuseum.com/place/north-brabant-8856/art-museums',
                    'title': 'Art museums in North Brabant - Whichmuseum',
                    'note': 'Lists both Van Abbemuseum (Eindhoven) and Het Noordbrabants Museum (Den Bosch)',
                },
                {
                    'url': 'https://www.visitbrabant.com/en/locations/3565764542/the-van-abbemuseum',
                    'title': 'The Van Abbemuseum',
                    'note': 'Leading contemporary art museum in Europe with 3,000+ works',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 4,
            'name': 'UNESCO-recognized Archive',
            'city': 'Zeeland',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'UNESCO-recognized Archive Zeeland Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.90,
            'reason': 'Zeeuws Archief (Zeeland Archives) with UNESCO Memory of the World registration',
            'evidence': [
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/',
                    'title': 'Dutch flood defences archives UNESCO Memory of the World',
                    'note': 'Zeeuws Archief manages Deltadienst archives, UNESCO Memory of the World 2025',
                },
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/deltadienst-archives-world-heritage',
                    'title': 'Dutch flood defences archives world heritage Zeeuws Archief',
                    'note': 'Deltadienst archives inscribed in UNESCO register April 2025',
                },
                {
                    'url': 'https://www.hetzeeuwselandschap.nl/unesco-erfgoed-in-zeeland',
                    'title': 'UNESCO erfgoed in Zeeland',
                    'note': 'UNESCO Geopark Schelde Delta heritage sites in Zeeland',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 5,
            'name': 'Fries Archive',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Fries Archive Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Fries Museum in Leeuwarden (Friesland) - cultural history museum',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Museum with 220,000 objects covering Frisian culture and history',
                },
                {
                    'url': 'https://www.friesmuseum.nl/en/collection',
                    'title': 'Collection of the Fries Museum',
                    'note': 'Friesland museum containing archives of University of Franeker (1585-1843)',
                },
                {
                    'url': 'https://www.friesland.nl/en/locations/3673936931/fries-museum',
                    'title': 'Fries Museum',
                    'note': 'Cultural hotspot in Friesland with museum and resistance museum (Fries Verzetsmuseum)',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 6,
            'name': 'Frisian Archives',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Frisian Archives Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Same as Fries Archive/Museum - Frisian heritage institution',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Same institution as #5, possibly referring to museum archives',
                },
                {
                    'url': 'https://rebelsorbeggars.com/resources/academic/museums-and-archives/',
                    'title': 'Museums & Archives - Rebels or Beggars',
                    'note': 'Lists "Fries Archiefnet (NL): Provincial archives of Frisland and its cities"',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 7,
            'name': 'Archive Net',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Archive Net Netherlands Netherlands museum',
            'verdict': 'INVALID',
            'confidence': 0.1,
            'reason': 'Likely refers to platform/network (e.g., Oorlogsbronnen.nl), not a physical institution',
            'evidence': [
                {
                    'url': 'https://publicdomainreview.org/collections/source/rijksmuseum/',
                    'title': 'Rijksmuseum — Collections - The Public Domain Review',
                    'note': 'Found various Dutch archival networks and platforms, not a single institution',
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/Network_of_War_Collections',
                    'title': 'Network of War Collections - Wikipedia',
                    'note': 'Network of 250+ institutions (Netwerk Oorlogsbronnen) - not a single institution',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 8,
            'name': 'Library FabLab',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Library FabLab Netherlands Netherlands museum',
            'verdict': 'UNCERTAIN',
            'confidence': 0.4,
            'reason': 'FabLabs exist in Dutch libraries (makerspace concept), but not a specific institution',
            'evidence': [
                {
                    'url': 'https://www.bibliotheeknetwerk.nl/artikel/makerplaatsen-in-de-bibliotheek',
                    'title': 'Makerplaatsen in de bibliotheek',
                    'note': 'Dutch libraries have FabLabs (makerspaces), but not a named institution',
                },
                {
                    'url': 'https://waag.org/en/project/fablab-amsterdam/',
                    'title': 'FabLab Amsterdam',
                    'note': 'FabLab Amsterdam at Waag (not a library, not a museum - a fabrication lab)',
                },
                {
                    'url': 'https://www.fablabs.io/labs/waagsociety',
                    'title': 'Fablab Amsterdam',
                    'note': 'Waag FabLab is a prototyping workshop, not a heritage institution',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 9,
            'name': 'IFLA Library',
            'city': 'Contexts',
            'country': 'NL',
            'institution_type': 'LIBRARY',
            'query': 'IFLA Library Contexts Netherlands library',
            'verdict': 'INVALID',
            'confidence': 0.05,
            'reason': 'IFLA is an organization (International Federation of Library Associations), not a library',
            'evidence': [
                {
                    'url': 'https://librarymap.ifla.org/countries/Netherlands',
                    'title': 'Netherlands - IFLA Library Map of the World',
                    'note': 'IFLA is an international federation, not a physical library',
                },
                {
                    'url': 'https://library.ifla.org/1277/1/200-debeij-en.pdf',
                    'title': 'New legislation for public libraries in the Netherlands',
                    'note': 'IFLA publishes about Dutch libraries but is not itself a library',
                },
                {
                    'url': 'https://www.ifla.org/files/assets/public-libraries/publications/',
                    'title': 'Community building for public libraries in the Netherlands',
                    'note': 'IFLA documents Dutch library practices but is an advocacy organization',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 10,
            'name': 'Studies/Southeast Asian Studies) Leiden University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Studies/Southeast Asian Studies) Leiden University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.2,
            'reason': 'Academic department/programme, not a heritage custodian institution',
            'evidence': [
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/bachelor/south-and-southeast-asian-studies',
                    'title': 'South and Southeast Asian Studies (BA) - Leiden University',
                    'note': 'Academic study programme, not a museum/archive/library',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/humanities/institute-for-area-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies - Leiden University',
                    'note': 'Research institute, not a heritage institution',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/master/asian-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies (MA) (60EC) - Leiden University',
                    'note': 'Educational programme with access to Asian Library but not the institution itself',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 11,
            'name': 'University Malaysia',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'University Malaysia Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (Malaysia, not Netherlands)',
            'evidence': [
                {
                    'url': 'https://www.uva.nl/en/programmes/bachelors/sociology',
                    'title': "Bachelor's Sociology - University of Amsterdam",
                    'note': 'Search returned Dutch universities (UvA) but no "University Malaysia" in NL',
                },
                {
                    'url': 'https://www.topuniversities.com/universities/netherlands/sociology',
                    'title': 'Best Universities in Sociology in Netherlands',
                    'note': 'Lists Dutch universities, no Malaysian institution',
                },
                {
                    'url': 'https://www.uu.nl/en/organisation/sociology',
                    'title': 'Sociology - Utrecht University',
                    'note': 'Dutch sociology departments, but no Malaysian connection',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 12,
            'name': 'Sociology and Anthropology International Islamic University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Sociology and Anthropology International Islamic University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (International Islamic University is in Malaysia)',
            'evidence': [
                {
                    'url': 'https://kulliyyah.iium.edu.my/ahaskirkhs/phd-in-sociology-anthropology/',
                    'title': 'PhD in Sociology & Anthropology - IIUM Directory',
                    'note': 'International Islamic University Malaysia (IIUM) - not in Netherlands',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/social-behavioural-sciences/cultural-anthropology-and-development-sociology',
                    'title': 'Cultural Anthropology and Development Sociology | Leiden',
                    'note': 'Dutch university with anthropology/sociology, but not the Islamic university',
                },
                {
                    'url': 'https://www.educations.com/sociology/netherlands',
                    'title': '10 Sociology Degree Programs in Netherlands',
                    'note': 'Lists Dutch sociology programmes, no Islamic university mentioned',
                },
            ],
            'validated_at': validated_at,
        },
    ]
    return validation_results
def generate_report(validation_results: list, output_file: Path) -> str:
    """Render the validation results as a plain-text report and save it.

    Args:
        validation_results: List of record dicts. Each record is read for the
            keys ``index``, ``name``, ``institution_type``, ``city``,
            ``country``, ``verdict``, ``confidence``, ``reason`` and
            ``evidence`` (a list of dicts with ``title``, ``url`` and an
            optional ``note``). May be empty.
        output_file: Path the report is written to (UTF-8, overwritten).

    Returns:
        The full report text, identical to what was written to ``output_file``.

    Note:
        The "COMPARISON WITH ISIL VALIDATION" baseline figures and the
        "KEY FINDINGS" lists are fixed text; they are not derived from
        ``validation_results``.
    """
    # Calculate statistics
    total = len(validation_results)
    verdicts = [r['verdict'] for r in validation_results]
    valid = verdicts.count('VALID')
    invalid = verdicts.count('INVALID')
    uncertain = verdicts.count('UNCERTAIN')
    avg_confidence = sum(r['confidence'] for r in validation_results) / total if total > 0 else 0
    precision = valid / total if total > 0 else 0

    def percent(count: int) -> float:
        # Guarded like avg_confidence/precision so an empty result list
        # produces a report instead of raising ZeroDivisionError.
        return count / total * 100 if total > 0 else 0.0

    rule = "=" * 80

    # Header and summary
    report = [
        rule,
        "V4 DUTCH INSTITUTIONS - WEB VALIDATION REPORT",
        rule,
        "",
        f"Validation Date: {datetime.now(timezone.utc).isoformat()}",
        f"Total Institutions: {total}",
        "Method: Exa web search + intelligence-based validation",
        "",
        rule,
        "SUMMARY STATISTICS",
        rule,
        f"Valid: {valid:2d} ({percent(valid):5.1f}%)",
        f"Invalid: {invalid:2d} ({percent(invalid):5.1f}%)",
        f"Uncertain: {uncertain:2d} ({percent(uncertain):5.1f}%)",
        f"Average Confidence: {avg_confidence:.2f}",
        "",
        f"Precision (web-validated): {precision*100:.1f}%",
        "",
        rule,
        "DETAILED RESULTS",
        rule,
        "",
    ]

    # One section per institution, followed by a blank separator line
    for result in validation_results:
        report.append(f"{result['index']}. {result['name']}")
        report.append(f" Type: {result['institution_type']}")
        report.append(f" Location: {result['city']}, {result['country']}")
        report.append(f" Verdict: {result['verdict']}")
        report.append(f" Confidence: {result['confidence']:.2f}")
        report.append(f" Reason: {result['reason']}")
        if result['evidence']:
            report.append(f" Evidence ({len(result['evidence'])} sources):")
            for i, ev in enumerate(result['evidence'], 1):
                report.append(f" {i}. {ev['title']}")
                report.append(f" URL: {ev['url']}")
                if 'note' in ev:
                    report.append(f" Note: {ev['note']}")
        else:
            report.append(" Evidence: None found")
        report.append("")

    report.extend([
        rule,
        "COMPARISON WITH ISIL VALIDATION",
        rule,
        "",
        "Previous validation (ISIL registry matching):",
        " - NL institutions: 58 (v3) → 12 (v4)",
        " - ISIL matches: 0",
        " - Precision (ISIL-based): 8.3%",
        "",
        "This validation (web-based):",
        # Use the actual record count rather than a hard-coded 12.
        f" - NL institutions: {total} (v4)",
        f" - Web-validated matches: {valid}",
        f" - Precision (web-based): {precision*100:.1f}%",
        "",
        rule,
        "KEY FINDINGS",
        rule,
        "",
        # Hand-written findings: fixed text, not computed from the input.
        "VALID INSTITUTIONS (6):",
        " 1. Historisch Centrum Overijssel - Provincial archive",
        " 2. Historisch Centrum Limburg (HCL) - Provincial archive",
        " 3. Van Abbemuseum - Modern art museum, Eindhoven",
        " 4. Het Noordbrabants Museum - Provincial museum, Den Bosch",
        " 5. Zeeuws Archief - Provincial archive with UNESCO registration",
        " 6. Fries Museum - Friesland cultural museum with archives",
        "",
        "INVALID INSTITUTIONS (5):",
        " 1. Archive Net - Network/platform, not an institution",
        " 2. IFLA Library - International organization, not a library",
        " 3. Studies/Southeast Asian Studies Leiden - Academic department",
        " 4. University Malaysia - Wrong country",
        " 5. Islamic University Sociology/Anthropology - Wrong country",
        "",
        "UNCERTAIN (1):",
        " 1. Library FabLab - Concept/service, not specific institution",
        "",
        rule,
        "CONCLUSION",
        rule,
        "",
        "Web-based validation provides a more accurate assessment of extraction",
        "quality than ISIL registry matching:",
        "",
        f" - Web validation precision: {precision*100:.1f}%",
        " - ISIL validation precision: 8.3% (misleading)",
        "",
        # Derived from the computed precision so it cannot contradict the
        # "Web validation precision" line above.
        f"The {precision*100:.0f}% precision rate reveals significant extraction errors:",
        " - Wrong country extractions (Malaysia institutions)",
        " - Organizations confused with institutions (IFLA)",
        " - Platforms/networks misidentified as institutions",
        " - Academic departments extracted as heritage custodians",
        "",
        "This validates the need for improved extraction rules and",
        "better distinction between:",
        " 1. Physical heritage institutions vs. organizations",
        " 2. Individual institutions vs. networks/platforms",
        " 3. Academic departments vs. heritage custodians",
        " 4. Dutch institutions vs. foreign institutions",
        "",
    ])

    # Save report
    report_text = '\n'.join(report)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report_text)
    return report_text
def main():
    """Produce the JSON dump and the text report for the web validation run.

    Both files are written to an ``output`` directory located in the parent
    of this script's directory (created if missing); the report text is also
    printed to stdout.
    """
    results_dir = Path(__file__).parent.parent / 'output'
    results_dir.mkdir(exist_ok=True)

    print("Analyzing web validation results...")
    records = analyze_validation_results()

    # Machine-readable copy of the records
    json_path = results_dir / 'web_validation_results.json'
    serialized = json.dumps(records, indent=2, ensure_ascii=False)
    json_path.write_text(serialized, encoding='utf-8')
    print(f"Saved JSON results to: {json_path}")

    # Human-readable report, saved and echoed
    report_path = results_dir / 'web_validation_report.txt'
    rendered = generate_report(records, report_path)
    print(f"Saved report to: {report_path}")
    print()
    print(rendered)


if __name__ == '__main__':
    main()