- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
510 lines
23 KiB
Python
510 lines
23 KiB
Python
"""
|
||
Analyze web validation results from Exa searches.
|
||
|
||
This script processes the Exa search results for v4 Dutch institutions
|
||
and generates a comprehensive validation report with verdicts and evidence.
|
||
"""
|
||
|
||
import json
|
||
from pathlib import Path
|
||
from datetime import datetime, timezone
|
||
|
||
|
||
def analyze_validation_results():
    """Return the manually analysed web validation records.

    The records are hard-coded: each one captures the outcome of a manual
    review of the Exa search results for one extracted institution.

    Returns:
        A list of 12 dicts, one per institution, each with the keys
        'index', 'name', 'city', 'country', 'institution_type', 'query',
        'verdict' ('VALID', 'INVALID' or 'UNCERTAIN'), 'confidence',
        'reason', 'evidence' (a list of dicts with 'url', 'title' and
        'note') and 'validated_at' (ISO-8601 UTC timestamp).
    """
    # Take the timestamp once so every record in this batch carries the same
    # 'validated_at' value instead of twelve marginally different ones.
    validated_at = datetime.now(timezone.utc).isoformat()

    # Manual analysis of Exa search results
    validation_results = [
        {
            'index': 1,
            'name': 'Libraries, Archives, and Museum',
            'city': 'Overijssel',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Libraries, Archives, and Museum Overijssel Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Found Historisch Centrum Overijssel which manages archives, libraries, and museum collections',
            'evidence': [
                {
                    'url': 'https://collectieoverijssel.nl/',
                    'title': 'Collectie Overijssel » Met het heden je verleden in',
                    'note': 'Overijssel collections portal with archives, libraries, images, and building files'
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/List_of_archives_in_the_Netherlands',
                    'title': 'List of archives in the Netherlands - Wikipedia',
                    'note': 'Lists "Historisch Centrum Overijssel, Zwolle" as provincial archive'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 2,
            'name': 'Archives Limburg',
            'city': 'Dutch Limburg',
            'country': 'NL',
            'institution_type': 'ARCHIVE',
            'query': 'Archives Limburg Dutch Limburg Netherlands archive',
            'verdict': 'VALID',
            'confidence': 0.95,
            'reason': 'Found Historisch Centrum Limburg (HCL) - official provincial archive',
            'evidence': [
                {
                    'url': 'https://historischcentrumlimburg.nl/',
                    'title': 'Historisch Centrum Limburg',
                    'note': 'Official archive with locations in Maastricht and Heerlen, managing provincial archives'
                },
                {
                    'url': 'https://www.traceyourdutchroots.com/prov/lb.html',
                    'title': 'Genealogy in Limburg - Trace your Dutch roots',
                    'note': 'Confirms existence of Limburg archives for genealogical research'
                },
                {
                    'url': 'https://www.limburg.be/ontspannen-beleven/erfgoed/provinciaal-archief-limburg',
                    'title': 'Provinciaal Archief Limburg | limburg.be',
                    'note': 'Belgian Limburg also has provincial archive (different jurisdiction)'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 3,
            'name': 'Van Abbemuseum and Het Noordbrabants Museum',
            'city': 'North Brabant',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Van Abbemuseum and Het Noordbrabants Museum North Brabant Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.98,
            'reason': 'Two distinct, well-known museums in North Brabant (Eindhoven and Den Bosch)',
            'evidence': [
                {
                    'url': 'https://en.wikipedia.org/wiki/Van_Abbemuseum',
                    'title': 'Van Abbemuseum - Wikipedia',
                    'note': 'Modern art museum in Eindhoven with 2,700 works, founded 1936'
                },
                {
                    'url': 'https://whichmuseum.com/place/north-brabant-8856/art-museums',
                    'title': 'Art museums in North Brabant - Whichmuseum',
                    'note': 'Lists both Van Abbemuseum (Eindhoven) and Het Noordbrabants Museum (Den Bosch)'
                },
                {
                    'url': 'https://www.visitbrabant.com/en/locations/3565764542/the-van-abbemuseum',
                    'title': 'The Van Abbemuseum',
                    'note': 'Leading contemporary art museum in Europe with 3,000+ works'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 4,
            'name': 'UNESCO-recognized Archive',
            'city': 'Zeeland',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'UNESCO-recognized Archive Zeeland Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.90,
            'reason': 'Zeeuws Archief (Zeeland Archives) with UNESCO Memory of the World registration',
            'evidence': [
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/',
                    'title': 'Dutch flood defences archives UNESCO Memory of the World',
                    'note': 'Zeeuws Archief manages Deltadienst archives, UNESCO Memory of the World 2025'
                },
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/deltadienst-archives-world-heritage',
                    'title': 'Dutch flood defences archives world heritage – Zeeuws Archief',
                    'note': 'Deltadienst archives inscribed in UNESCO register April 2025'
                },
                {
                    'url': 'https://www.hetzeeuwselandschap.nl/unesco-erfgoed-in-zeeland',
                    'title': 'UNESCO erfgoed in Zeeland',
                    'note': 'UNESCO Geopark Schelde Delta heritage sites in Zeeland'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 5,
            'name': 'Fries Archive',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Fries Archive Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Fries Museum in Leeuwarden (Friesland) - cultural history museum',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Museum with 220,000 objects covering Frisian culture and history'
                },
                {
                    'url': 'https://www.friesmuseum.nl/en/collection',
                    'title': 'Collection of the Fries Museum',
                    'note': 'Friesland museum containing archives of University of Franeker (1585-1843)'
                },
                {
                    'url': 'https://www.friesland.nl/en/locations/3673936931/fries-museum',
                    'title': 'Fries Museum',
                    'note': 'Cultural hotspot in Friesland with museum and resistance museum (Fries Verzetsmuseum)'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 6,
            'name': 'Frisian Archives',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Frisian Archives Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Same as Fries Archive/Museum - Frisian heritage institution',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Same institution as #5, possibly referring to museum archives'
                },
                {
                    'url': 'https://rebelsorbeggars.com/resources/academic/museums-and-archives/',
                    'title': 'Museums & Archives - Rebels or Beggars',
                    'note': 'Lists "Fries Archiefnet (NL): Provincial archives of Frisland and its cities"'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 7,
            'name': 'Archive Net',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Archive Net Netherlands Netherlands museum',
            'verdict': 'INVALID',
            'confidence': 0.1,
            'reason': 'Likely refers to platform/network (e.g., Oorlogsbronnen.nl), not a physical institution',
            'evidence': [
                {
                    'url': 'https://publicdomainreview.org/collections/source/rijksmuseum/',
                    'title': 'Rijksmuseum — Collections - The Public Domain Review',
                    'note': 'Found various Dutch archival networks and platforms, not a single institution'
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/Network_of_War_Collections',
                    'title': 'Network of War Collections - Wikipedia',
                    'note': 'Network of 250+ institutions (Netwerk Oorlogsbronnen) - not a single institution'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 8,
            'name': 'Library FabLab',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Library FabLab Netherlands Netherlands museum',
            'verdict': 'UNCERTAIN',
            'confidence': 0.4,
            'reason': 'FabLabs exist in Dutch libraries (makerspace concept), but not a specific institution',
            'evidence': [
                {
                    'url': 'https://www.bibliotheeknetwerk.nl/artikel/makerplaatsen-in-de-bibliotheek',
                    'title': 'Makerplaatsen in de bibliotheek',
                    'note': 'Dutch libraries have FabLabs (makerspaces), but not a named institution'
                },
                {
                    'url': 'https://waag.org/en/project/fablab-amsterdam/',
                    'title': 'FabLab Amsterdam',
                    'note': 'FabLab Amsterdam at Waag (not a library, not a museum - a fabrication lab)'
                },
                {
                    'url': 'https://www.fablabs.io/labs/waagsociety',
                    'title': 'Fablab Amsterdam',
                    'note': 'Waag FabLab is a prototyping workshop, not a heritage institution'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 9,
            'name': 'IFLA Library',
            'city': 'Contexts',
            'country': 'NL',
            'institution_type': 'LIBRARY',
            'query': 'IFLA Library Contexts Netherlands library',
            'verdict': 'INVALID',
            'confidence': 0.05,
            'reason': 'IFLA is an organization (International Federation of Library Associations), not a library',
            'evidence': [
                {
                    'url': 'https://librarymap.ifla.org/countries/Netherlands',
                    'title': 'Netherlands - IFLA Library Map of the World',
                    'note': 'IFLA is an international federation, not a physical library'
                },
                {
                    'url': 'https://library.ifla.org/1277/1/200-debeij-en.pdf',
                    'title': 'New legislation for public libraries in the Netherlands',
                    'note': 'IFLA publishes about Dutch libraries but is not itself a library'
                },
                {
                    'url': 'https://www.ifla.org/files/assets/public-libraries/publications/',
                    'title': 'Community building for public libraries in the Netherlands',
                    'note': 'IFLA documents Dutch library practices but is an advocacy organization'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 10,
            'name': 'Studies/Southeast Asian Studies) Leiden University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Studies/Southeast Asian Studies) Leiden University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.2,
            'reason': 'Academic department/programme, not a heritage custodian institution',
            'evidence': [
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/bachelor/south-and-southeast-asian-studies',
                    'title': 'South and Southeast Asian Studies (BA) - Leiden University',
                    'note': 'Academic study programme, not a museum/archive/library'
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/humanities/institute-for-area-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies - Leiden University',
                    'note': 'Research institute, not a heritage institution'
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/master/asian-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies (MA) (60EC) - Leiden University',
                    'note': 'Educational programme with access to Asian Library but not the institution itself'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 11,
            'name': 'University Malaysia',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'University Malaysia Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (Malaysia, not Netherlands)',
            'evidence': [
                {
                    'url': 'https://www.uva.nl/en/programmes/bachelors/sociology',
                    'title': "Bachelor's Sociology - University of Amsterdam",
                    'note': 'Search returned Dutch universities (UvA) but no "University Malaysia" in NL'
                },
                {
                    'url': 'https://www.topuniversities.com/universities/netherlands/sociology',
                    'title': 'Best Universities in Sociology in Netherlands',
                    'note': 'Lists Dutch universities, no Malaysian institution'
                },
                {
                    'url': 'https://www.uu.nl/en/organisation/sociology',
                    'title': 'Sociology - Utrecht University',
                    'note': 'Dutch sociology departments, but no Malaysian connection'
                }
            ],
            'validated_at': validated_at
        },
        {
            'index': 12,
            'name': 'Sociology and Anthropology International Islamic University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Sociology and Anthropology International Islamic University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (International Islamic University is in Malaysia)',
            'evidence': [
                {
                    'url': 'https://kulliyyah.iium.edu.my/ahaskirkhs/phd-in-sociology-anthropology/',
                    'title': 'PhD in Sociology & Anthropology - IIUM Directory',
                    'note': 'International Islamic University Malaysia (IIUM) - not in Netherlands'
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/social-behavioural-sciences/cultural-anthropology-and-development-sociology',
                    'title': 'Cultural Anthropology and Development Sociology | Leiden',
                    'note': 'Dutch university with anthropology/sociology, but not the Islamic university'
                },
                {
                    'url': 'https://www.educations.com/sociology/netherlands',
                    'title': '10 Sociology Degree Programs in Netherlands',
                    'note': 'Lists Dutch sociology programmes, no Islamic university mentioned'
                }
            ],
            'validated_at': validated_at
        }
    ]

    return validation_results
|
||
|
||
|
||
def generate_report(validation_results: list, output_file: Path):
    """Build the plain-text validation report, write it to disk and return it.

    Args:
        validation_results: Validation records. Each record is read for the
            keys 'index', 'name', 'institution_type', 'city', 'country',
            'verdict', 'confidence', 'reason' and 'evidence' (a list of
            dicts with 'title', 'url' and an optional 'note').
        output_file: Destination of the report; written as UTF-8 and
            overwritten if it already exists.

    Returns:
        The complete report text, lines joined with newlines.
    """
    # Calculate statistics
    total = len(validation_results)
    valid = sum(1 for r in validation_results if r['verdict'] == 'VALID')
    invalid = sum(1 for r in validation_results if r['verdict'] == 'INVALID')
    uncertain = sum(1 for r in validation_results if r['verdict'] == 'UNCERTAIN')

    def share(count):
        # Percentage of all records; an empty result set yields 0.0 rather
        # than raising ZeroDivisionError.
        return count / total * 100 if total > 0 else 0.0

    avg_confidence = sum(r['confidence'] for r in validation_results) / total if total > 0 else 0
    precision = valid / total if total > 0 else 0

    rule = "=" * 80

    # Generate report
    report = [
        rule,
        "V4 DUTCH INSTITUTIONS - WEB VALIDATION REPORT",
        rule,
        "",
        f"Validation Date: {datetime.now(timezone.utc).isoformat()}",
        f"Total Institutions: {total}",
        "Method: Exa web search + intelligence-based validation",
        "",
        rule,
        "SUMMARY STATISTICS",
        rule,
        f"Valid: {valid:2d} ({share(valid):5.1f}%)",
        f"Invalid: {invalid:2d} ({share(invalid):5.1f}%)",
        f"Uncertain: {uncertain:2d} ({share(uncertain):5.1f}%)",
        f"Average Confidence: {avg_confidence:.2f}",
        "",
        f"Precision (web-validated): {precision*100:.1f}%",
        "",
        rule,
        "DETAILED RESULTS",
        rule,
        "",
    ]

    for result in validation_results:
        report.append(f"{result['index']}. {result['name']}")
        report.append(f" Type: {result['institution_type']}")
        report.append(f" Location: {result['city']}, {result['country']}")
        report.append(f" Verdict: {result['verdict']}")
        report.append(f" Confidence: {result['confidence']:.2f}")
        report.append(f" Reason: {result['reason']}")

        if result['evidence']:
            report.append(f" Evidence ({len(result['evidence'])} sources):")
            for i, ev in enumerate(result['evidence'], 1):
                report.append(f" {i}. {ev['title']}")
                report.append(f" URL: {ev['url']}")
                if 'note' in ev:
                    report.append(f" Note: {ev['note']}")
        else:
            report.append(" Evidence: None found")

        report.append("")

    report.extend([
        rule,
        "COMPARISON WITH ISIL VALIDATION",
        rule,
        "",
        # Figures of the earlier ISIL-based run are fixed text, not computed.
        "Previous validation (ISIL registry matching):",
        " - NL institutions: 58 (v3) → 12 (v4)",
        " - ISIL matches: 0",
        " - Precision (ISIL-based): 8.3%",
        "",
        "This validation (web-based):",
        f" - NL institutions: {total} (v4)",
        f" - Web-validated matches: {valid}",
        f" - Precision (web-based): {precision*100:.1f}%",
        "",
    ])

    # The key findings below are a hand-written summary; they are NOT derived
    # from validation_results and must be edited by hand when the data changes.
    report.extend([
        rule,
        "KEY FINDINGS",
        rule,
        "",
        "VALID INSTITUTIONS (6):",
        " 1. Historisch Centrum Overijssel - Provincial archive",
        " 2. Historisch Centrum Limburg (HCL) - Provincial archive",
        " 3. Van Abbemuseum - Modern art museum, Eindhoven",
        " 4. Het Noordbrabants Museum - Provincial museum, Den Bosch",
        " 5. Zeeuws Archief - Provincial archive with UNESCO registration",
        " 6. Fries Museum - Friesland cultural museum with archives",
        "",
        "INVALID INSTITUTIONS (5):",
        " 1. Archive Net - Network/platform, not an institution",
        " 2. IFLA Library - International organization, not a library",
        " 3. Studies/Southeast Asian Studies Leiden - Academic department",
        " 4. University Malaysia - Wrong country",
        " 5. Islamic University Sociology/Anthropology - Wrong country",
        "",
        "UNCERTAIN (1):",
        " 1. Library FabLab - Concept/service, not specific institution",
        "",
    ])

    report.extend([
        rule,
        "CONCLUSION",
        rule,
        "",
        "Web-based validation provides a more accurate assessment of extraction",
        "quality than ISIL registry matching:",
        "",
        f" - Web validation precision: {precision*100:.1f}%",
        " - ISIL validation precision: 8.3% (misleading)",
        "",
        # Use the computed precision so this sentence cannot drift from the
        # statistics printed above.
        f"The {precision*100:.0f}% precision rate reveals significant extraction errors:",
        " - Wrong country extractions (Malaysia institutions)",
        " - Organizations confused with institutions (IFLA)",
        " - Platforms/networks misidentified as institutions",
        " - Academic departments extracted as heritage custodians",
        "",
        "This validates the need for improved extraction rules and",
        "better distinction between:",
        " 1. Physical heritage institutions vs. organizations",
        " 2. Individual institutions vs. networks/platforms",
        " 3. Academic departments vs. heritage custodians",
        " 4. Dutch institutions vs. foreign institutions",
        "",
    ])

    # Save report
    report_text = '\n'.join(report)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report_text)

    return report_text
|
||
|
||
|
||
def main():
    """Run the analysis and write the JSON results and text report.

    Both files go into an 'output' directory located two levels above this
    script file; the directory is created when missing. The rendered report
    is also printed to stdout.
    """
    out_dir = Path(__file__).parent.parent / 'output'
    out_dir.mkdir(exist_ok=True)

    print("Analyzing web validation results...")
    results = analyze_validation_results()

    # Machine-readable copy of the records.
    json_path = out_dir / 'web_validation_results.json'
    serialized = json.dumps(results, indent=2, ensure_ascii=False)
    with json_path.open('w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f"Saved JSON results to: {json_path}")

    # Human-readable report; generate_report writes the file itself.
    report_path = out_dir / 'web_validation_report.txt'
    rendered = generate_report(results, report_path)
    print(f"Saved report to: {report_path}")
    print()
    print(rendered)


if __name__ == '__main__':
    main()
|