"""
Analyze web validation results from Exa searches.

This script processes the Exa search results for v4 Dutch institutions
and generates a comprehensive validation report with verdicts and evidence.
"""

import json
from collections import Counter
from pathlib import Path
from datetime import datetime, timezone

# Horizontal rule used above and below every section title in the report.
_RULE = "=" * 80


def _banner(title):
    """Return the three lines of a section banner: rule, title, rule."""
    return [_RULE, title, _RULE]


def _percent(count, total):
    """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
    return count / total * 100 if total else 0.0


def analyze_validation_results():
    """Return the manually analysed Exa search verdicts.

    Returns:
        A list of dicts, one per candidate institution, each with the keys
        ``index``, ``name``, ``city``, ``country``, ``institution_type``,
        ``query``, ``verdict`` (``'VALID'``, ``'INVALID'`` or
        ``'UNCERTAIN'``), ``confidence`` (float), ``reason``, ``evidence``
        (list of dicts with ``url``, ``title`` and ``note``) and
        ``validated_at`` (ISO-8601 UTC timestamp string).
    """
    # One timestamp per call so every record of a run carries the same
    # ``validated_at`` value instead of drifting by microseconds.
    validated_at = datetime.now(timezone.utc).isoformat()

    # Manual analysis of Exa search results
    validation_results = [
        {
            'index': 1,
            'name': 'Libraries, Archives, and Museum',
            'city': 'Overijssel',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Libraries, Archives, and Museum Overijssel Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Found Historisch Centrum Overijssel which manages archives, libraries, and museum collections',
            'evidence': [
                {
                    'url': 'https://collectieoverijssel.nl/',
                    'title': 'Collectie Overijssel » Met het heden je verleden in',
                    'note': 'Overijssel collections portal with archives, libraries, images, and building files',
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/List_of_archives_in_the_Netherlands',
                    'title': 'List of archives in the Netherlands - Wikipedia',
                    'note': 'Lists "Historisch Centrum Overijssel, Zwolle" as provincial archive',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 2,
            'name': 'Archives Limburg',
            'city': 'Dutch Limburg',
            'country': 'NL',
            'institution_type': 'ARCHIVE',
            'query': 'Archives Limburg Dutch Limburg Netherlands archive',
            'verdict': 'VALID',
            'confidence': 0.95,
            'reason': 'Found Historisch Centrum Limburg (HCL) - official provincial archive',
            'evidence': [
                {
                    'url': 'https://historischcentrumlimburg.nl/',
                    'title': 'Historisch Centrum Limburg',
                    'note': 'Official archive with locations in Maastricht and Heerlen, managing provincial archives',
                },
                {
                    'url': 'https://www.traceyourdutchroots.com/prov/lb.html',
                    'title': 'Genealogy in Limburg - Trace your Dutch roots',
                    'note': 'Confirms existence of Limburg archives for genealogical research',
                },
                {
                    'url': 'https://www.limburg.be/ontspannen-beleven/erfgoed/provinciaal-archief-limburg',
                    'title': 'Provinciaal Archief Limburg | limburg.be',
                    'note': 'Belgian Limburg also has provincial archive (different jurisdiction)',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 3,
            'name': 'Van Abbemuseum and Het Noordbrabants Museum',
            'city': 'North Brabant',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Van Abbemuseum and Het Noordbrabants Museum North Brabant Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.98,
            'reason': 'Two distinct, well-known museums in North Brabant (Eindhoven and Den Bosch)',
            'evidence': [
                {
                    'url': 'https://en.wikipedia.org/wiki/Van_Abbemuseum',
                    'title': 'Van Abbemuseum - Wikipedia',
                    'note': 'Modern art museum in Eindhoven with 2,700 works, founded 1936',
                },
                {
                    'url': 'https://whichmuseum.com/place/north-brabant-8856/art-museums',
                    'title': 'Art museums in North Brabant - Whichmuseum',
                    'note': 'Lists both Van Abbemuseum (Eindhoven) and Het Noordbrabants Museum (Den Bosch)',
                },
                {
                    'url': 'https://www.visitbrabant.com/en/locations/3565764542/the-van-abbemuseum',
                    'title': 'The Van Abbemuseum',
                    'note': 'Leading contemporary art museum in Europe with 3,000+ works',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 4,
            'name': 'UNESCO-recognized Archive',
            'city': 'Zeeland',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'UNESCO-recognized Archive Zeeland Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.90,
            'reason': 'Zeeuws Archief (Zeeland Archives) with UNESCO Memory of the World registration',
            'evidence': [
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/',
                    'title': 'Dutch flood defences archives UNESCO Memory of the World',
                    'note': 'Zeeuws Archief manages Deltadienst archives, UNESCO Memory of the World 2025',
                },
                {
                    'url': 'https://www.zeeuwsarchief.nl/en/deltadienst-archives-world-heritage',
                    'title': 'Dutch flood defences archives world heritage – Zeeuws Archief',
                    'note': 'Deltadienst archives inscribed in UNESCO register April 2025',
                },
                {
                    'url': 'https://www.hetzeeuwselandschap.nl/unesco-erfgoed-in-zeeland',
                    'title': 'UNESCO erfgoed in Zeeland',
                    'note': 'UNESCO Geopark Schelde Delta heritage sites in Zeeland',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 5,
            'name': 'Fries Archive',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Fries Archive Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Fries Museum in Leeuwarden (Friesland) - cultural history museum',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Museum with 220,000 objects covering Frisian culture and history',
                },
                {
                    'url': 'https://www.friesmuseum.nl/en/collection',
                    'title': 'Collection of the Fries Museum',
                    'note': 'Friesland museum containing archives of University of Franeker (1585-1843)',
                },
                {
                    'url': 'https://www.friesland.nl/en/locations/3673936931/fries-museum',
                    'title': 'Fries Museum',
                    'note': 'Cultural hotspot in Friesland with museum and resistance museum (Fries Verzetsmuseum)',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 6,
            'name': 'Frisian Archives',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Frisian Archives Netherlands Netherlands museum',
            'verdict': 'VALID',
            'confidence': 0.85,
            'reason': 'Same as Fries Archive/Museum - Frisian heritage institution',
            'evidence': [
                {
                    'url': 'https://www.friesmuseum.nl/en',
                    'title': 'Fries Museum, the museum of Friesland in Leeuwarden',
                    'note': 'Same institution as #5, possibly referring to museum archives',
                },
                {
                    'url': 'https://rebelsorbeggars.com/resources/academic/museums-and-archives/',
                    'title': 'Museums & Archives - Rebels or Beggars',
                    'note': 'Lists "Fries Archiefnet (NL): Provincial archives of Frisland and its cities"',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 7,
            'name': 'Archive Net',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Archive Net Netherlands Netherlands museum',
            'verdict': 'INVALID',
            'confidence': 0.1,
            'reason': 'Likely refers to platform/network (e.g., Oorlogsbronnen.nl), not a physical institution',
            'evidence': [
                {
                    'url': 'https://publicdomainreview.org/collections/source/rijksmuseum/',
                    'title': 'Rijksmuseum — Collections - The Public Domain Review',
                    'note': 'Found various Dutch archival networks and platforms, not a single institution',
                },
                {
                    'url': 'https://en.wikipedia.org/wiki/Network_of_War_Collections',
                    'title': 'Network of War Collections - Wikipedia',
                    'note': 'Network of 250+ institutions (Netwerk Oorlogsbronnen) - not a single institution',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 8,
            'name': 'Library FabLab',
            'city': 'Netherlands',
            'country': 'NL',
            'institution_type': 'MUSEUM',
            'query': 'Library FabLab Netherlands Netherlands museum',
            'verdict': 'UNCERTAIN',
            'confidence': 0.4,
            'reason': 'FabLabs exist in Dutch libraries (makerspace concept), but not a specific institution',
            'evidence': [
                {
                    'url': 'https://www.bibliotheeknetwerk.nl/artikel/makerplaatsen-in-de-bibliotheek',
                    'title': 'Makerplaatsen in de bibliotheek',
                    'note': 'Dutch libraries have FabLabs (makerspaces), but not a named institution',
                },
                {
                    'url': 'https://waag.org/en/project/fablab-amsterdam/',
                    'title': 'FabLab Amsterdam',
                    'note': 'FabLab Amsterdam at Waag (not a library, not a museum - a fabrication lab)',
                },
                {
                    'url': 'https://www.fablabs.io/labs/waagsociety',
                    'title': 'Fablab Amsterdam',
                    'note': 'Waag FabLab is a prototyping workshop, not a heritage institution',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 9,
            'name': 'IFLA Library',
            'city': 'Contexts',
            'country': 'NL',
            'institution_type': 'LIBRARY',
            'query': 'IFLA Library Contexts Netherlands library',
            'verdict': 'INVALID',
            'confidence': 0.05,
            'reason': 'IFLA is an organization (International Federation of Library Associations), not a library',
            'evidence': [
                {
                    'url': 'https://librarymap.ifla.org/countries/Netherlands',
                    'title': 'Netherlands - IFLA Library Map of the World',
                    'note': 'IFLA is an international federation, not a physical library',
                },
                {
                    'url': 'https://library.ifla.org/1277/1/200-debeij-en.pdf',
                    'title': 'New legislation for public libraries in the Netherlands',
                    'note': 'IFLA publishes about Dutch libraries but is not itself a library',
                },
                {
                    'url': 'https://www.ifla.org/files/assets/public-libraries/publications/',
                    'title': 'Community building for public libraries in the Netherlands',
                    'note': 'IFLA documents Dutch library practices but is an advocacy organization',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 10,
            'name': 'Studies/Southeast Asian Studies) Leiden University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Studies/Southeast Asian Studies) Leiden University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.2,
            'reason': 'Academic department/programme, not a heritage custodian institution',
            'evidence': [
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/bachelor/south-and-southeast-asian-studies',
                    'title': 'South and Southeast Asian Studies (BA) - Leiden University',
                    'note': 'Academic study programme, not a museum/archive/library',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/humanities/institute-for-area-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies - Leiden University',
                    'note': 'Research institute, not a heritage institution',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/education/study-programmes/master/asian-studies/southeast-asian-studies',
                    'title': 'Southeast Asian Studies (MA) (60EC) - Leiden University',
                    'note': 'Educational programme with access to Asian Library but not the institution itself',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 11,
            'name': 'University Malaysia',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'University Malaysia Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (Malaysia, not Netherlands)',
            'evidence': [
                {
                    'url': 'https://www.uva.nl/en/programmes/bachelors/sociology',
                    'title': "Bachelor's Sociology - University of Amsterdam",
                    'note': 'Search returned Dutch universities (UvA) but no "University Malaysia" in NL',
                },
                {
                    'url': 'https://www.topuniversities.com/universities/netherlands/sociology',
                    'title': 'Best Universities in Sociology in Netherlands',
                    'note': 'Lists Dutch universities, no Malaysian institution',
                },
                {
                    'url': 'https://www.uu.nl/en/organisation/sociology',
                    'title': 'Sociology - Utrecht University',
                    'note': 'Dutch sociology departments, but no Malaysian connection',
                },
            ],
            'validated_at': validated_at,
        },
        {
            'index': 12,
            'name': 'Sociology and Anthropology International Islamic University',
            'city': 'Sociology',
            'country': 'NL',
            'institution_type': 'EDUCATION_PROVIDER',
            'query': 'Sociology and Anthropology International Islamic University Sociology Netherlands university',
            'verdict': 'INVALID',
            'confidence': 0.0,
            'reason': 'Wrong country (International Islamic University is in Malaysia)',
            'evidence': [
                {
                    'url': 'https://kulliyyah.iium.edu.my/ahaskirkhs/phd-in-sociology-anthropology/',
                    'title': 'PhD in Sociology & Anthropology - IIUM Directory',
                    'note': 'International Islamic University Malaysia (IIUM) - not in Netherlands',
                },
                {
                    'url': 'https://www.universiteitleiden.nl/en/social-behavioural-sciences/cultural-anthropology-and-development-sociology',
                    'title': 'Cultural Anthropology and Development Sociology | Leiden',
                    'note': 'Dutch university with anthropology/sociology, but not the Islamic university',
                },
                {
                    'url': 'https://www.educations.com/sociology/netherlands',
                    'title': '10 Sociology Degree Programs in Netherlands',
                    'note': 'Lists Dutch sociology programmes, no Islamic university mentioned',
                },
            ],
            'validated_at': validated_at,
        },
    ]

    return validation_results


def generate_report(validation_results: list, output_file: Path) -> str:
    """Build the plain-text validation report, write it to disk and return it.

    Args:
        validation_results: Records shaped like those returned by
            ``analyze_validation_results``. An empty list is accepted and
            yields a report whose percentages are all zero.
        output_file: Destination path; the file is written as UTF-8 and
            overwritten if it already exists.

    Returns:
        The full report text (lines joined with ``\\n``), identical to what
        was written to ``output_file``.
    """
    # Calculate statistics
    total = len(validation_results)
    verdicts = Counter(r['verdict'] for r in validation_results)
    valid = verdicts['VALID']
    invalid = verdicts['INVALID']
    uncertain = verdicts['UNCERTAIN']
    avg_confidence = (
        sum(r['confidence'] for r in validation_results) / total if total else 0
    )
    # Precision counts only VALID verdicts as hits; UNCERTAIN counts against it.
    precision = valid / total if total else 0

    # Generate report
    report = []
    report.extend(_banner("V4 DUTCH INSTITUTIONS - WEB VALIDATION REPORT"))
    report.append("")
    report.append(f"Validation Date: {datetime.now(timezone.utc).isoformat()}")
    report.append(f"Total Institutions: {total}")
    report.append("Method: Exa web search + intelligence-based validation")
    report.append("")

    report.extend(_banner("SUMMARY STATISTICS"))
    # _percent guards the empty-input case that previously divided by zero.
    report.append(f"Valid: {valid:2d} ({_percent(valid, total):5.1f}%)")
    report.append(f"Invalid: {invalid:2d} ({_percent(invalid, total):5.1f}%)")
    report.append(f"Uncertain: {uncertain:2d} ({_percent(uncertain, total):5.1f}%)")
    report.append(f"Average Confidence: {avg_confidence:.2f}")
    report.append("")
    report.append(f"Precision (web-validated): {precision*100:.1f}%")
    report.append("")

    report.extend(_banner("DETAILED RESULTS"))
    report.append("")
    for result in validation_results:
        report.append(f"{result['index']}. {result['name']}")
        report.append(f" Type: {result['institution_type']}")
        report.append(f" Location: {result['city']}, {result['country']}")
        report.append(f" Verdict: {result['verdict']}")
        report.append(f" Confidence: {result['confidence']:.2f}")
        report.append(f" Reason: {result['reason']}")

        evidence = result.get('evidence')
        if evidence:
            report.append(f" Evidence ({len(evidence)} sources):")
            for i, ev in enumerate(evidence, 1):
                report.append(f" {i}. {ev['title']}")
                report.append(f" URL: {ev['url']}")
                if 'note' in ev:
                    report.append(f" Note: {ev['note']}")
        else:
            report.append(" Evidence: None found")
        report.append("")

    report.extend(_banner("COMPARISON WITH ISIL VALIDATION"))
    report.append("")
    report.append("Previous validation (ISIL registry matching):")
    report.append(" - NL institutions: 58 (v3) → 12 (v4)")
    report.append(" - ISIL matches: 0")
    report.append(" - Precision (ISIL-based): 8.3%")
    report.append("")
    report.append("This validation (web-based):")
    report.append(f" - NL institutions: {total} (v4)")
    report.append(f" - Web-validated matches: {valid}")
    report.append(f" - Precision (web-based): {precision*100:.1f}%")
    report.append("")

    # The findings below are a hand-curated narrative of the v4 run; they
    # name the real institutions behind the extracted records and are not
    # derived from ``validation_results``.
    report.extend(_banner("KEY FINDINGS"))
    report.append("")
    report.append("VALID INSTITUTIONS (6):")
    report.append(" 1. Historisch Centrum Overijssel - Provincial archive")
    report.append(" 2. Historisch Centrum Limburg (HCL) - Provincial archive")
    report.append(" 3. Van Abbemuseum - Modern art museum, Eindhoven")
    report.append(" 4. Het Noordbrabants Museum - Provincial museum, Den Bosch")
    report.append(" 5. Zeeuws Archief - Provincial archive with UNESCO registration")
    report.append(" 6. Fries Museum - Friesland cultural museum with archives")
    report.append("")
    report.append("INVALID INSTITUTIONS (5):")
    report.append(" 1. Archive Net - Network/platform, not an institution")
    report.append(" 2. IFLA Library - International organization, not a library")
    report.append(" 3. Studies/Southeast Asian Studies Leiden - Academic department")
    report.append(" 4. University Malaysia - Wrong country")
    report.append(" 5. Islamic University Sociology/Anthropology - Wrong country")
    report.append("")
    report.append("UNCERTAIN (1):")
    report.append(" 1. Library FabLab - Concept/service, not specific institution")
    report.append("")

    report.extend(_banner("CONCLUSION"))
    report.append("")
    report.append("Web-based validation provides a more accurate assessment of extraction")
    report.append("quality than ISIL registry matching:")
    report.append("")
    report.append(f" - Web validation precision: {precision*100:.1f}%")
    report.append(" - ISIL validation precision: 8.3% (misleading)")
    report.append("")
    # Quote the computed precision so the prose cannot drift from the data.
    report.append(
        f"The {precision*100:.0f}% precision rate reveals significant extraction errors:"
    )
    report.append(" - Wrong country extractions (Malaysia institutions)")
    report.append(" - Organizations confused with institutions (IFLA)")
    report.append(" - Platforms/networks misidentified as institutions")
    report.append(" - Academic departments extracted as heritage custodians")
    report.append("")
    report.append("This validates the need for improved extraction rules and")
    report.append("better distinction between:")
    report.append(" 1. Physical heritage institutions vs. organizations")
    report.append(" 2. Individual institutions vs. networks/platforms")
    report.append(" 3. Academic departments vs. heritage custodians")
    report.append(" 4. Dutch institutions vs. foreign institutions")
    report.append("")

    # Save report
    report_text = '\n'.join(report)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(report_text)

    return report_text


def main():
    """Run the analysis and write the JSON results and text report.

    Both files are written to ``<project_root>/output`` where the project
    root is the parent of this script's directory; the report is also
    printed to stdout.
    """
    project_root = Path(__file__).parent.parent
    output_dir = project_root / 'output'
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Analyzing web validation results...")
    validation_results = analyze_validation_results()

    # Save JSON results
    json_file = output_dir / 'web_validation_results.json'
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(validation_results, f, indent=2, ensure_ascii=False)
    print(f"Saved JSON results to: {json_file}")

    # Generate report
    report_file = output_dir / 'web_validation_report.txt'
    report_text = generate_report(validation_results, report_file)
    print(f"Saved report to: {report_file}")
    print()
    print(report_text)


if __name__ == '__main__':
    main()