#!/usr/bin/env python3
"""
Test subagent-based NER extraction for heritage institutions.

This script tests using a coding subagent to perform Named Entity Recognition
instead of the pattern-based extractor.

The script does not invoke a subagent itself: it prints the sample text, the
prompt that should be handed to the Task tool, and the result the subagent is
expected to produce, so the run can be checked by hand.
"""

import sys
import json
from pathlib import Path

# Sample text with known institutions (same as V5 test)
SAMPLE_TEXT = """
# Dutch Heritage Institutions Discussion

## Valid Institutions (should be extracted)

The Van Abbemuseum in Eindhoven is a renowned modern art museum with an ISIL code NL-EhdVAM.
Founded in 1936, it holds one of the finest collections of modern and contemporary art in Europe.

The Zeeuws Archief in Middelburg preserves historical records from the province of Zeeland.
ISIL: NL-MdlZA. It maintains extensive archival collections dating back to the 13th century.

Historisch Centrum Overijssel (ISIL: NL-ZwHCO) in Zwolle serves as the regional archive for the province of Overijssel.

## Organizations/Networks (should be filtered by V5)

The IFLA Library is an international organization that coordinates library networks worldwide.
Archive Net is a global network connecting archives across multiple continents.

## Generic Descriptors (should be filtered by V5)

The Library FabLab is a makerspace facility open to the public.
University Library is part of the academic infrastructure.

## Geographic Errors (should be filtered by V5 country validation)

The National Museum of Malaysia in Kuala Lumpur houses extensive Southeast Asian collections.
University Malaysia specializes in research and education.
"""

# Width of the "=" / "-" separator lines printed by main().
RULE_WIDTH = 70

# Instruction part of the subagent prompt; the text to analyze is appended
# by _build_task_prompt(). Kept as a plain (non-f) string because it contains
# literal JSON braces.
#
# The "outside the Netherlands" exclusion is required for the expected result
# printed by main(): without it nothing in the prompt tells the subagent to
# drop the Malaysian institutions present in SAMPLE_TEXT.
TASK_PROMPT_INSTRUCTIONS = """
Extract ALL heritage institutions (museums, archives, libraries, galleries) from the following text.

For EACH institution found, return JSON with:
{
  "name": "Full institution name (no truncation)",
  "institution_type": "MUSEUM | ARCHIVE | LIBRARY | GALLERY",
  "city": "City name (if mentioned)",
  "country": "2-letter ISO country code (if can be determined)",
  "isil_code": "ISIL code (if mentioned, format: XX-XXXXX)",
  "confidence": 0.0-1.0 (based on explicitness of mention)
}

Rules:
1. Preserve full names (e.g., "Van Abbemuseum", not "Abbemuseum")
2. Classify by primary function (museum, archive, library, gallery)
3. Extract city from context (e.g., "in Eindhoven" → city: "Eindhoven")
4. Determine country from city or explicit mentions
5. Find ISIL codes in patterns like "ISIL: XX-XXXXX" or "(ISIL: XX-XXXXX)"
6. Exclude:
   - International organizations (IFLA, UNESCO, etc.)
   - Networks/platforms ("Archive Net", "Museum Association")
   - Generic descriptors ("University Library" without specific institution name)
   - Academic departments (unless they have dedicated collections)
   - Institutions located outside the Netherlands (this extraction has a Dutch focus)

Return ONLY valid JSON array, no additional text.

TEXT TO ANALYZE:
"""


def _rule(char):
    """Print a separator line made of *char* repeated RULE_WIDTH times."""
    print(char * RULE_WIDTH)


def _build_task_prompt(text):
    """Return the full subagent prompt: the instructions followed by *text*."""
    return TASK_PROMPT_INSTRUCTIONS + text


def main():
    """Print the subagent NER test walkthrough and return exit status 0.

    Output, in order: a banner, the first 500 characters of SAMPLE_TEXT, a
    short description of the test, the full prompt to pass to the Task tool,
    and the expected extraction result (kept vs. filtered institutions).

    Returns:
        0 always; the value is passed to sys.exit() by the entry guard.
    """
    _rule("=")
    print("Subagent-Based NER Extraction Test")
    _rule("=")
    print()

    print("Sample text:")
    _rule("-")
    # Only a preview is shown here; the complete text is printed further down
    # as part of the task prompt.
    print(SAMPLE_TEXT[:500] + "...")
    _rule("-")
    print()

    print("This test demonstrates using a Task subagent for NER extraction")
    print("instead of pattern-based regex extraction.")
    print()
    print("Expected output:")
    print(" - Clean institution names (no mangling)")
    print(" - Accurate type classification (MUSEUM, ARCHIVE, LIBRARY)")
    print(" - Location extraction (city, country)")
    print(" - ISIL code identification")
    print()

    # Instructions for using Task tool
    _rule("=")
    print("NEXT STEP: Use Task tool with subagent_type='general'")
    _rule("=")
    print()
    print("Task prompt should be:")
    _rule("-")
    print(_build_task_prompt(SAMPLE_TEXT))
    _rule("-")
    print()

    print("Expected result: JSON array with 3 institutions")
    print(" 1. Van Abbemuseum (Eindhoven, NL, MUSEUM)")
    print(" 2. Zeeuws Archief (Middelburg, NL, ARCHIVE)")
    print(" 3. Historisch Centrum Overijssel (Zwolle, NL, ARCHIVE)")
    print()
    print("Should filter out:")
    print(" - IFLA Library (international organization)")
    print(" - Archive Net (network)")
    print(" - Library FabLab (generic descriptor)")
    print(" - University Library (no specific name)")
    print(" - National Museum of Malaysia (wrong country for Dutch focus)")
    print(" - University Malaysia (wrong country)")
    print()

    return 0


if __name__ == "__main__":
    sys.exit(main())