- Introduced `llm_extract_archiveslab.py` script for entity and relationship extraction using LLMAnnotator with GLAM-NER v1.7.0. - Replaced regex-based extraction with generative LLM inference. - Added functions for loading markdown content, converting annotation sessions to dictionaries, and generating extraction statistics. - Implemented comprehensive logging of extraction results, including counts of entities, relationships, and specific types like heritage institutions and persons. - Results and statistics are saved in JSON format for further analysis. |
||
|---|---|---|
| .github/workflows | ||
| .opencode | ||
| archive | ||
| data | ||
| docs | ||
| exa-mcp-server-source@4aeb0543f9 | ||
| examples | ||
| frontend | ||
| infrastructure | ||
| mcp-wikidata@230e0456d2 | ||
| mcp_servers | ||
| ontology | ||
| package | ||
| reports | ||
| schemas | ||
| scripts | ||
| src/glam_extractor | ||
| tests | ||
| .gitignore | ||
| .ignore | ||
| ADVANCED_LAYOUT_OPTIONS_COMPLETE.md | ||
| AGENTS.md | ||
| analyze_brazil_batch13_candidates.py | ||
| APPELLATION_IDENTIFIER_REFACTORING_20251122.md | ||
| APPELLATION_REFACTORING_PHASE2_20251122.md | ||
| archive_log.txt | ||
| AUSTRIAN_ISIL_DEDUPLICATION_SUMMARY.md | ||
| AUSTRIAN_ISIL_QUICK_START.md | ||
| AUSTRIAN_ISIL_SESSION_COMPLETE.md | ||
| AUSTRIAN_ISIL_SESSION_COMPLETE_BATCH1.md | ||
| AUSTRIAN_ISIL_SESSION_CONTINUED_20251118.md | ||
| AUSTRIAN_ISIL_SESSION_HANDOFF_20251118.md | ||
| AUSTRIAN_ISIL_SESSION_SUMMARY.md | ||
| AUXILIARY_CLASSES_COMPLETE.md | ||
| BATCH12_ENRICHMENT_REPORT.md | ||
| BATCH13_ENRICHMENT_REPORT.md | ||
| BATCH14_ENRICHMENT_REPORT.md | ||
| BEFORE_AFTER_MERMAID_COMPARISON.md | ||
| BELGIAN_ISIL_COMPLETE.md | ||
| BRAZILIAN_CURATION_SESSION_SUMMARY.md | ||
| BULGARIAN_ISIL_EXTRACTION_COMPLETE.md | ||
| CANADIAN_ENRICHMENT_GUIDE.md | ||
| CANADIAN_GEOCODING_COMPLETE.md | ||
| CANADIAN_INTEGRATION_REPORT.md | ||
| CANADIAN_ISIL_SUCCESS.md | ||
| CHANGES_SUMMARY_20251122.txt | ||
| check_geocoding_progress.py | ||
| check_scraper_status.sh | ||
| CHILEAN_BATCH1_REPORT.md | ||
| CLEANUP_MERMAID_FILES.md | ||
| COLLECTION_DEPARTMENT_INTEGRATION_COMPLETE_20251122.md | ||
| compare_dutch_datasets.py | ||
| COMPLETE_SCHEMA_DIAGRAM_SESSION_SUMMARY.md | ||
| COMPLETE_SCHEMA_MERMAID_GENERATION.md | ||
| COMPLETE_SESSION_OVERVIEW_20251122.md | ||
| CONTRIBUTING.md | ||
| convert_canadian_to_linkml.py | ||
| COUNTRY_CLASS_IMPLEMENTATION_COMPLETE.md | ||
| COUNTRY_RESTRICTION_IMPLEMENTATION.md | ||
| COUNTRY_RESTRICTION_QUICKSTART.md | ||
| CRITICAL_ARCHITECTURAL_FIX_PROV.md | ||
| CRITICAL_FIX_TYPED_RANGES.md | ||
| crosslink_dutch_datasets.py | ||
| curate_brazilian_institutions.py | ||
| curate_chilean_institutions.md | ||
| CURATION_STATUS.md | ||
| CUSTODIAN_COLLECTION_ADDITION_20251122.md | ||
| CUSTODIAN_MULTI_ASPECT_REFACTORING.md | ||
| CUSTODIAN_TYPE_ONTOLOGY_ALIGNMENT.md | ||
| CUSTODIAN_TYPE_ONTOLOGY_ALIGNMENT_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_PROGRESS_20251123.md | ||
| CUSTODIAN_TYPE_PHASE2_SESSION3_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_SESSION4_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_SESSION5_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_SESSION5_EXTENDED_COMPLETE.md | ||
| CUSTODIAN_TYPE_PHASE2_SESSION_COMPLETE.md | ||
| CZECH_ARCHIVES_INVESTIGATION.md | ||
| CZECH_ARCHIVES_NEXT_ACTIONS.md | ||
| CZECH_ARON_API_INVESTIGATION.md | ||
| CZECH_CROSSLINK_REPORT.md | ||
| CZECH_ISIL_COMPLETE_REPORT.md | ||
| CZECH_ISIL_HARVEST_SUMMARY.md | ||
| CZECH_ISIL_NEXT_STEPS.md | ||
| CZECH_ISIL_WIKIDATA_EXTRACTION.md | ||
| CZECH_PRIORITY1_COMPLETE.md | ||
| CZECH_WIKIDATA_ENRICHMENT_COMPLETE.md | ||
| D3JS_UML_VISUALIZATION_COMPLETE.md | ||
| DAGRE_GRID_LAYOUT_IMPLEMENTATION.md | ||
| DAGRE_RANKER_EXPLAINED.md | ||
| deduplicate_brazilian_institutions.py | ||
| DENMARK_QUICK_REFERENCE.md | ||
| DIGITAL_PLATFORM_CLASS_COMPLETE.md | ||
| DIGITAL_PLATFORM_CLASS_COMPLETE_v1.md | ||
| EDGE_DIRECTIONALITY_IMPLEMENTATION.md | ||
| EDGE_DIRECTIONALITY_QUICK_GUIDE.md | ||
| EDGE_DIRECTIONALITY_SESSION_COMPLETE.md | ||
| EDGE_TESTING_MCP_ANALYSIS.md | ||
| EDGE_TESTING_MCP_ANALYSIS_SUMMARY.md | ||
| ENCOMPASSING_BODY_FIXES_COMPLETE.md | ||
| ENCOMPASSING_BODY_IMPLEMENTATION_COMPLETE.md | ||
| ENCOMPASSING_BODY_INTEGRATION_STATUS.md | ||
| ENCOMPASSING_BODY_RDF_UML_GENERATION.md | ||
| enrich_brazil_batch11.py | ||
| enrich_brazil_batch12.py | ||
| enrich_brazil_batch13.py | ||
| enrich_brazil_batch17.py | ||
| enrich_bulgaria_isil.py | ||
| enrich_geocoding.py | ||
| enrich_japan_fast.py | ||
| enrich_japan_isil.py | ||
| enrichment_force_log.txt | ||
| enrichment_log.txt | ||
| enrichment_log_fixed.txt | ||
| EXA_BUG_FIX.md | ||
| EXECUTIVE_SUMMARY.md | ||
| EXECUTIVE_SUMMARY_UML_EDGE_DIRECTIONALITY.md | ||
| export_bulgaria_rdf.py | ||
| EXPORT_FUNCTIONALITY_IMPLEMENTATION.md | ||
| extract_brazilian_institutions.py | ||
| extract_brazilian_institutions_v2.py | ||
| extract_conversations_batch.py | ||
| extract_mexican_glams.py | ||
| extract_mexican_glams_v2.py | ||
| extraction_log.txt | ||
| extraction_log_session3.txt | ||
| FEATUREPLACE_IMPLEMENTATION_COMPLETE.md | ||
| FEATUREPLACE_ONTOLOGY_MAPPING_COMPLETE.md | ||
| FEATUREPLACE_ONTOLOGY_MAPPING_STRATEGY.md | ||
| FINAL_CLARIFICATION_MERMAID_OUTPUTS.md | ||
| FINAL_SESSION_SUMMARY.md | ||
| find_brazil_bonus.py | ||
| find_brazil_institutions.py | ||
| fix_heritage_linked_pubs.py | ||
| FOUR_ASPECT_ARCHITECTURE_QUICK_REF.md | ||
| generate_comparison_report.py | ||
| generate_geocoding_report.py | ||
| GEOCODING_SESSION_2025-11-07.md | ||
| GEOCODING_SESSION_2025-11-07_RESUMED.md | ||
| GEOGRAPHIC_RESTRICTION_COMPLETE.md | ||
| GEOGRAPHIC_RESTRICTION_QUICK_STATUS.md | ||
| GEOGRAPHIC_RESTRICTION_SESSION_COMPLETE.md | ||
| GERMAN_HARVEST_STATUS.md | ||
| GERMAN_REGIONAL_ARCHIVE_PORTALS_DISCOVERY.md | ||
| GERMAN_STATE_EXTRACTION_PATTERN.md | ||
| HUB_ARCHITECTURE_DIAGRAM.md | ||
| HYPERNYMS_REMOVAL_COMPLETE.md | ||
| IMPLEMENTATION_COMPLETE.md | ||
| ISIL_HARVEST_STATUS_20251119.md | ||
| JAPAN_WIKIDATA_ENRICHMENT_STRATEGY.md | ||
| LAYOUT_OPTIONS_QUICK_REFERENCE.md | ||
| LEGAL_RESPONSIBILITY_COLLECTION_COMPLETE.md | ||
| LEGAL_RESPONSIBILITY_COLLECTION_QUICKSTART.md | ||
| LIBYA_ENRICHMENT_COMPLETE.md | ||
| LIBYA_WIKIDATA_CLEANUP_SUMMARY.md | ||
| LIBYA_WIKIDATA_CREATION_STATUS.md | ||
| LIBYA_WIKIDATA_ENRICHMENT_COMPLETE.md | ||
| LICENSE | ||
| LINKML_CONSTRAINTS_COMPLETE_20251122.md | ||
| LINKML_VISUALIZATION_SESSION_COMPLETE_20251122.md | ||
| MAIN_SCHEMA_RDF_GENERATION_COMPLETE.md | ||
| MANUAL_TESTING_RESULTS.md | ||
| merge_batch13_corrected.py | ||
| merge_batch14.py | ||
| merge_batch15.py | ||
| merge_brazil_batch13.py | ||
| MERMAID_GENERATORS_EXPLAINED.md | ||
| mexican_glam_1.json | ||
| mexican_glam_2.json | ||
| mexican_glam_extracted.json | ||
| MIGRATION_CHECKLIST_ISO20275.md | ||
| MIGRATION_COMPLETED_v0.2.2.md | ||
| MNEMONIC_CORRECTION.md | ||
| NEXT_AGENT_HANDOFF_NRW_COMPLETE.md | ||
| NEXT_AGENT_HANDOFF_SAXONY_COMPLETE.md | ||
| NEXT_SESSION_HANDOFF.md | ||
| NEXT_STEPS.md | ||
| NEXT_STEPS_Mexican_Geocoding.md | ||
| NRW_HARVEST_COMPLETE_20251119.md | ||
| ONTOLOGY_CONSULTATION_REPORT_CUSTODIAN_TYPE.md | ||
| ONTOLOGY_ENRICHMENT_PLAN.md | ||
| ONTOLOGY_RULES_SUMMARY.md | ||
| ORGANIZATIONAL_CHANGE_EVENT_COMPLETE_20251122.md | ||
| ORGANIZATIONAL_STRUCTURE_ADDITION_20251122.md | ||
| ORGANIZATIONAL_STRUCTURE_COMPLETE_20251122.md | ||
| ORGANIZATIONAL_STRUCTURE_EXAMPLES.md | ||
| osm_resume_log.txt | ||
| parse_eu_isil.py | ||
| parse_japan_isil.py | ||
| PHASE1_QUICK_WINS_COMPLETE.md | ||
| PICO_STAFF_ROLES_COMPLETE_20251122.md | ||
| process_chilean_institutions.py | ||
| process_mexican_institutions.py | ||
| PROGRESS.md | ||
| pyproject.toml | ||
| QUERY_BUILDER_LAYOUT_FIX.md | ||
| QUICK_ACTION_PLAN_GERMAN_REGIONAL_HARVESTS.md | ||
| QUICK_ACTION_PLAN_UML_TESTING.md | ||
| QUICK_REFERENCE_SESSION_COMPLETE.md | ||
| QUICK_REFERENCE_VALIDATION.md | ||
| QUICK_START_AUSTRALIA.md | ||
| QUICK_START_DAGRE_TESTING.md | ||
| QUICK_STATUS_20251119.md | ||
| QUICK_STATUS_20251119_POST_NRW.md | ||
| QUICK_STATUS_APPELLATION_IDENTIFIER_COMPLETE.md | ||
| QUICK_STATUS_BAVARIA_DECISION.md | ||
| QUICK_STATUS_COMPLETE_MERMAID_GENERATION.md | ||
| QUICK_STATUS_COUNTRY_CLASS_20251122.md | ||
| QUICK_STATUS_CUSTODIAN_SCHEMA_20251121.md | ||
| QUICK_STATUS_CUSTODIAN_SCHEMA_MOD | ||
| QUICK_STATUS_CUSTODIAN_SCHEMA_MOD_20251122.md | ||
| QUICK_STATUS_CUSTODIAN_TYPE_20251123.md | ||
| QUICK_STATUS_EDGE_TESTING.md | ||
| QUICK_STATUS_EXPORT_COMPLETE.md | ||
| QUICK_STATUS_FEATUREPLACE_COMPLETE.md | ||
| QUICK_STATUS_HYPERNYMS_REMOVAL_20251122.md | ||
| QUICK_STATUS_LEGAL_ENTITY_20251122.md | ||
| QUICK_STATUS_MAIN_SCHEMA_RDF_20251124.md | ||
| QUICK_STATUS_ORGANIZATIONAL_COMPLETE_20251122.md | ||
| QUICK_STATUS_ORGANIZATIONAL_STRUCTURE_20251122.md | ||
| QUICK_STATUS_SCHEMA_MODULARIZATION_DONE_20251121.md | ||
| QUICK_STATUS_SLOT_USAGE_COMPLETE_20251121.md | ||
| QUICK_STATUS_TOOIONT_20251121.md | ||
| QUICK_STATUS_UML_GENERATION_20251123.md | ||
| RDF_GENERATION_SUMMARY.md | ||
| RDF_UML_GENERATION_COMPLETE_20251122_155319.md | ||
| RDF_UML_GENERATION_COMPLETE_20251122_old.md | ||
| README.md | ||
| RECORD_COMPARISON.md | ||
| RESUME_CHILEAN_ENRICHMENT.md | ||
| run_scraper_background.sh | ||
| RUNNING_THE_APPLICATION.md | ||
| SACHSEN_ANHALT_96_PERCENT_COMPLETE.md | ||
| sachsen_anhalt_100percent_log.txt | ||
| SACHSEN_ANHALT_COMPLETE.md | ||
| sachsen_anhalt_enrichment_v2_log.txt | ||
| SAXONY_HARVEST_STRATEGY.md | ||
| SCHEMA_AUTHORITY_CHECKLIST.md | ||
| SCRAPER_COMPLETION_INSTRUCTIONS.md | ||
| SESSION-RESUME.md | ||
| session-ses_52a6.md | ||
| session-ses_52ff.md | ||
| SESSION_2025-11-09_SCHEMA_ONTOLOGY_UPDATE.md | ||
| SESSION_COMPLETE.md | ||
| SESSION_COMPLETE_20251122_APPELLATION_PHASE2.md | ||
| SESSION_COMPLETE_20251122_COLLECTION.md | ||
| SESSION_COMPLETE_ARGENTINA_ENRICHMENT.txt | ||
| SESSION_COMPLETE_COMPLETE_MERMAID_EXTENSION.md | ||
| SESSION_COMPLETE_ENCOMPASSING_BODY.md | ||
| SESSION_COMPLETE_ENCOMPASSING_BODY_MAIN_SCHEMA.md | ||
| SESSION_COMPLETION_SUMMARY.md | ||
| SESSION_CONTINUATION_SUMMARY_20251119.md | ||
| SESSION_SUMMARY.md | ||
| SESSION_SUMMARY_2025-11-05.md | ||
| SESSION_SUMMARY_2025-11-05_batch_processing.md | ||
| SESSION_SUMMARY_2025-11-06_Chilean_Geocoding.md | ||
| SESSION_SUMMARY_2025-11-07.md | ||
| SESSION_SUMMARY_2025-11-08.md | ||
| SESSION_SUMMARY_2025-11-08_LATAM.md | ||
| SESSION_SUMMARY_2025-11-09.md | ||
| SESSION_SUMMARY_20251111_BRAZIL_MERGE.md | ||
| SESSION_SUMMARY_20251112_BRAZIL_DOCUMENTATION.md | ||
| SESSION_SUMMARY_20251113_MEXICO_BATCH2.md | ||
| SESSION_SUMMARY_20251113_MEXICO_RECONCILIATION.md | ||
| SESSION_SUMMARY_20251118_ARGENTINA_LINKML_EXPORT.md | ||
| SESSION_SUMMARY_20251118_AUSTRALIA_TROVE.md | ||
| SESSION_SUMMARY_20251118_ISIL_PROCESSING.md | ||
| SESSION_SUMMARY_20251119_ARCHIVES_DISCOVERY.md | ||
| SESSION_SUMMARY_20251119_AUSTRIAN_CONSOLIDATION.md | ||
| SESSION_SUMMARY_20251119_AUTOMATED_SPOT_CHECKS.md | ||
| SESSION_SUMMARY_20251119_CANADIAN_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_CZECH_ARCHIVES_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_CZECH_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_CZECH_WIKIDATA_ENRICHMENT_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_DDB_HARVEST_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_DENMARK_ARCHIVES_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_DENMARK_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_DENMARK_ISIL_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_NRW_MERGE_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_PREFILL_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_PRIORITY1_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_RDF_WIKIDATA_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_UNIFICATION_COMPLETE.md | ||
| SESSION_SUMMARY_20251119_WIKIDATA_VALIDATION_PACKAGE.md | ||
| SESSION_SUMMARY_20251120_BAVARIA_COMPLETE.md | ||
| SESSION_SUMMARY_20251120_BAVARIA_ENRICHMENT.md | ||
| SESSION_SUMMARY_20251120_BAVARIA_ENRICHMENT_COMPLETE.md | ||
| SESSION_SUMMARY_20251120_FINLAND_UNIFIED.md | ||
| SESSION_SUMMARY_20251120_JAPAN_SYNTHETIC_QNUMBER_CLEANUP.md | ||
| SESSION_SUMMARY_20251120_JAPAN_WIKIDATA_ENRICHMENT_COMPLETION.md | ||
| SESSION_SUMMARY_20251120_PHASE2_CRITICAL_FIXES.md | ||
| SESSION_SUMMARY_20251120_SACHSEN_ANHALT_STARTED.md | ||
| SESSION_SUMMARY_20251120_SACHSEN_ARCHIVES.md | ||
| SESSION_SUMMARY_20251120_SAXONY_FOUNDATION.md | ||
| SESSION_SUMMARY_20251120_SAXONY_MUSEUMS_COMPLETE.md | ||
| SESSION_SUMMARY_20251120_THUERINGEN_100_PERCENT.md | ||
| SESSION_SUMMARY_20251121_CUSTODIAN_RENAMING.md | ||
| SESSION_SUMMARY_20251121_DBPEDIA_INTEGRATION_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_ENUM_SLOT_USAGE_MAPPINGS.md | ||
| SESSION_SUMMARY_20251121_ISO20275_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_LINKML_HUB_ARCHITECTURE_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_NAME_ENTITY_FOUNDATION_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_NARROW_MAPPINGS_EXTENSION.md | ||
| SESSION_SUMMARY_20251121_OBSERVATION_RECONSTRUCTION_CONTINUATION.md | ||
| SESSION_SUMMARY_20251121_OBSERVATION_RECONSTRUCTION_PATTERN.md | ||
| SESSION_SUMMARY_20251121_PLANTUML_BUG_FIX.md | ||
| SESSION_SUMMARY_20251121_SCHEMA_AUTHORITY_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_SCHEMA_CONSOLIDATION.md | ||
| SESSION_SUMMARY_20251121_SCHEMA_METADATA_REFINEMENT.md | ||
| SESSION_SUMMARY_20251121_SCHEMA_MODULARIZATION_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_SLOT_URI_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_SLOT_USAGE_COMPLETE.md | ||
| SESSION_SUMMARY_20251121_TIMESPAN_INTEGRATION.md | ||
| SESSION_SUMMARY_20251121_TOOIONT_INTEGRATION.md | ||
| SESSION_SUMMARY_20251122_APPELLATION_IDENTIFIER_REFACTORING.md | ||
| SESSION_SUMMARY_20251122_CUSTODIAN_MULTI_ASPECT.md | ||
| SESSION_SUMMARY_20251122_LEGAL_ENTITY_IMPLEMENTATION.md | ||
| SESSION_SUMMARY_20251122_LEGAL_ENTITY_REFACTORING.md | ||
| SESSION_SUMMARY_20251122_LEGAL_ENTITY_REFACTORING_COMPLETE.md | ||
| SESSION_SUMMARY_20251122_SOURCEDOCUMENT_ONTOLOGY_ENRICHMENT.md | ||
| SESSION_SUMMARY_20251123.md | ||
| SESSION_SUMMARY_20251125_UML_EDGE_DIRECTIONALITY.md | ||
| SESSION_SUMMARY_ARGENTINA_CONABIP.md | ||
| SESSION_SUMMARY_ARGENTINA_Z3950_INVESTIGATION.md | ||
| SESSION_SUMMARY_BATCH7.md | ||
| SESSION_SUMMARY_COLLECTION_DEPT_PHASE4_20251122.md | ||
| SESSION_SUMMARY_DAGRE_IMPLEMENTATION.md | ||
| SESSION_SUMMARY_LINKML_PHASE8_20251122.md | ||
| SESSION_SUMMARY_NETHERLANDS_ARGENTINA.md | ||
| SESSION_SUMMARY_NOV7_DUTCH_VALIDATION.md | ||
| SESSION_SUMMARY_ORGANIZATIONAL_MODELING_20251122.md | ||
| SESSION_SUMMARY_PICO_PHASE3_20251122.md | ||
| SESSION_SUMMARY_RDF_PARTNERSHIPS.md | ||
| SESSION_SUMMARY_SHACL_PHASE7_20251122.md | ||
| SESSION_SUMMARY_SPARQL_PHASE6_20251122.md | ||
| SESSION_SUMMARY_SWITZERLAND_ISIL.md | ||
| SESSION_SUMMARY_v3_geocoding.md | ||
| SESSION_SUMMARY_V5.md | ||
| SESSION_SUMMARY_VALIDATION_PHASE5_20251122.md | ||
| SHACL_SHAPES_COMPLETE_20251122.md | ||
| SPARQL_QUERY_LIBRARY_COMPLETE_20251122.md | ||
| TASTE_SMELL_CLASS_ADDITION.md | ||
| TAXONOMY_UPDATE_SUMMARY.md | ||
| test-edge-directionality.sh | ||
| test_canadian_parser.py | ||
| TEST_EDGE_DIRECTIONALITY.md | ||
| test_real_dutch_orgs.py | ||
| test_real_isil.py | ||
| TESTING_SUMMARY.md | ||
| THUERINGEN_100_PERCENT_EXTRACTION_ACHIEVED.md | ||
| THUERINGEN_COMPREHENSIVE_HARVEST_SESSION_20251120.md | ||
| THUERINGEN_HARVEST_COMPLETE.md | ||
| THUERINGEN_V4_ENRICHMENT_COMPLETE.md | ||
| THUERINGEN_V4_MERGE_COMPLETE.md | ||
| UML_GENERATION_COMPLETE.md | ||
| UML_GENERATION_COMPLETE_20251123.md | ||
| UML_GENERATION_LINKML_AUTO.md | ||
| UML_VIEWER_FIX.md | ||
| UML_VIEWER_VS_MERMAID_ANALYSIS.md | ||
| UNIFICATION_SUMMARY.md | ||
| V5_QUICK_REFERENCE.md | ||
| validate_curated.py | ||
| validate_instances.py | ||
| VALIDATION_FRAMEWORK_COMPLETE_20251122.md | ||
| validation_output.txt | ||
| VERIFICATION_CHECKLIST_20251122.md | ||
| verify_batch13_ids.py | ||
| wget-log | ||
| wget-log.1 | ||
| wget-log.2 | ||
| wget-log.3 | ||
| wget-log.4 | ||
| wget-log.5 | ||
| wget-log.6 | ||
| wget-log.7 | ||
| wget-log.8 | ||
| wget-log.9 | ||
| wget-log.10 | ||
| wget-log.11 | ||
| wget-log.12 | ||
| wget-log.13 | ||
| wget-log.14 | ||
| wget-log.15 | ||
| wget-log.16 | ||
| wget-log.17 | ||
| wget-log.18 | ||
| wget-log.19 | ||
| wget-log.20 | ||
| wget-log.21 | ||
| wget-log.22 | ||
| wget-log.23 | ||
| wget-log.24 | ||
| wget-log.25 | ||
| wget-log.26 | ||
| wget-log.27 | ||
| wget-log.28 | ||
| wget-log.29 | ||
| wget-log.30 | ||
| wget-log.31 | ||
| wget-log.32 | ||
| wget-log.33 | ||
| wget-log.34 | ||
| wget-log.35 | ||
| wget-log.36 | ||
| wget-log.37 | ||
| wget-log.38 | ||
| wget-log.39 | ||
| wget-log.40 | ||
| wget-log.41 | ||
| wget-log.42 | ||
| wget-log.43 | ||
| wget-log.44 | ||
| wget-log.45 | ||
| wget-log.46 | ||
| wget-log.47 | ||
| wget-log.48 | ||
| wget-log.49 | ||
| wget-log.50 | ||
| wget-log.51 | ||
| wget-log.52 | ||
| wget-log.53 | ||
| wget-log.54 | ||
| wget-log.55 | ||
| wget-log.56 | ||
| wget-log.57 | ||
| wget-log.58 | ||
| wget-log.59 | ||
| wget-log.60 | ||
| wget-log.61 | ||
| wget-log.62 | ||
| wget-log.63 | ||
| wget-log.64 | ||
| wget-log.65 | ||
| wget-log.66 | ||
| wget-log.67 | ||
| wget-log.68 | ||
| wget-log.69 | ||
| wget-log.70 | ||
| wget-log.71 | ||
| wget-log.72 | ||
| wget-log.73 | ||
| wget-log.74 | ||
| wget-log.75 | ||
| wget-log.76 | ||
| wget-log.77 | ||
| wget-log.78 | ||
| wget-log.79 | ||
| wget-log.80 | ||
| wget-log.81 | ||
| wget-log.82 | ||
| wget-log.83 | ||
| wget-log.84 | ||
| wget-log.85 | ||
| wget-log.86 | ||
| wget-log.87 | ||
| wget-log.88 | ||
| wget-log.89 | ||
| wget-log.90 | ||
| wget-log.91 | ||
| wget-log.92 | ||
| wget-log.93 | ||
| wget-log.94 | ||
| wget-log.95 | ||
| wget-log.96 | ||
| wget-log.97 | ||
| wget-log.98 | ||
| wget-log.99 | ||
| wget-log.100 | ||
| wget-log.101 | ||
| wget-log.102 | ||
| wget-log.103 | ||
| wget-log.104 | ||
| wget-log.105 | ||
| wget-log.106 | ||
| wget-log.107 | ||
| wget-log.108 | ||
| wget-log.109 | ||
| wget-log.110 | ||
| wget-log.111 | ||
| wget-log.112 | ||
| wget-log.113 | ||
| wget-log.114 | ||
| wget-log.115 | ||
| wget-log.116 | ||
| wget-log.117 | ||
| wget-log.118 | ||
| wget-log.119 | ||
| wget-log.120 | ||
| wget-log.121 | ||
| wget-log.122 | ||
| wget-log.123 | ||
| wget-log.124 | ||
| wget-log.125 | ||
| wget-log.126 | ||
| wget-log.127 | ||
| wget-log.128 | ||
| wget-log.129 | ||
| wget-log.130 | ||
| wget-log.131 | ||
| wget-log.132 | ||
| wget-log.133 | ||
| wget-log.134 | ||
| wget-log.135 | ||
| wget-log.136 | ||
| wget-log.137 | ||
| wget-log.138 | ||
| WIKIDATA_CREATION_PLAN.md | ||
| WIKIDATA_MANUAL_CREATION_GUIDE.md | ||
| youtube_enrichment_log.txt | ||
| youtube_enrichment_log_v5.txt | ||
| youtube_enrichment_log_v6.txt | ||
| youtube_enrichment_log_v7.txt | ||
| youtube_enrichment_log_v8.txt | ||
| ZOOM_CAMERA_PERSISTENCE.md | ||
GLAM Extractor
Extract and standardize global GLAM (Galleries, Libraries, Archives, Museums) institutional data from conversation transcripts and authoritative registries.
🚀 How to Run the Application - Complete guide for starting frontend, backend, and servers.
Overview
This project extracts structured heritage institution data from 139+ Claude conversation JSON files covering worldwide GLAM research, integrates with authoritative CSV datasets (Dutch ISIL registry, Dutch heritage organizations), validates against a comprehensive LinkML schema, and exports to multiple formats (RDF/Turtle, JSON-LD, CSV, Parquet, SQLite).
Features
- Multi-source data integration: Conversation transcripts, CSV registries, web crawling, Wikidata
- NLP extraction: spaCy NER, transformers-based classification, pattern matching
- LinkML validation: Comprehensive schema with TOOI, Schema.org, CPOV, ISIL, RiC-O, BIBFRAME
- Provenance tracking: Every data point tracks source, confidence, and verification status
- Multi-format export: RDF/Turtle, JSON-LD, CSV, Parquet, SQLite
- Geocoding: Nominatim integration for location enrichment
- Multilingual support: Handles 60+ countries and languages
Interactive Frontend (React + TypeScript + D3.js)
- UML Viewer 🎨 - Interactive D3.js visualization of heritage custodian ontology diagrams (docs)
- Mermaid class diagrams, ER diagrams, PlantUML, GraphViz
- Zoom, pan, drag nodes, click for details
- 14 schema diagrams from schemas/20251121/uml/
- Query Builder 🔍 - Visual SPARQL query constructor
- Add variables, triple patterns, filters
- Live SPARQL generation
- Execute against endpoints
- Graph Visualizer 🕸️ - RDF graph exploration with D3.js
- Upload RDF/Turtle files
- Interactive force-directed layout
- SPARQL queries
- Node metadata inspection
- Database 🗄️ - TypeDB integration (optional)
- NDE House Style 🎨 - Netwerk Digitaal Erfgoed branding throughout
Start the frontend: cd frontend && npm run dev
Quick Start
Installation
# Install Poetry (if not already installed)
curl -sSL https://install.python-poetry.org | python3 -
# Clone repository and install dependencies
cd glam-extractor
poetry install
# Download spaCy models
poetry run python -m spacy download en_core_web_trf
poetry run python -m spacy download nl_core_news_lg
poetry run python -m spacy download xx_ent_wiki_sm
Basic Usage
# Extract from conversation JSON
poetry run glam extract conversations/Brazilian_GLAM.json -o output.jsonld
# Extract from Dutch CSV
poetry run glam extract data/ISIL-codes_2025-08-01.csv --csv -o dutch_isil.jsonld
# Validate extracted data
poetry run glam validate output.jsonld -s schemas/heritage_custodian.yaml
# Export to RDF
poetry run glam export output.jsonld -o output.ttl -f rdf
# Crawl institutional website
poetry run glam crawl https://www.rijksmuseum.nl -o rijksmuseum.jsonld
Linked Open Data
The project publishes heritage institution data as W3C-compliant RDF aligned with international ontologies.
Schema RDF Formats (8 Serializations)
The LinkML schema is available in 8 RDF formats (generated from schemas/20251121/linkml/01_custodian_name_modular.yaml):
| Format | File | Size | Use Case |
|---|---|---|---|
| Turtle | 01_custodian_name.owl.ttl | 77KB | Human-readable, Git-friendly |
| N-Triples | 01_custodian_name.nt | 233KB | Line-oriented processing |
| JSON-LD | 01_custodian_name.jsonld | 191KB | Web APIs, JavaScript |
| RDF/XML | 01_custodian_name.rdf | 165KB | Legacy systems, Java |
| Notation3 | 01_custodian_name.n3 | 77KB | Logic rules, reasoning |
| TriG | 01_custodian_name.trig | 103KB | Named graphs, datasets |
| TriX | 01_custodian_name.trix | 348KB | XML with named graphs |
| N-Quads | 01_custodian_name.nq | 288KB | Quad-based processing |
All formats located in schemas/20251121/rdf/
Published Datasets
Denmark 🇩🇰 - ✅ COMPLETE (November 2025)
- 2,348 institutions (555 libraries, 594 archives, 1,199 branches)
- 43,429 RDF triples across 9 ontologies
- 769 Wikidata links (32.8% coverage)
- Formats: Turtle, RDF/XML, JSON-LD, N-Triples
See data/rdf/README.md for SPARQL examples and usage.
Ontology Alignment
| Ontology | Purpose | Coverage |
|---|---|---|
| CPOV (Core Public Organisation Vocabulary) | EU public sector standard | All institutions |
| Schema.org | Web semantics (Library, ArchiveOrganization) | All institutions |
| RiC-O (Records in Contexts Ontology) | Archival description | Archives |
| ORG (W3C Organization Ontology) | Hierarchical relationships | Branches |
| PROV-O (Provenance Ontology) | Data provenance tracking | All institutions |
| OWL | Semantic equivalence (Wikidata links) | 32.8% Denmark |
SPARQL Examples
# Find all libraries in Copenhagen
PREFIX schema: <http://schema.org/>
PREFIX cpov: <http://data.europa.eu/m8g/>
SELECT ?library ?name ?address WHERE {
?library a cpov:PublicOrganisation, schema:Library .
?library schema:name ?name .
?library schema:address ?addrNode .
?addrNode schema:addressLocality "København K" .
?addrNode schema:streetAddress ?address .
}
# Find all institutions with Wikidata links
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX schema: <http://schema.org/>
SELECT ?institution ?name ?wikidataID WHERE {
?institution schema:name ?name .
?institution owl:sameAs ?wikidataURI .
FILTER(STRSTARTS(STR(?wikidataURI), "http://www.wikidata.org/entity/Q"))
BIND(STRAFTER(STR(?wikidataURI), "http://www.wikidata.org/entity/") AS ?wikidataID)
}
See data/rdf/README.md for more examples.
Project Structure
glam-extractor/
├── pyproject.toml # Poetry configuration
├── README.md # This file
├── AGENTS.md # AI agent instructions
├── .opencode/ # AI agent documentation
│ ├── HYPER_MODULAR_STRUCTURE.md
│ └── SLOT_NAMING_CONVENTIONS.md
├── src/glam_extractor/ # Main package
│ ├── __init__.py
│ ├── cli.py # Command-line interface
│ ├── parsers/ # Conversation & CSV parsers
│ ├── extractors/ # NLP extraction engines
│ ├── crawlers/ # Web crawling (crawl4ai)
│ ├── validators/ # LinkML validation
│ ├── exporters/ # Multi-format export
│ ├── geocoding/ # Nominatim geocoding
│ └── utils/ # Utilities
├── schemas/20251121/ # LinkML schemas
│ ├── linkml/ # Hyper-modular schema (78 files)
│ │ ├── 01_custodian_name_modular.yaml
│ │ └── modules/
│ │ ├── metadata.yaml
│ │ ├── classes/ # 12 class modules
│ │ ├── enums/ # 5 enum modules
│ │ └── slots/ # 59 slot modules
│ ├── rdf/ # 8 RDF serialization formats
│ │ ├── 01_custodian_name.owl.ttl
│ │ ├── 01_custodian_name.nt
│ │ ├── 01_custodian_name.jsonld
│ │ ├── 01_custodian_name.rdf
│ │ ├── 01_custodian_name.n3
│ │ ├── 01_custodian_name.trig
│ │ ├── 01_custodian_name.trix
│ │ └── 01_custodian_name.nq
│ └── examples/ # LinkML instance examples
├── tests/ # Test suite
│ ├── unit/
│ ├── integration/
│ └── fixtures/
├── docs/ # Documentation
│ ├── plan/global_glam/ # Planning documents
│ ├── api/ # API documentation
│ ├── tutorials/ # User tutorials
│ └── examples/ # Usage examples
└── data/ # Reference data
├── ISIL-codes_2025-08-01.csv
├── voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv
└── ontology/ # Base ontologies (TOOI, CPOV, Schema.org, etc.)
Data Sources
Conversation JSON Files
139+ conversation files covering global GLAM research:
- Geographic coverage: 60+ countries across all continents
- Content: Institution names, locations, collections, digital platforms, partnerships
- Languages: Multilingual (English, Dutch, Portuguese, Spanish, Vietnamese, Japanese, Arabic, etc.)
CSV Datasets
- Dutch ISIL Registry (ISIL-codes_2025-08-01.csv): ~300 Dutch heritage institutions with authoritative ISIL codes
- Dutch Organizations (voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv): Comprehensive metadata including systems, partnerships, collection platforms
External Sources (Optional Enrichment)
- Wikidata: SPARQL queries for additional metadata
- VIAF: Authority file linking
- GeoNames: Geographic name authority
- Nominatim: Geocoding service
Data Quality & Provenance
Every extracted record includes provenance metadata:
provenance:
data_source: CONVERSATION_NLP | ISIL_REGISTRY | DUTCH_ORG_CSV | WEB_CRAWL | WIKIDATA
data_tier: TIER_1_AUTHORITATIVE | TIER_2_VERIFIED | TIER_3_CROWD_SOURCED | TIER_4_INFERRED
extraction_date: 2025-11-05T...
extraction_method: "spaCy NER + GPT-4 classification"
confidence_score: 0.0-1.0
conversation_id: "uuid"
source_url: "https://..."
verified_date: null
verified_by: null
Data Tiers:
- Tier 1: Official registries (ISIL, national registers) - highest authority
- Tier 2: Verified institutional data (official websites)
- Tier 3: Community-sourced data (Wikidata, OpenStreetMap)
- Tier 4: NLP-extracted or inferred data - requires verification
LinkML Schema
Hyper-Modular Architecture
The project uses a hyper-modular LinkML schema (schemas/20251121/linkml/01_custodian_name_modular.yaml) where every class, enum, and slot is defined in its own individual file for maximum maintainability and version control granularity.
Schema Structure:
- 78 YAML files total:
  - 12 class modules (modules/classes/)
  - 5 enum modules (modules/enums/)
  - 59 slot modules (modules/slots/)
  - 1 metadata module (modules/metadata.yaml)
  - 1 main schema (01_custodian_name_modular.yaml)
Direct Import Pattern:
imports:
- linkml:types
- modules/metadata
- modules/enums/AgentTypeEnum
- modules/slots/observed_name
- modules/classes/CustodianObservation
# ... 76 total individual module imports
Benefits:
- ✅ Complete transparency - all dependencies visible
- ✅ Granular version control - one file per concept
- ✅ Parallel development - no merge conflicts
- ✅ Selective imports - customize schemas easily
See .opencode/HYPER_MODULAR_STRUCTURE.md for complete architecture documentation.
Ontology Alignment
The schema integrates multiple international standards:
- CPOV: Core Public Organisation Vocabulary (EU public sector)
- TOOI: Dutch organizational ontology
- Schema.org: General web semantics
- CIDOC-CRM: Cultural heritage domain model
- RiC-O: Records in Contexts Ontology
- PROV-O: Provenance tracking
- PiCo: Person observations pattern
Key Classes:
- CustodianObservation: Source-based references (emic/etic perspectives)
- CustodianName: Standardized emic names
- CustodianReconstruction: Formal legal entities
- ReconstructionActivity: Entity derivation from observations
- Agent: People responsible for observations/reconstructions
- SourceDocument: Documentary evidence
- Identifier: External identifiers (ISIL, VIAF, Wikidata)
- TimeSpan: Temporal extents with fuzzy boundaries
- ConfidenceMeasure: Data quality metrics
Observation → Reconstruction Pattern:
SourceDocument → CustodianObservation → ReconstructionActivity → CustodianReconstruction
(text) (what source says) (synthesis method) (formal entity)
This pattern distinguishes between source-based references and scholar-derived formal entities, inspired by the PiCo (Persons in Context) ontology.
Development
Run Tests
poetry run pytest # All tests
poetry run pytest -m unit # Unit tests only
poetry run pytest -m integration # Integration tests only
poetry run pytest --cov # With coverage report
Code Quality
poetry run black src/ tests/ # Format code
poetry run ruff check src/ tests/ # Lint code
poetry run mypy src/ # Type checking
Pre-commit Hooks
poetry run pre-commit install
poetry run pre-commit run --all-files
Documentation
poetry run mkdocs serve # Serve docs locally
poetry run mkdocs build # Build static docs
Examples
Extract Brazilian Institutions
from glam_extractor import ConversationParser, InstitutionExtractor
# Parse conversation
parser = ConversationParser()
conversation = parser.load("Brazilian_GLAM_collection_inventories.json")
# Extract institutions
extractor = InstitutionExtractor()
institutions = extractor.extract(conversation)
# Print results
for inst in institutions:
print(f"{inst.name} ({inst.institution_type})")
print(f" Location: {inst.locations[0].city}, {inst.locations[0].country}")
print(f" Confidence: {inst.provenance.confidence_score}")
Cross-link Dutch Data
from glam_extractor import CSVParser, InstitutionExtractor
from glam_extractor.validators import LinkMLValidator
# Load Dutch ISIL registry
csv_parser = CSVParser()
dutch_institutions = csv_parser.load_isil_registry("ISIL-codes_2025-08-01.csv")
# Load Dutch organizations
dutch_orgs = csv_parser.load_dutch_organizations("voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv")
# Cross-link and merge
extractor = InstitutionExtractor()
merged = extractor.merge_dutch_data(dutch_institutions, dutch_orgs)
# Validate
validator = LinkMLValidator(schema="schemas/heritage_custodian.yaml")
results = validator.validate_batch(merged)
print(f"Valid: {results.valid_count}, Invalid: {results.invalid_count}")
Export to Multiple Formats
from glam_extractor.exporters import JSONLDExporter, RDFExporter, CSVExporter
# Load extracted data
institutions = load_institutions("output.jsonld")
# Export to RDF/Turtle
rdf_exporter = RDFExporter()
rdf_exporter.export(institutions, "output.ttl")
# Export to CSV
csv_exporter = CSVExporter()
csv_exporter.export(institutions, "output.csv")
# Export to Parquet
csv_exporter.export_parquet(institutions, "output.parquet")
Documentation
- Planning Docs (docs/plan/global_glam/):
  - 01-implementation-phases.md: 7-phase implementation plan
  - 02-architecture.md: System architecture and data flow
  - 03-dependencies.md: Technology stack and dependencies
  - 04-data-standardization.md: Data integration strategies
  - 05-design-patterns.md: Software design patterns
  - 06-consumers-use-cases.md: User segments and applications
- AI Agent Instructions (AGENTS.md):
  - NLP extraction guidelines
  - Data quality protocols
  - Agent workflow examples
- API Documentation: Generated from docstrings with mkdocstrings
Contributing
This is a research project. Contributions welcome!
- Fork the repository
- Create feature branch (git checkout -b feature/amazing-feature)
- Commit changes (git commit -m 'Add amazing feature')
- Push to branch (git push origin feature/amazing-feature)
- Open Pull Request
License
MIT License - see LICENSE file for details
Acknowledgments
- LinkML: Schema framework
- spaCy: NLP processing
- crawl4ai: Web crawling
- RDFLib: RDF processing
- Dutch ISIL Registry: Authoritative institution data
- Claude AI: Conversation data source
Contact
For questions or collaboration inquiries, please open an issue on GitHub.
Version: 0.1.0
Status: Alpha - Implementation in progress
Last Updated: 2025-11-05