glam/scripts/extract_egypt_step2.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

249 lines
8.9 KiB
Python

#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Steps 1 & 2
Extracts heritage institutions from Egyptian GLAM conversation JSON.
Step 1: National libraries and archives
Step 2: Major museums
"""
import sys
import json
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional
import yaml
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
Provenance,
InstitutionType,
DataSource,
DataTier
)
# Path to cached markdown content (generated from conversation artifacts)
MARKDOWN_PATH = Path("/tmp/egypt_content.md")


def extract_markdown_content(markdown_path: Path = MARKDOWN_PATH) -> str:
    """Return the cached markdown content, or "" when the cache is absent.

    Note: The conversation JSON stores comprehensive reports in artifact
    tool_result fields, not in the simple text fields. This cached markdown
    was extracted from those artifacts.

    Args:
        markdown_path: Location of the cached markdown file.

    Returns:
        The file's full text, or the empty string if the file does not
        exist. Callers already treat a falsy result as "nothing to
        extract", so a missing cache no longer crashes the script.
    """
    try:
        return markdown_path.read_text(encoding='utf-8')
    except FileNotFoundError:
        return ""
def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """Parse one markdown institution section into a HeritageCustodian.

    Args:
        text: Markdown for a single '### **Name**' section.
        institution_type: Classification to assign (library/archive/museum).

    Returns:
        A populated HeritageCustodian, or None when no bolded institution
        name heading is present in the section.
    """
    # Institution name is the bolded heading text: '### **Name**'
    name_match = re.search(r'###\s+\*\*([^*]+)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()
    alternative_names = _parse_alternative_names(text)
    location = _parse_location(text)
    identifiers = _parse_identifiers(text)
    description = _parse_description(text)
    # Provenance: everything here is NLP-inferred (tier 4) from a single
    # known conversation, so confidence is fixed at 0.85.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )
    # Stable w3id identifier derived from a lowercase slug of the name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"
    # Per AGENTS.md schema quirks: don't pass None for optional list fields;
    # alternative_names is only added when non-empty.
    custodian_kwargs = {
        'id': institution_id,
        'name': name,
        'institution_type': institution_type,
        'description': description,
        'locations': [location] if location else [],
        'identifiers': identifiers,
        'provenance': provenance
    }
    if alternative_names:
        custodian_kwargs['alternative_names'] = alternative_names
    return HeritageCustodian(**custodian_kwargs)


def _parse_alternative_names(text: str) -> Optional[List[str]]:
    """Return Arabic-script alternative names found in *text*, else None.

    Per AGENTS.md schema quirks: returns None rather than an empty list.
    """
    arabic_match = re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+[^\n]*', text)
    return [arabic_match.group(0).strip()] if arabic_match else None


def _parse_location(text: str) -> Optional[Location]:
    """Build a Location from '**Location**:'/'**Address**:' lines, else None."""
    city_match = re.search(r'\*\*Location\*\*:\s*([^,\n]+)', text)
    address_match = re.search(r'\*\*Address\*\*:\s*(.+)', text)
    if not (city_match or address_match):
        return None
    return Location(
        city=city_match.group(1).strip() if city_match else None,
        street_address=address_match.group(1).strip() if address_match else None,
        country="EG"  # this extractor only handles Egyptian institutions
    )


def _parse_identifiers(text: str) -> List[Identifier]:
    """Return identifiers extracted from *text* (currently only the website)."""
    identifiers: List[Identifier] = []
    website_match = re.search(r'\*\*Website\*\*:\s*(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))
    return identifiers


def _parse_description(text: str) -> str:
    """Return a description from collections/holdings fields, capped at ~500 chars."""
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure|Holdings)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL)
    description = (collection_match.group(1).strip() if collection_match
                   else "Heritage institution in Egypt.")
    # Truncate overly long descriptions, marking the cut with an ellipsis.
    return description[:500] + "..." if len(description) > 500 else description
def extract_institutions_step1() -> List[HeritageCustodian]:
    """Step 1: Extract national libraries and archives.

    Returns:
        Extracted HeritageCustodian records; empty when the cached markdown
        is missing or no section could be parsed.
    """
    markdown = extract_markdown_content()
    if not markdown:
        return []
    # (display name, start marker, end marker, institution type)
    sections = [
        ("Bibliotheca Alexandrina", "### **Bibliotheca Alexandrina**",
         "### **Egyptian National Library", InstitutionType.LIBRARY),
        ("Egyptian National Library", "### **Egyptian National Library",
         "### **National Archives", InstitutionType.LIBRARY),
        ("National Archives", "### **National Archives",
         "# Part II", InstitutionType.ARCHIVE),
    ]
    institutions = []
    for index, (display_name, start_marker, end_marker, inst_type) in enumerate(sections):
        # Original output printed a leading blank line before the first entry only.
        prefix = "\n" if index == 0 else ""
        print(f"{prefix} Extracting {display_name}...")
        start_idx = markdown.find(start_marker)
        end_idx = markdown.find(end_marker)
        # Guard against missing markers: str.find returns -1, and slicing
        # with -1 would silently produce a garbage section (the previous
        # version sliced unconditionally). Mirrors step 2's handling.
        if start_idx == -1 or end_idx == -1:
            print(" ⚠ Could not find section")
            continue
        inst = parse_institution_section(markdown[start_idx:end_idx], inst_type)
        if inst:
            institutions.append(inst)
            print(f"{inst.name}")
    return institutions
def extract_institutions_step2() -> List[HeritageCustodian]:
    """Step 2: Extract major museums.

    Returns:
        Extracted HeritageCustodian museum records; empty when the cached
        markdown is missing.
    """
    markdown = extract_markdown_content()
    if not markdown:
        return []
    institutions = []
    # (display name, start marker, end marker) for each museum section.
    museum_sections = [
        ("Egyptian Museum Cairo", "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum", "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization", "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo", "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum", "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria", "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]
    for display_name, start_marker, end_marker in museum_sections:
        print(f" Extracting {display_name}...")
        try:
            start_idx = markdown.find(start_marker)
            end_idx = markdown.find(end_marker)
            # str.find returns -1 when a marker is absent; skip rather
            # than slice a garbage section.
            if start_idx == -1 or end_idx == -1:
                print(" ⚠ Could not find section")
                continue
            section = markdown[start_idx:end_idx]
            inst = parse_institution_section(section, InstitutionType.MUSEUM)
            if inst:
                institutions.append(inst)
                print(f"{inst.name}")
        except Exception as e:
            # Deliberate best-effort: one malformed section (e.g. model
            # validation failure) should not abort the remaining museums.
            print(f" ✗ Error: {e}")
    return institutions
def main():
    """Run the Step 1 + Step 2 extraction, save YAML, and print a summary."""
    banner = "=" * 60
    rule = "-" * 60
    print(banner)
    print("Egyptian GLAM Extraction - Steps 1 & 2")
    print(banner)

    # Step 1: libraries and archives
    print("\nSTEP 1: National Libraries and Archives")
    print(rule)
    institutions = extract_institutions_step1()
    print(f"\n → Extracted {len(institutions)} institutions")

    # Step 2: museums (appended to the same running list)
    print("\nSTEP 2: Major Museums")
    print(rule)
    museums = extract_institutions_step2()
    institutions.extend(museums)
    print(f"\n → Extracted {len(museums)} museums")

    # Overall summary banner
    print("\n" + banner)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print(banner)

    # Persist every record as YAML under data/instances/
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Human-readable listing of everything extracted
    print("\nExtracted Institutions:")
    for position, inst in enumerate(institutions, start=1):
        location_suffix = f" - {inst.locations[0].city}" if inst.locations else ""
        print(f" {position}. {inst.name} ({inst.institution_type}){location_suffix}")


if __name__ == "__main__":
    main()