- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
249 lines · 8.9 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Egyptian GLAM Institution Extraction - Steps 1 & 2
|
|
Extracts heritage institutions from Egyptian GLAM conversation JSON.
|
|
|
|
Step 1: National libraries and archives
|
|
Step 2: Major museums
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Optional
|
|
import yaml
|
|
|
|
# Add the parent directory to sys.path so the `glam_extractor` package
# resolves when this file is run directly as a script (not installed).
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Location,
|
|
Identifier,
|
|
Provenance,
|
|
InstitutionType,
|
|
DataSource,
|
|
DataTier
|
|
)
|
|
|
|
# Path to cached markdown content (generated from conversation artifacts)
MARKDOWN_PATH = Path("/tmp/egypt_content.md")


def extract_markdown_content(markdown_path: Path = MARKDOWN_PATH) -> str:
    """Return the cached markdown content for the Egyptian GLAM conversation.

    Note: The conversation JSON stores comprehensive reports in artifact
    tool_result fields, not in the simple text fields. This cached markdown
    was extracted from those artifacts.

    Args:
        markdown_path: Location of the cached markdown file.

    Returns:
        The file's full text, or an empty string when the cache file does
        not exist. Callers already treat falsy content as "nothing to
        extract", so a missing cache no longer raises FileNotFoundError.
    """
    if not markdown_path.is_file():
        return ""
    return markdown_path.read_text(encoding='utf-8')
|
|
|
|
def _extract_alternative_names(text: str) -> Optional[List[str]]:
    """Return Arabic alternative names found in the section, or None.

    Per AGENTS.md schema quirks: return None rather than an empty list.
    """
    arabic_match = re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+[^\n]*', text)
    if arabic_match:
        return [arabic_match.group(0).strip()]
    return None


def _extract_location(text: str) -> "Optional[Location]":
    """Parse **Location**/**Address** fields into a Location (country fixed to EG)."""
    city_match = re.search(r'\*\*Location\*\*:\s*([^,\n]+)', text)
    address_match = re.search(r'\*\*Address\*\*:\s*(.+)', text)
    if not (city_match or address_match):
        return None
    return Location(
        city=city_match.group(1).strip() if city_match else None,
        street_address=address_match.group(1).strip() if address_match else None,
        country="EG"
    )


def _extract_identifiers(text: str) -> "List[Identifier]":
    """Collect identifiers from the section (currently only the website URL)."""
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s*(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))
    return identifiers


def _extract_description(text: str) -> str:
    """Build a description from collection/holdings fields, truncated past 500 chars."""
    description_parts = []
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure|Holdings)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL)
    if collection_match:
        description_parts.append(collection_match.group(1).strip())

    description = ' '.join(description_parts) if description_parts else "Heritage institution in Egypt."
    # Cap very long descriptions, marking the cut with an ellipsis.
    return description[:500] + "..." if len(description) > 500 else description


def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """Parse one markdown institution section into a HeritageCustodian.

    Args:
        text: Markdown slice that starts with a ``### **Name**`` heading.
        institution_type: Classification to assign (library/archive/museum).

    Returns:
        A populated HeritageCustodian, or None when no heading is present.
    """
    # Institution name comes from the first `### **Name**` heading; without
    # it there is nothing to anchor the record on.
    name_match = re.search(r'###\s+\*\*([^*]+)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()

    alternative_names = _extract_alternative_names(text)
    location = _extract_location(text)
    identifiers = _extract_identifiers(text)
    description = _extract_description(text)

    # Provenance: Tier-4 inferred data extracted from the source conversation.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Stable w3id identifier derived from a lowercase slug of the name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Per AGENTS.md schema quirks: don't pass None for optional list fields.
    custodian_kwargs = {
        'id': institution_id,
        'name': name,
        'institution_type': institution_type,
        'description': description,
        'locations': [location] if location else [],
        'identifiers': identifiers if identifiers else [],
        'provenance': provenance
    }

    # Only add alternative_names when present (optional field).
    if alternative_names:
        custodian_kwargs['alternative_names'] = alternative_names

    return HeritageCustodian(**custodian_kwargs)
|
|
|
|
def extract_institutions_step1() -> List[HeritageCustodian]:
    """Step 1: Extract national libraries and archives.

    Returns:
        HeritageCustodian records for every section that could be located
        and parsed; an empty list when the markdown cache is missing/empty.
    """
    markdown = extract_markdown_content()
    if not markdown:
        return []

    institutions = []

    # Table-driven section slicing, consistent with extract_institutions_step2:
    # (display name, start marker, end marker, institution type).
    sections = [
        ("Bibliotheca Alexandrina", "### **Bibliotheca Alexandrina**",
         "### **Egyptian National Library", InstitutionType.LIBRARY),
        ("Egyptian National Library", "### **Egyptian National Library",
         "### **National Archives", InstitutionType.LIBRARY),
        ("National Archives", "### **National Archives",
         "# Part II", InstitutionType.ARCHIVE),
    ]

    print()
    for display_name, start_marker, end_marker, inst_type in sections:
        print(f" Extracting {display_name}...")
        start_idx = markdown.find(start_marker)
        end_idx = markdown.find(end_marker)

        # Fix: str.find returns -1 on a miss; the previous code sliced with
        # -1 anyway, silently producing a bogus section instead of skipping.
        if start_idx == -1 or end_idx == -1:
            print(" ⚠ Could not find section")
            continue

        inst = parse_institution_section(markdown[start_idx:end_idx], inst_type)
        if inst:
            institutions.append(inst)
            print(f" ✓ {inst.name}")

    return institutions
|
|
|
|
def extract_institutions_step2() -> List[HeritageCustodian]:
    """Step 2: Extract major museums."""
    markdown = extract_markdown_content()
    if not markdown:
        return []

    results = []

    # (label, start marker, end marker) for each museum section to slice out.
    section_specs = [
        ("Egyptian Museum Cairo", "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum", "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization", "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo", "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum", "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria", "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]

    for label, begin_marker, finish_marker in section_specs:
        print(f" Extracting {label}...")
        try:
            begin = markdown.find(begin_marker)
            finish = markdown.find(finish_marker)

            # Skip sections whose markers are absent from the markdown.
            if begin == -1 or finish == -1:
                print(" ⚠ Could not find section")
                continue

            parsed = parse_institution_section(markdown[begin:finish], InstitutionType.MUSEUM)
            if parsed:
                results.append(parsed)
                print(f" ✓ {parsed.name}")
        except Exception as e:
            # Best-effort extraction: report the failure and keep going.
            print(f" ✗ Error: {e}")

    return results
|
|
|
|
def main():
    """Main extraction workflow: run steps 1 and 2, save YAML, print summary."""
    banner = "=" * 60
    divider = "-" * 60

    print(banner)
    print("Egyptian GLAM Extraction - Steps 1 & 2")
    print(banner)

    # Step 1: national libraries and archives.
    print("\nSTEP 1: National Libraries and Archives")
    print(divider)
    collected = extract_institutions_step1()
    print(f"\n → Extracted {len(collected)} institutions")

    # Step 2: major museums, appended to the same running list.
    print("\nSTEP 2: Major Museums")
    print(divider)
    museum_records = extract_institutions_step2()
    collected.extend(museum_records)
    print(f"\n → Extracted {len(museum_records)} museums")

    # Overall summary.
    print("\n" + banner)
    print(f"TOTAL: {len(collected)} institutions extracted")
    print(banner)

    # Persist every record as YAML under data/instances/.
    destination = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    destination.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving to: {destination}")

    with open(destination, 'w', encoding='utf-8') as handle:
        serialized = [record.model_dump(exclude_none=True, mode='json') for record in collected]
        yaml.dump(serialized, handle, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved to {destination.name}")

    # Human-readable listing of everything that was extracted.
    print("\nExtracted Institutions:")
    for idx, record in enumerate(collected, 1):
        suffix = f" - {record.locations[0].city}" if record.locations else ""
        print(f" {idx}. {record.name} ({record.institution_type}){suffix}")
|
|
|
# Script entry point: run the full extraction workflow when executed directly.
if __name__ == "__main__":
    main()
|