glam/scripts/extract_egypt_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

294 lines
11 KiB
Python

#!/usr/bin/env python3
"""
Extract Egyptian heritage institutions from conversation JSON file.
This script reads the Egyptian GLAM inventory conversation and extracts
heritage custodian records in LinkML-compliant format.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional
import yaml
# Import models
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
DigitalPlatform,
Provenance,
DataSource,
DataTier,
InstitutionType,
DigitalPlatformType
)
# Conversation file path
CONVERSATION_PATH = Path("/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-50-31-39e11630-a2af-407c-a365-d485eb8257b0-Egyptian_GLAM_resources_inventory.json")
def extract_markdown_content(json_path: Path) -> str:
    """
    Extract the Egyptian GLAM markdown artifact from a conversation JSON export.

    Scans every chat message's content entries for an 'artifacts' tool_use
    whose input payload contains the inventory markdown.

    Args:
        json_path: Path to the exported conversation JSON file.

    Returns:
        The markdown text of the first matching artifact, or "" if none found.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # Robustness fix: .get() instead of data['chat_messages'] so a malformed
    # or partial export yields "" rather than raising KeyError.
    for msg in data.get('chat_messages', []):
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                markdown = content.get('input', {}).get('content', '')
                # Require the marker phrase so unrelated artifacts are skipped.
                if markdown and 'Egyptian GLAM' in markdown:
                    return markdown
    return ""
def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """
    Parse a single institution section from markdown text.

    Args:
        text: Markdown text for one institution
        institution_type: Type of institution (LIBRARY, MUSEUM, etc.)

    Returns:
        HeritageCustodian object or None if parsing fails
    """
    # Institution name is the first bolded "##"/"###" heading.
    name_match = re.search(r'###?\s+\*\*(.+?)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()

    # Alternative (typically Arabic) name. Two layouts occur in the markdown:
    #   **<other name>** (<gloss>)       -> alternative name is group(1)
    #   **<same name>** (<Arabic name>)  -> alternative name is group(2)
    # BUG FIX: the original only appended group(1) when it differed from the
    # heading name, silently discarding the parenthesized Arabic name in the
    # second (heading-attached) layout.
    alternative_names = []
    arabic_match = re.search(r'\*\*(.+?)\*\*\s+\((.+?)\)', text)
    if arabic_match:
        if arabic_match.group(1) != name:
            alternative_names.append(arabic_match.group(1))
        else:
            alternative_names.append(arabic_match.group(2))

    # Location: pull the Address/Location line and derive a city from it.
    location = None
    city_match = re.search(r'\*\*(?:Address|Location)\*\*:\s*(.+?)(?:\n|$)', text)
    if city_match:
        address_text = city_match.group(1).strip()
        # Prefer a known major city mentioned anywhere in the address
        # (order matters: matches the original if/elif chain).
        for known_city in ('Cairo', 'Alexandria', 'Aswan', 'Luxor'):
            if known_city in address_text:
                city = known_city
                break
        else:
            # Fall back to the last comma-separated token; default to Cairo
            # when the address has no comma at all.
            parts = address_text.split(',')
            city = parts[-1].strip() if len(parts) > 1 else 'Cairo'
        location = Location(
            city=city,
            country="EG",
            street_address=address_text
        )

    # Website URL, recorded as an Identifier in scheme "Website".
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s+(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Description: first Collections / Digital Infrastructure section, if any.
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL)
    if collection_match:
        description = collection_match.group(1).strip()
    else:
        # Fix: dropped a pointless f-string prefix on this constant literal.
        description = "Heritage institution in Egypt."
    # Truncate overly long descriptions for readability.
    if len(description) > 500:
        description = description[:500] + "..."

    # Provenance metadata: marks this record as NLP-inferred (tier 4) from the
    # source conversation.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Stable w3id identifier derived from a lowercase slug of the name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    custodian = HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=institution_type,
        alternative_names=alternative_names if alternative_names else None,
        description=description,
        locations=[location] if location else [],
        identifiers=identifiers if identifiers else [],
        provenance=provenance
    )
    return custodian
def _slice_section(markdown: str, start_marker: str, end_marker: str) -> str:
    """Return the markdown between start_marker (inclusive) and end_marker,
    or "" if either marker is missing or they are out of order."""
    start = markdown.find(start_marker)
    end = markdown.find(end_marker)
    if start == -1 or end == -1 or end <= start:
        return ""
    return markdown[start:end]


def extract_institutions_step1() -> List[HeritageCustodian]:
    """
    Step 1: Extract major national institutions (libraries and museums).

    Returns:
        List of HeritageCustodian objects
    """
    print("Step 1: Extracting markdown content...")
    markdown = extract_markdown_content(CONVERSATION_PATH)
    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []
    print(f" Found {len(markdown)} characters of content")

    institutions = []
    # (progress label, start marker, end marker, institution type) per section.
    sections = [
        ("Step 2: Extracting Bibliotheca Alexandrina...",
         "### **Bibliotheca Alexandrina**",
         "### **Egyptian National Library",
         InstitutionType.LIBRARY),
        ("Step 3: Extracting Egyptian National Library...",
         "### **Egyptian National Library",
         "### **National Archives",
         InstitutionType.LIBRARY),
        ("Step 4: Extracting National Archives...",
         "### **National Archives",
         "# Part II",
         InstitutionType.ARCHIVE),
    ]
    for label, start_marker, end_marker, inst_type in sections:
        print(f"\n{label}")
        # BUG FIX: str.find() returns -1 when a marker is absent; the original
        # sliced with the raw indices, so a missing marker silently produced a
        # wrong (or empty) section. _slice_section guards both indices, making
        # this consistent with the -1 checks already done in step 2.
        section = _slice_section(markdown, start_marker, end_marker)
        if not section:
            continue
        inst = parse_institution_section(section, inst_type)
        if inst:
            institutions.append(inst)
            print(f" ✓ Extracted: {inst.name}")
    return institutions
def extract_institutions_step2() -> List[HeritageCustodian]:
    """
    Step 2: Extract major museums.

    Returns:
        List of HeritageCustodian objects
    """
    print("Step 2: Extracting museums...")
    markdown = extract_markdown_content(CONVERSATION_PATH)
    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []
    print(f" Found {len(markdown)} characters of content")

    # (display name, start marker, end marker) for each museum section.
    museum_sections = [
        ("Egyptian Museum Cairo (EMC)", "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum (GEM)", "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization (NMEC)", "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo", "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum", "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria", "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]

    results = []
    for name, start_marker, end_marker in museum_sections:
        print(f"\n Extracting {name}...")
        try:
            bounds = (markdown.find(start_marker), markdown.find(end_marker))
            # A missing marker means the conversation layout changed; skip.
            if -1 in bounds:
                print(f" ⚠ Could not find section markers for {name}")
                continue
            record = parse_institution_section(
                markdown[bounds[0]:bounds[1]], InstitutionType.MUSEUM)
            if record:
                results.append(record)
                print(f" ✓ Extracted: {record.name}")
        except Exception as e:
            # Best-effort extraction: one bad section must not abort the rest.
            print(f" ✗ Error extracting {name}: {e}")
    return results
def main():
    """Main extraction workflow - Steps 1 and 2."""
    banner = "=" * 60
    print(banner)
    print("Egyptian GLAM Institution Extraction - STEPS 1 & 2")
    print(banner)

    # Batch 1: national libraries and archives.
    print("\n" + banner)
    print("STEP 1: National Libraries and Archives")
    print(banner)
    all_records = extract_institutions_step1()
    print(f"\nStep 1 Complete: Extracted {len(all_records)} institutions")

    # Batch 2: museums, appended to the same result list.
    print("\n" + banner)
    print("STEP 2: Museums")
    print(banner)
    museum_records = extract_institutions_step2()
    all_records.extend(museum_records)
    print(f"\nStep 2 Complete: Extracted {len(museum_records)} museums")

    print(f"\n{banner}")
    print(f"Total Extracted: {len(all_records)} institutions")
    print(f"{banner}")

    # Persist as LinkML instance YAML under data/instances/.
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    # Pydantic models -> plain JSON-mode dicts so PyYAML can serialize them.
    dumped = [rec.model_dump(exclude_none=True, mode='json') for rec in all_records]
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(dumped, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved {len(all_records)} institutions to {output_path.name}")

    # Human-readable summary of everything extracted.
    print("\nInstitutions extracted:")
    for idx, rec in enumerate(all_records, 1):
        print(f" {idx}. {rec.name} ({rec.institution_type})")
        if rec.locations:
            print(f" Location: {rec.locations[0].city}")
        if rec.identifiers:
            print(f" Website: {rec.identifiers[0].identifier_url}")


if __name__ == "__main__":
    main()