glam/scripts/extract_egypt_step3.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00
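
To make the ontology patterns described above concrete, here is a minimal sketch (not the repository's test code; the `rdflib` modelling choice and every URI below are assumptions) of how an extracted partnership could be serialized with W3C Organization Ontology and PROV-O terms:

```python
# Hedged sketch: one plausible RDF encoding of an extracted partnership.
# All URIs are placeholders, not the project's real identifiers.
from rdflib import Graph, Namespace, URIRef
from rdflib.namespace import RDF

ORG = Namespace("http://www.w3.org/ns/org#")
PROV = Namespace("http://www.w3.org/ns/prov#")

g = Graph()
g.bind("org", ORG)
g.bind("prov", PROV)

partnership = URIRef("https://example.org/partnership/1")
g.add((partnership, RDF.type, ORG.Membership))
g.add((partnership, ORG.member, URIRef("https://example.org/institution/a")))
g.add((partnership, ORG.organization, URIRef("https://example.org/institution/b")))
# PROV-O provenance link back to the source conversation
g.add((partnership, PROV.wasDerivedFrom, URIRef("https://example.org/conversation/1")))

print(g.serialize(format="turtle"))
```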


#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Step 3
Extracts university libraries from Egyptian GLAM conversation.
"""
import sys
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional
import yaml
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from glam_extractor.models import (
    HeritageCustodian,
    Location,
    Identifier,
    Provenance,
    InstitutionType,
    DataSource,
    DataTier,
)
# Path to cached markdown content
MARKDOWN_PATH = Path("/tmp/egypt_content.md")


def extract_markdown_content() -> str:
    """Return the cached conversation markdown, or an empty string if missing."""
    # Returning "" (rather than raising) lets extract_institutions_step3 bail out cleanly
    if not MARKDOWN_PATH.exists():
        return ""
    with open(MARKDOWN_PATH, 'r', encoding='utf-8') as f:
        return f.read()
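
# The field regexes in parse_university_section below assume sections shaped
# roughly like this illustrative excerpt (not actual cached content):
#
#   ### **Cairo University Library System**
#   **Website**: https://example.edu
#   **Holdings**: ...
#   **Special Collections**: ...
#   **Digital Library**: KOHA https://example.edu/opac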

def parse_university_section(text: str, name: str) -> Optional[HeritageCustodian]:
    """Parse a university section and return a HeritageCustodian object."""
    # Extract website
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s*(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Extract digital library URL (a system name such as "KOHA" may precede it)
    digital_match = re.search(
        r'\*\*(?:Digital Library|OPAC System|Library Catalog)\*\*:\s*(?:KOHA\s+)?https?://[^\s\)]+',
        text
    )
    if digital_match:
        url_extract = re.search(r'https?://[^\s\)]+', digital_match.group(0))
        if url_extract:
            url = url_extract.group(0).strip()
            identifiers.append(Identifier(
                identifier_scheme="Digital Library",
                identifier_value=url,
                identifier_url=url
            ))

    # Build description from key facts
    description_parts = []

    # Holdings
    holdings_match = re.search(r'\*\*Holdings\*\*:(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
    if holdings_match:
        description_parts.append(holdings_match.group(1).strip())

    # Special Collections
    special_match = re.search(r'\*\*Special Collections\*\*:(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
    if special_match:
        description_parts.append(special_match.group(1).strip())

    # Digital Repository
    repo_match = re.search(r'\*\*Digital Repository\*\*:(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
    if repo_match:
        description_parts.append(repo_match.group(1).strip())

    # EULC Contribution
    eulc_match = re.search(r'\*\*EULC Contribution\*\*:(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
    if eulc_match:
        description_parts.append(f"EULC: {eulc_match.group(1).strip()}")

    # Integration/System info
    system_match = re.search(r'\*\*(?:Integration|OPAC System|System)\*\*:(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
    if system_match:
        description_parts.append(system_match.group(1).strip())

    description = ' '.join(description_parts) if description_parts else "University library in Egypt."
    if len(description) > 500:
        description = description[:500] + "..."

    # Create provenance metadata
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Generate ID from name
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Create the institution record
    # Note: university libraries use the LIBRARY type per the AGENTS.md taxonomy
    return HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=InstitutionType.LIBRARY,
        description=description,
        locations=[],
        identifiers=identifiers,
        provenance=provenance
    )
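
# Illustrative call (placeholder field values, not actual cached content):
#   parse_university_section(
#       "**Website**: https://library.example.edu\n**Holdings**: 2 million volumes",
#       "Example University Library",
#   )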

def extract_institutions_step3() -> List[HeritageCustodian]:
    """Step 3: Extract university libraries."""
    markdown = extract_markdown_content()
    if not markdown:
        return []

    institutions = []

    # University sections: (display name, start marker, end marker)
    university_sections = [
        ("Cairo University Library System", "### **Cairo University Library System**", "### **American University in Cairo"),
        ("American University in Cairo (AUC) Libraries", "### **American University in Cairo (AUC) Libraries**", "### **Alexandria University"),
        ("Alexandria University Libraries", "### **Alexandria University Libraries**", "### **Al-Azhar University"),
        ("Al-Azhar University Library", "### **Al-Azhar University Library**", "### **Additional Major Universities"),
        ("Ain Shams University Libraries", "**Ain Shams University**", "**Helwan University**"),
        ("Helwan University Libraries", "**Helwan University**", "**Assiut University**"),
        ("Assiut University Libraries", "**Assiut University**", "### **International Universities"),
        ("German University in Cairo (GUC) Library", "**German University in Cairo (GUC)**", "**British University in Egypt"),
        ("British University in Egypt (BUE) Library", "**British University in Egypt (BUE)**", "**Nile University**"),
        ("Nile University Library", "**Nile University**", "# Part III"),
    ]

    for display_name, start_marker, end_marker in university_sections:
        print(f"  Extracting {display_name}...")
        try:
            start_idx = markdown.find(start_marker)
            if start_idx == -1:
                print(f"    ⚠ Could not find start marker for {display_name}")
                continue
            # Search for the end marker only after the start marker so an
            # earlier occurrence elsewhere cannot truncate the section
            end_idx = markdown.find(end_marker, start_idx)
            if end_idx == -1:
                print(f"    ⚠ Could not find end marker for {display_name}")
                continue
            section = markdown[start_idx:end_idx]
            inst = parse_university_section(section, display_name)
            if inst:
                institutions.append(inst)
                print(f"    ✓ {inst.name}")
        except Exception as e:
            print(f"    ✗ Error: {e}")

    return institutions

def main():
    """Main extraction workflow."""
    print("=" * 60)
    print("Egyptian GLAM Extraction - Step 3: Universities")
    print("=" * 60)

    # Step 3: University Libraries
    print("\nSTEP 3: University Libraries")
    print("-" * 60)
    institutions = extract_institutions_step3()
    print(f"\n  → Extracted {len(institutions)} university libraries")

    # Summary
    print("\n" + "=" * 60)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print("=" * 60)

    # Save to YAML
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step3.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Print institution list
    print("\nExtracted Institutions:")
    for i, inst in enumerate(institutions, 1):
        print(f"  {i}. {inst.name}")


if __name__ == "__main__":
    main()