#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Step 3

Extracts university libraries from a cached markdown dump of the Egyptian
GLAM conversation, builds HeritageCustodian records, and saves them to YAML.
"""
import sys
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional

import yaml

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from glam_extractor.models import (
    HeritageCustodian, Location, Identifier, Provenance,
    InstitutionType, DataSource, DataTier
)

# Path to cached markdown content
MARKDOWN_PATH = Path("/tmp/egypt_content.md")

# Matches an http(s) URL up to the first whitespace or closing parenthesis.
URL_RE = re.compile(r'https?://[^\s\)]+')


def extract_markdown_content() -> str:
    """Return the full text of the cached markdown file.

    Raises:
        FileNotFoundError: if MARKDOWN_PATH does not exist.
    """
    with open(MARKDOWN_PATH, 'r', encoding='utf-8') as f:
        return f.read()


def parse_university_section(text: str, name: str) -> Optional[HeritageCustodian]:
    """Parse one university's markdown section into a HeritageCustodian.

    Args:
        text: The markdown slice for a single university section.
        name: Display name to use for the institution record.

    Returns:
        A populated HeritageCustodian (never None in practice; the Optional
        return type is kept for interface compatibility).
    """
    identifiers = []

    # Extract main website URL, e.g. "**Website**: https://example.edu"
    website_match = re.search(r'\*\*Website\*\*:\s*(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Extract digital library / OPAC URL. The optional "KOHA " prefix is the
    # system name that sometimes precedes the URL in the source markdown.
    # BUG FIX: the previous code required 'https://' in the match, which
    # silently dropped plain-http catalog URLs; the regex already guarantees
    # an http(s) URL is present, so no scheme check is needed.
    digital_match = re.search(
        r'\*\*(?:Digital Library|OPAC System|Library Catalog)\*\*:'
        r'\s*(?:KOHA\s+)?(?:https?://[^\s\)]+)',
        text
    )
    if digital_match:
        url_extract = URL_RE.search(digital_match.group(0))
        if url_extract:
            url = url_extract.group(0).strip()
            identifiers.append(Identifier(
                identifier_scheme="Digital Library",
                identifier_value=url,
                identifier_url=url
            ))

    # Build a description by concatenating the values of known "**Key**:"
    # fields. Each field value runs until the next bold key or end of text.
    description_parts = []

    def _field(pattern: str) -> Optional[str]:
        # One bold-key field value, or None if the key is absent.
        m = re.search(pattern + r'(.+?)(?=\n\*\*|\Z)', text, re.DOTALL)
        return m.group(1).strip() if m else None

    holdings = _field(r'\*\*Holdings\*\*:')
    if holdings:
        description_parts.append(holdings)

    special = _field(r'\*\*Special Collections\*\*:')
    if special:
        description_parts.append(special)

    repo = _field(r'\*\*Digital Repository\*\*:')
    if repo:
        description_parts.append(repo)

    eulc = _field(r'\*\*EULC Contribution\*\*:')
    if eulc:
        description_parts.append(f"EULC: {eulc}")

    system = _field(r'\*\*(?:Integration|OPAC System|System)\*\*:')
    if system:
        description_parts.append(system)

    description = (' '.join(description_parts) if description_parts
                   else "University library in Egypt.")
    # Truncate long descriptions with an ellipsis marker.
    if len(description) > 500:
        description = description[:500] + "..."

    # Create provenance metadata. Confidence is a fixed heuristic for
    # NLP-extracted (Tier 4 inferred) records from this conversation.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Generate a stable slug-based w3id identifier from the display name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Note: University libraries use LIBRARY type per AGENTS.md taxonomy.
    return HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=InstitutionType.LIBRARY,
        description=description,
        locations=[],
        identifiers=identifiers,
        provenance=provenance
    )


def extract_institutions_step3() -> List[HeritageCustodian]:
    """Step 3: Extract university libraries from the cached markdown.

    Returns:
        List of HeritageCustodian records, one per successfully parsed
        university section (sections whose markers are missing are skipped).
    """
    markdown = extract_markdown_content()
    if not markdown:
        return []

    institutions = []

    # (display name, start marker, end marker) triples delimiting each
    # university's section in the markdown document.
    university_sections = [
        ("Cairo University Library System",
         "### **Cairo University Library System**",
         "### **American University in Cairo"),
        ("American University in Cairo (AUC) Libraries",
         "### **American University in Cairo (AUC) Libraries**",
         "### **Alexandria University"),
        ("Alexandria University Libraries",
         "### **Alexandria University Libraries**",
         "### **Al-Azhar University"),
        ("Al-Azhar University Library",
         "### **Al-Azhar University Library**",
         "### **Additional Major Universities"),
        ("Ain Shams University Libraries",
         "**Ain Shams University**",
         "**Helwan University**"),
        ("Helwan University Libraries",
         "**Helwan University**",
         "**Assiut University**"),
        ("Assiut University Libraries",
         "**Assiut University**",
         "### **International Universities"),
        ("German University in Cairo (GUC) Library",
         "**German University in Cairo (GUC)**",
         "**British University in Egypt"),
        ("British University in Egypt (BUE) Library",
         "**British University in Egypt (BUE)**",
         "**Nile University**"),
        ("Nile University Library",
         "**Nile University**",
         "# Part III"),
    ]

    for display_name, start_marker, end_marker in university_sections:
        print(f"  Extracting {display_name}...")
        try:
            start_idx = markdown.find(start_marker)
            # BUG FIX: search for the end marker AFTER the start marker;
            # searching from position 0 could find an earlier occurrence and
            # yield an empty or inverted slice.
            end_idx = markdown.find(end_marker, start_idx if start_idx != -1 else 0)
            if start_idx == -1 or end_idx == -1:
                print(f"    ⚠ Could not find section markers")
                continue
            section = markdown[start_idx:end_idx]
            inst = parse_university_section(section, display_name)
            if inst:
                institutions.append(inst)
                print(f"    ✓ {inst.name}")
        except Exception as e:
            # Best-effort extraction: a failure in one section must not
            # abort the remaining sections.
            print(f"    ✗ Error: {e}")

    return institutions


def main():
    """Main extraction workflow: extract, summarize, and save to YAML."""
    print("="*60)
    print("Egyptian GLAM Extraction - Step 3: Universities")
    print("="*60)

    # Step 3: University Libraries
    print("\nSTEP 3: University Libraries")
    print("-"*60)
    institutions = extract_institutions_step3()
    print(f"\n  → Extracted {len(institutions)} university libraries")

    # Summary
    print("\n" + "="*60)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print("="*60)

    # Save to YAML under <repo>/data/instances/egypt_step3.yaml.
    output_path = (Path(__file__).parent.parent / "data"
                   / "instances" / "egypt_step3.yaml")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        # mode='json' makes pydantic serialize enums/datetimes to plain
        # JSON-compatible values that yaml.dump can handle.
        records = [inst.model_dump(exclude_none=True, mode='json')
                   for inst in institutions]
        yaml.dump(records, f, allow_unicode=True,
                  default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Print institution list
    print("\nExtracted Institutions:")
    for i, inst in enumerate(institutions, 1):
        print(f"  {i}. {inst.name}")


if __name__ == "__main__":
    main()