#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Steps 1 & 2

Extracts heritage institutions from Egyptian GLAM conversation JSON.
Step 1: National libraries and archives
Step 2: Major museums
"""
import sys
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional

import yaml

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from glam_extractor.models import (
    HeritageCustodian, Location, Identifier, Provenance,
    InstitutionType, DataSource, DataTier
)

# Path to cached markdown content (generated from conversation artifacts)
MARKDOWN_PATH = Path("/tmp/egypt_content.md")


def extract_markdown_content(markdown_path: Path = MARKDOWN_PATH) -> str:
    """Return the cached markdown content, or "" if the cache is missing.

    Note: The conversation JSON stores comprehensive reports in artifact
    tool_result fields, not in the simple text fields. This cached markdown
    was extracted from those artifacts.
    """
    # Returning "" (instead of propagating FileNotFoundError) lets the
    # callers' existing `if not markdown: return []` guards actually work.
    try:
        with open(markdown_path, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        return ""


def _find_section(markdown: str, start_marker: str, end_marker: str) -> Optional[str]:
    """Return the slice of *markdown* between two markers, or None.

    Guards against `str.find` returning -1 for a missing marker, which
    would otherwise produce a nonsense slice like `markdown[-1:idx]`.
    """
    start_idx = markdown.find(start_marker)
    end_idx = markdown.find(end_marker)
    if start_idx == -1 or end_idx == -1:
        return None
    return markdown[start_idx:end_idx]


def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """Parse one markdown institution section into a HeritageCustodian.

    Returns None when no `### **Name**` heading is present in *text*.
    """
    # Institution name: first `### **...**` heading in the section.
    name_match = re.search(r'###\s+\*\*([^*]+)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()

    # Alternative names: first run of Arabic script found in the section.
    alternative_names = []
    arabic_match = re.search(r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+[^\n]*', text)
    if arabic_match:
        alternative_names.append(arabic_match.group(0).strip())
    # Per AGENTS.md schema quirks: use None rather than an empty list.
    alternative_names = alternative_names if alternative_names else None

    # Location: city and/or street address; country is always Egypt.
    location = None
    city_match = re.search(r'\*\*Location\*\*:\s*([^,\n]+)', text)
    address_match = re.search(r'\*\*Address\*\*:\s*(.+)', text)
    if city_match or address_match:
        location = Location(
            city=city_match.group(1).strip() if city_match else None,
            street_address=address_match.group(1).strip() if address_match else None,
            country="EG"
        )

    # Identifiers: currently only the institution's website URL.
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s*(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Description: collections/holdings paragraph, truncated to 500 chars.
    description_parts = []
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure|Holdings)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL
    )
    if collection_match:
        description_parts.append(collection_match.group(1).strip())
    description = ' '.join(description_parts) if description_parts else "Heritage institution in Egypt."
    if len(description) > 500:
        description = description[:500] + "..."

    # Provenance metadata shared by every record this script produces.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Stable identifier derived from a slug of the institution name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Per AGENTS.md schema quirks: don't pass None for optional list fields;
    # alternative_names is only included when non-empty.
    custodian_kwargs = {
        'id': institution_id,
        'name': name,
        'institution_type': institution_type,
        'description': description,
        'locations': [location] if location else [],
        'identifiers': identifiers,
        'provenance': provenance
    }
    if alternative_names:
        custodian_kwargs['alternative_names'] = alternative_names

    return HeritageCustodian(**custodian_kwargs)


def extract_institutions_step1() -> List[HeritageCustodian]:
    """Step 1: Extract national libraries and archives."""
    markdown = extract_markdown_content()
    if not markdown:
        return []

    institutions = []

    # (progress message, start marker, end marker, institution type)
    sections = [
        ("\n Extracting Bibliotheca Alexandrina...",
         "### **Bibliotheca Alexandrina**", "### **Egyptian National Library",
         InstitutionType.LIBRARY),
        (" Extracting Egyptian National Library...",
         "### **Egyptian National Library", "### **National Archives",
         InstitutionType.LIBRARY),
        (" Extracting National Archives...",
         "### **National Archives", "# Part II",
         InstitutionType.ARCHIVE),
    ]
    for message, start_marker, end_marker, inst_type in sections:
        print(message)
        # _find_section guards against missing markers (str.find == -1),
        # which the original unguarded slicing silently mishandled.
        section = _find_section(markdown, start_marker, end_marker)
        if section:
            inst = parse_institution_section(section, inst_type)
            if inst:
                institutions.append(inst)
                print(f" ✓ {inst.name}")

    return institutions


def extract_institutions_step2() -> List[HeritageCustodian]:
    """Step 2: Extract major museums."""
    markdown = extract_markdown_content()
    if not markdown:
        return []

    institutions = []

    # (display name, start marker, end marker) for each museum section.
    museum_sections = [
        ("Egyptian Museum Cairo", "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum", "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization", "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo", "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum", "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria", "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]

    for display_name, start_marker, end_marker in museum_sections:
        print(f" Extracting {display_name}...")
        # Best-effort: one bad section must not abort the remaining museums.
        try:
            section = _find_section(markdown, start_marker, end_marker)
            if section is None:
                print(" ⚠ Could not find section")
                continue
            inst = parse_institution_section(section, InstitutionType.MUSEUM)
            if inst:
                institutions.append(inst)
                print(f" ✓ {inst.name}")
        except Exception as e:
            print(f" ✗ Error: {e}")

    return institutions


def main():
    """Main extraction workflow: run steps 1-2, save YAML, print summary."""
    print("="*60)
    print("Egyptian GLAM Extraction - Steps 1 & 2")
    print("="*60)

    # Step 1: Libraries and Archives
    print("\nSTEP 1: National Libraries and Archives")
    print("-"*60)
    institutions = extract_institutions_step1()
    print(f"\n → Extracted {len(institutions)} institutions")

    # Step 2: Museums
    print("\nSTEP 2: Major Museums")
    print("-"*60)
    museums = extract_institutions_step2()
    institutions.extend(museums)
    print(f"\n → Extracted {len(museums)} museums")

    # Summary
    print("\n" + "="*60)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print("="*60)

    # Save to YAML
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Print institution list
    print("\nExtracted Institutions:")
    for i, inst in enumerate(institutions, 1):
        loc_str = f" - {inst.locations[0].city}" if inst.locations else ""
        print(f" {i}. {inst.name} ({inst.institution_type}){loc_str}")


if __name__ == "__main__":
    main()