#!/usr/bin/env python3
"""
Extract Egyptian heritage institutions from conversation JSON file.

This script reads the Egyptian GLAM inventory conversation and extracts
heritage custodian records in LinkML-compliant format.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional

import yaml

# Import models
import sys
sys.path.insert(0, str(Path(__file__).parent.parent / 'src'))

from glam_extractor.models import (
    HeritageCustodian, Location, Identifier, DigitalPlatform, Provenance,
    DataSource, DataTier, InstitutionType, DigitalPlatformType
)

# Conversation file path.
# NOTE(review): hard-coded absolute path tied to one machine — consider
# accepting this as a CLI argument or environment variable.
CONVERSATION_PATH = Path("/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-50-31-39e11630-a2af-407c-a365-d485eb8257b0-Egyptian_GLAM_resources_inventory.json")

# Cities recognized directly in address strings (checked in this order);
# anything else falls back to the last comma-separated address component.
_KNOWN_CITIES = ('Cairo', 'Alexandria', 'Aswan', 'Luxor')


def extract_markdown_content(json_path: Path) -> str:
    """Extract the markdown artifact content from a conversation JSON file.

    Scans every chat message for an ``artifacts`` tool-use block whose
    content mentions 'Egyptian GLAM'.

    Args:
        json_path: Path to the conversation JSON file.

    Returns:
        The markdown text of the matching artifact, or '' if none is found.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for msg in data['chat_messages']:
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                markdown = content.get('input', {}).get('content', '')
                if markdown and 'Egyptian GLAM' in markdown:
                    return markdown
    return ""


def _slice_section(markdown: str, start_marker: str, end_marker: str) -> str:
    """Return the substring of *markdown* between two markers, or '' if
    either marker is missing.

    Guards against ``str.find`` returning -1: slicing with a raw -1 index
    (as the original code did in step 1) silently yields a wrong section
    like ``markdown[-1:idx]`` instead of skipping the institution.
    """
    start_idx = markdown.find(start_marker)
    end_idx = markdown.find(end_marker)
    if start_idx == -1 or end_idx == -1:
        return ""
    return markdown[start_idx:end_idx]


def _extract_city(address_text: str) -> str:
    """Best-effort city extraction from a free-text address.

    Prefers a known Egyptian city name appearing anywhere in the address;
    otherwise takes the last comma-separated component, defaulting to
    'Cairo' when the address has no commas.
    """
    for city in _KNOWN_CITIES:
        if city in address_text:
            return city
    parts = address_text.split(',')
    return parts[-1].strip() if len(parts) > 1 else 'Cairo'


def parse_institution_section(text: str, institution_type: InstitutionType) -> Optional[HeritageCustodian]:
    """Parse a single institution section from markdown text.

    Args:
        text: Markdown text for one institution.
        institution_type: Type of institution (LIBRARY, MUSEUM, etc.)

    Returns:
        HeritageCustodian object, or None if no institution heading is found.
    """
    # Extract institution name (first bold heading, e.g. "### **Name**")
    name_match = re.search(r'###?\s+\*\*(.+?)\*\*', text)
    if not name_match:
        return None
    name = name_match.group(1).strip()

    # Extract Arabic (or other alternative) name if present — a bold span
    # followed by a parenthesized gloss that differs from the primary name.
    arabic_match = re.search(r'\*\*(.+?)\*\*\s+\((.+?)\)', text)
    alternative_names = []
    if arabic_match and arabic_match.group(1) != name:
        alternative_names.append(arabic_match.group(1))

    # Extract location information from an "**Address**:" / "**Location**:" line.
    location = None
    city_match = re.search(r'\*\*(?:Address|Location)\*\*:\s*(.+?)(?:\n|$)', text)
    if city_match:
        address_text = city_match.group(1).strip()
        location = Location(
            city=_extract_city(address_text),
            country="EG",
            street_address=address_text
        )

    # Extract website URL from a "**Website**:" line.
    identifiers = []
    website_match = re.search(r'\*\*Website\*\*:\s+(https?://[^\s\)]+)', text)
    if website_match:
        url = website_match.group(1).strip()
        identifiers.append(Identifier(
            identifier_scheme="Website",
            identifier_value=url,
            identifier_url=url
        ))

    # Build a description from the Collections / Digital Infrastructure
    # section, falling back to a generic placeholder.
    description_parts = []
    collection_match = re.search(
        r'\*\*(?:Collections?|Digital Infrastructure)\*\*:(.+?)(?=\n\*\*|\Z)',
        text, re.DOTALL
    )
    if collection_match:
        description_parts.append(collection_match.group(1).strip())

    # Fixed F541: this was an f-string with no placeholders.
    description = ' '.join(description_parts) if description_parts else "Heritage institution in Egypt."
    # Truncate to reasonable length
    description = description[:500] + "..." if len(description) > 500 else description

    # Create provenance metadata for this extraction run.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Generate a stable slug-based ID from the institution name.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    return HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=institution_type,
        alternative_names=alternative_names if alternative_names else None,
        description=description,
        locations=[location] if location else [],
        identifiers=identifiers if identifiers else [],
        provenance=provenance
    )


def extract_institutions_step1() -> List[HeritageCustodian]:
    """Step 1: Extract major national institutions (libraries and archives).

    Returns:
        List of HeritageCustodian objects (possibly empty on failure).
    """
    print("Step 1: Extracting markdown content...")
    markdown = extract_markdown_content(CONVERSATION_PATH)

    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []

    print(f"  Found {len(markdown)} characters of content")

    institutions = []

    # NOTE(review): the printed "Step 2/3/4" labels below are the original
    # script's sub-step numbering, kept as-is for output compatibility.

    # Extract Bibliotheca Alexandrina. _slice_section returns '' when a
    # marker is missing (the original raw find() slice mishandled -1).
    print("\nStep 2: Extracting Bibliotheca Alexandrina...")
    bibalex_section = _slice_section(
        markdown, "### **Bibliotheca Alexandrina**", "### **Egyptian National Library")
    if bibalex_section:
        inst = parse_institution_section(bibalex_section, InstitutionType.LIBRARY)
        if inst:
            institutions.append(inst)
            print(f"  ✓ Extracted: {inst.name}")

    # Extract Egyptian National Library
    print("\nStep 3: Extracting Egyptian National Library...")
    enl_section = _slice_section(
        markdown, "### **Egyptian National Library", "### **National Archives")
    if enl_section:
        inst = parse_institution_section(enl_section, InstitutionType.LIBRARY)
        if inst:
            institutions.append(inst)
            print(f"  ✓ Extracted: {inst.name}")

    # Extract National Archives
    print("\nStep 4: Extracting National Archives...")
    archives_section = _slice_section(
        markdown, "### **National Archives", "# Part II")
    if archives_section:
        inst = parse_institution_section(archives_section, InstitutionType.ARCHIVE)
        if inst:
            institutions.append(inst)
            print(f"  ✓ Extracted: {inst.name}")

    return institutions


def extract_institutions_step2() -> List[HeritageCustodian]:
    """Step 2: Extract major museums.

    Returns:
        List of HeritageCustodian objects (possibly empty on failure).
    """
    print("Step 2: Extracting museums...")
    markdown = extract_markdown_content(CONVERSATION_PATH)

    if not markdown:
        print("ERROR: Could not extract markdown content from conversation")
        return []

    print(f"  Found {len(markdown)} characters of content")

    institutions = []

    # (display name, start marker, end marker) for each museum section.
    museum_sections = [
        ("Egyptian Museum Cairo (EMC)",
         "### **Egyptian Museum Cairo", "### **Grand Egyptian Museum"),
        ("Grand Egyptian Museum (GEM)",
         "### **Grand Egyptian Museum", "### **National Museum of Egyptian Civilization"),
        ("National Museum of Egyptian Civilization (NMEC)",
         "### **National Museum of Egyptian Civilization", "### **Regional Archaeological Museums"),
        ("Museum of Islamic Art Cairo",
         "### **Museum of Islamic Art Cairo**", "### **Coptic Museum"),
        ("Coptic Museum",
         "### **Coptic Museum**", "### **Greco-Roman Museum"),
        ("Greco-Roman Museum Alexandria",
         "### **Greco-Roman Museum Alexandria**", "### **Art Museums"),
    ]

    for name, start_marker, end_marker in museum_sections:
        print(f"\n  Extracting {name}...")
        try:
            section = _slice_section(markdown, start_marker, end_marker)
            if not section:
                print(f"    ⚠ Could not find section markers for {name}")
                continue
            inst = parse_institution_section(section, InstitutionType.MUSEUM)
            if inst:
                institutions.append(inst)
                print(f"    ✓ Extracted: {inst.name}")
        except Exception as e:
            # Best-effort: report and continue with the remaining museums.
            print(f"    ✗ Error extracting {name}: {e}")

    return institutions


def main():
    """Main extraction workflow - Steps 1 and 2.

    Extracts national libraries/archives and museums, saves the combined
    records to a YAML instance file, and prints a summary.
    """
    print("="*60)
    print("Egyptian GLAM Institution Extraction - STEPS 1 & 2")
    print("="*60)

    # Extract first batch (national libraries and archives)
    print("\n" + "="*60)
    print("STEP 1: National Libraries and Archives")
    print("="*60)
    institutions = extract_institutions_step1()
    print(f"\nStep 1 Complete: Extracted {len(institutions)} institutions")

    # Extract second batch (museums)
    print("\n" + "="*60)
    print("STEP 2: Museums")
    print("="*60)
    museums = extract_institutions_step2()
    institutions.extend(museums)
    print(f"\nStep 2 Complete: Extracted {len(museums)} museums")

    print(f"\n{'='*60}")
    print(f"Total Extracted: {len(institutions)} institutions")
    print(f"{'='*60}")

    # Save to YAML file
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step1_2.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        # Convert to plain dicts for YAML serialization (drop None fields).
        records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved {len(institutions)} institutions to {output_path.name}")

    # Print summary
    print("\nInstitutions extracted:")
    for i, inst in enumerate(institutions, 1):
        print(f"  {i}. {inst.name} ({inst.institution_type})")
        if inst.locations:
            print(f"     Location: {inst.locations[0].city}")
        if inst.identifiers:
            print(f"     Website: {inst.identifiers[0].identifier_url}")


if __name__ == "__main__":
    main()