#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Step 4

Extracts galleries, cultural centers, and digital platforms from the
Egyptian GLAM conversation and saves them as YAML instance records.
"""
import sys
import re
from collections import Counter
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional

import yaml

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from glam_extractor.models import (
    HeritageCustodian, Location, Identifier, Provenance,
    InstitutionType, DataSource, DataTier
)

# Path to cached markdown content
MARKDOWN_PATH = Path("/tmp/egypt_content.md")

# Conversation that all records in this script are extracted from.
_CONVERSATION_ID = "39e11630-a2af-407c-a365-d485eb8257b0"

# Descriptions longer than this are truncated (with a "..." suffix).
_MAX_DESCRIPTION_LEN = 500


def extract_markdown_content() -> str:
    """Return the full text of the cached markdown file at MARKDOWN_PATH."""
    return MARKDOWN_PATH.read_text(encoding='utf-8')


def create_institution(name: str, inst_type: InstitutionType,
                       description: str, identifiers: List[Identifier],
                       location_city: Optional[str] = None) -> HeritageCustodian:
    """Create a HeritageCustodian instance with standard metadata.

    Args:
        name: Human-readable institution name; also used to derive the
            slug portion of the record's w3id identifier.
        inst_type: Institution category (gallery, research center, ...).
        description: Free-text description; truncated to
            _MAX_DESCRIPTION_LEN characters with a "..." suffix if longer.
        identifiers: External identifiers (e.g. website URLs); may be empty.
        location_city: Optional city name; when given, a Location with
            country "EG" is attached.

    Returns:
        A fully populated HeritageCustodian record with Tier-4 provenance.
    """
    # Every record from this script shares the same provenance: inferred
    # (Tier 4) NLP extraction from a single conversation.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id=_CONVERSATION_ID
    )

    # Derive a URL-safe slug from the name for the stable w3id identifier.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Attach a city-level location only when a city was provided.
    locations = []
    if location_city:
        locations.append(Location(
            city=location_city,
            country="EG"
        ))

    # Cap the description length so records stay compact.
    if len(description) > _MAX_DESCRIPTION_LEN:
        description = description[:_MAX_DESCRIPTION_LEN] + "..."

    return HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=inst_type,
        description=description,
        locations=locations,
        identifiers=identifiers or [],
        provenance=provenance
    )


def _build_group(entries: List[dict],
                 inst_type: InstitutionType) -> List[HeritageCustodian]:
    """Build one HeritageCustodian per entry dict, printing each as it is made.

    Each entry must have 'name', 'url', 'description', and 'city' keys;
    the URL becomes a "Website" identifier on the record.
    """
    built = []
    for entry in entries:
        identifiers = [Identifier(
            identifier_scheme="Website",
            identifier_value=entry['url'],
            identifier_url=entry['url']
        )]
        inst = create_institution(
            name=entry['name'],
            inst_type=inst_type,
            description=entry['description'],
            identifiers=identifiers,
            location_city=entry['city']
        )
        built.append(inst)
        print(f" ✓ {inst.name}")
    return built


def extract_institutions_step4() -> List[HeritageCustodian]:
    """Step 4: Extract galleries, cultural centers, and digital platforms.

    Returns the combined list of records for all three groups, in the
    order: galleries, cultural centers, digital platforms.
    """
    # Art Galleries (from Part IV)
    print(" Extracting art galleries...")
    galleries = [
        {
            'name': 'Palace of Arts (Cairo Opera House)',
            'url': 'https://www.cairoopera.org/en/',
            'description': 'Features Youth Salon, Contemporary Art Salon exhibitions. Exhibition information online archive.',
            'city': 'Cairo'
        },
        {
            'name': 'Mashrabia Gallery',
            'url': 'http://www.mashrabiagallery.com/',
            'description': 'Established 1990. Monthly exhibition archives, artist database. Artsy profile available.',
            'city': 'Cairo'
        },
        {
            'name': 'SafarKhan Gallery',
            'url': 'https://www.safarkhan.com/',
            'description': 'Since 1968. Comprehensive online catalog, virtual viewing. Services include authenticity certificates, worldwide shipping.',
            'city': 'Cairo'
        },
        {
            'name': 'Contemporary Image Collective (CiC)',
            'url': 'http://www.ciccairo.com/',
            'description': 'Library with Rufoof digital system (4,000 art books with Townhouse Gallery). Focus on visual culture and photography.',
            'city': 'Cairo'
        },
        {
            'name': 'Darb 1718 Contemporary Art Center',
            'url': 'https://darb1718.com/',
            'description': 'Mission: Comprehensive Egyptian art database development. Facilities include exhibition spaces, theater, artist residency.',
            'city': 'Cairo'
        }
    ]
    institutions = _build_group(galleries, InstitutionType.GALLERY)

    # International Cultural Centers (Research Centers with libraries)
    print("\n Extracting cultural centers...")
    centers = [
        {
            'name': 'French Institute of Egypt (IFAO)',
            'url': 'https://www.ifao.egnet.net/',
            'description': 'Digital Archives available at https://atom.ifao.egnet.net/. Library holds 90,000+ volumes in Egyptology, papyrology. BIFAO journal on JSTOR.',
            'city': 'Cairo'
        },
        {
            'name': 'German Archaeological Institute Cairo (DAI)',
            'url': 'https://www.dainst.org/en/departments/cairo',
            'description': 'Digital Resources: iDAI.images photo archives, Arachne database (1 million+ scans), MDAIK journal publications.',
            'city': 'Cairo'
        },
        {
            'name': 'Netherlands-Flemish Institute Cairo (NVIC)',
            'url': 'https://www.universiteitleiden.nl/en/nvic',
            'description': 'Library with 13,000 volumes in Arabic & Islamic studies. Egyptology in the Field study program.',
            'city': 'Cairo'
        }
    ]
    institutions += _build_group(centers, InstitutionType.RESEARCH_CENTER)

    # Digital Platforms (Official Institutions)
    print("\n Extracting digital platforms...")
    platforms = [
        {
            'name': 'Egyptian Knowledge Bank (EKB)',
            'url': 'https://int.ekb.eg/',
            'description': 'Launched November 24, 2015. Free access for 92 million citizens. Partners: Oxford, Reuters, Britannica, Cambridge, Elsevier. 1.8 million visitors on launch day.',
            'city': None
        },
        {
            'name': 'Global Egyptian Museum',
            'url': 'https://www.globalegyptianmuseum.org/',
            'description': 'Scope: 2 million objects from 850 collections. Modes: Basic (1,340 highlights), Advanced (14,975 objects). Collaborative international platform.',
            'city': None
        }
    ]
    institutions += _build_group(platforms, InstitutionType.OFFICIAL_INSTITUTION)

    return institutions


def main():
    """Main extraction workflow: extract, summarize, and save to YAML."""
    print("="*60)
    print("Egyptian GLAM Extraction - Step 4: Galleries & Centers")
    print("="*60)

    # Step 4: Galleries, Cultural Centers, Digital Platforms
    print("\nSTEP 4: Galleries, Cultural Centers, Digital Platforms")
    print("-"*60)
    institutions = extract_institutions_step4()
    print(f"\n → Extracted {len(institutions)} institutions")

    # Summary by type
    type_counts = Counter(inst.institution_type for inst in institutions)
    print("\nBy Type:")
    for inst_type, count in type_counts.items():
        print(f" - {inst_type}: {count}")

    # Summary
    print("\n" + "="*60)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print("="*60)

    # Save to YAML under data/instances/ relative to the project root.
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step4.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    with open(output_path, 'w', encoding='utf-8') as f:
        # mode='json' ensures datetimes/enums serialize to plain scalars.
        records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Print institution list
    print("\nExtracted Institutions:")
    for i, inst in enumerate(institutions, 1):
        print(f" {i}. {inst.name} ({inst.institution_type})")


if __name__ == "__main__":
    main()