glam/scripts/extract_egypt_step4.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

252 lines
8.9 KiB
Python

#!/usr/bin/env python3
"""
Egyptian GLAM Institution Extraction - Step 4
Extracts galleries, cultural centers, and digital platforms from Egyptian GLAM conversation.
"""
import sys
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional
import yaml
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from glam_extractor.models import (
HeritageCustodian,
Location,
Identifier,
Provenance,
InstitutionType,
DataSource,
DataTier
)
# Path to cached markdown content.
# Hard-coded absolute location; extract_markdown_content() reads this file as UTF-8.
MARKDOWN_PATH = Path("/tmp/egypt_content.md")
def extract_markdown_content() -> str:
    """Return the entire cached markdown file as a single string.

    The file at ``MARKDOWN_PATH`` is decoded as UTF-8. Any error raised while
    opening or reading it (e.g. ``FileNotFoundError``) propagates unchanged.
    """
    return MARKDOWN_PATH.read_text(encoding='utf-8')
def create_institution(name: str, inst_type: InstitutionType, description: str,
                       identifiers: List[Identifier], location_city: Optional[str] = None) -> HeritageCustodian:
    """Build a HeritageCustodian record with the standard Step 4 metadata.

    Args:
        name: Display name; also the source of the record id slug.
        inst_type: Institution type stored on the record.
        description: Free text; anything beyond 500 characters is cut off and
            replaced by a trailing ``...``.
        identifiers: Identifiers to attach; a falsy value becomes ``[]``.
        location_city: When truthy, a single Location with this city and
            country ``"EG"`` is attached; otherwise the record has no locations.

    Returns:
        The constructed HeritageCustodian. Its id is
        ``https://w3id.org/heritage/custodian/eg/<slug>``, and its provenance
        uses fixed values plus the current UTC time as extraction date.
    """
    # Identical for every record except for the timestamp taken here.
    record_provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0",
    )

    # Slug: lower-case the name, collapse every run of characters outside
    # a-z0-9 into one hyphen, then drop leading/trailing hyphens.
    # NOTE(review): non-ASCII characters are discarded by this pattern, so a
    # name with no ASCII letters or digits yields an empty slug.
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')

    places = [Location(city=location_city, country="EG")] if location_city else []

    if len(description) > 500:
        summary = description[:500] + "..."
    else:
        summary = description

    return HeritageCustodian(
        id=f"https://w3id.org/heritage/custodian/eg/{slug}",
        name=name,
        institution_type=inst_type,
        description=summary,
        locations=places,
        identifiers=identifiers or [],
        provenance=record_provenance,
    )
def _website_institution(entry: dict, inst_type: InstitutionType) -> HeritageCustodian:
    """Convert one curated entry dict into a record identified by its website URL.

    ``entry`` must provide the keys ``name``, ``url``, ``description`` and
    ``city`` (``city`` may be ``None``).
    """
    website = Identifier(
        identifier_scheme="Website",
        identifier_value=entry['url'],
        identifier_url=entry['url'],
    )
    return create_institution(
        name=entry['name'],
        inst_type=inst_type,
        description=entry['description'],
        identifiers=[website],
        location_city=entry['city'],
    )


def extract_institutions_step4() -> List[HeritageCustodian]:
    """Step 4: Extract galleries, cultural centers, and digital platforms.

    The institutions are hand-curated literals in this function, not parsed
    from a file. Each group is announced on stdout, then every record's name
    is printed as it is created.

    Returns:
        All records in order: galleries, then cultural centers, then
        digital platforms.
    """
    # Art Galleries (from Part IV)
    galleries = [
        {
            'name': 'Palace of Arts (Cairo Opera House)',
            'url': 'https://www.cairoopera.org/en/',
            'description': 'Features Youth Salon, Contemporary Art Salon exhibitions. Exhibition information online archive.',
            'city': 'Cairo'
        },
        {
            'name': 'Mashrabia Gallery',
            'url': 'http://www.mashrabiagallery.com/',
            'description': 'Established 1990. Monthly exhibition archives, artist database. Artsy profile available.',
            'city': 'Cairo'
        },
        {
            'name': 'SafarKhan Gallery',
            'url': 'https://www.safarkhan.com/',
            'description': 'Since 1968. Comprehensive online catalog, virtual viewing. Services include authenticity certificates, worldwide shipping.',
            'city': 'Cairo'
        },
        {
            'name': 'Contemporary Image Collective (CiC)',
            'url': 'http://www.ciccairo.com/',
            'description': 'Library with Rufoof digital system (4,000 art books with Townhouse Gallery). Focus on visual culture and photography.',
            'city': 'Cairo'
        },
        {
            'name': 'Darb 1718 Contemporary Art Center',
            'url': 'https://darb1718.com/',
            'description': 'Mission: Comprehensive Egyptian art database development. Facilities include exhibition spaces, theater, artist residency.',
            'city': 'Cairo'
        }
    ]

    # International Cultural Centers (Research Centers with libraries)
    centers = [
        {
            'name': 'French Institute of Egypt (IFAO)',
            'url': 'https://www.ifao.egnet.net/',
            'description': 'Digital Archives available at https://atom.ifao.egnet.net/. Library holds 90,000+ volumes in Egyptology, papyrology. BIFAO journal on JSTOR.',
            'city': 'Cairo'
        },
        {
            'name': 'German Archaeological Institute Cairo (DAI)',
            'url': 'https://www.dainst.org/en/departments/cairo',
            'description': 'Digital Resources: iDAI.images photo archives, Arachne database (1 million+ scans), MDAIK journal publications.',
            'city': 'Cairo'
        },
        {
            'name': 'Netherlands-Flemish Institute Cairo (NVIC)',
            'url': 'https://www.universiteitleiden.nl/en/nvic',
            'description': 'Library with 13,000 volumes in Arabic & Islamic studies. Egyptology in the Field study program.',
            'city': 'Cairo'
        }
    ]

    # Digital Platforms (Official Institutions); these have no city.
    platforms = [
        {
            'name': 'Egyptian Knowledge Bank (EKB)',
            'url': 'https://int.ekb.eg/',
            'description': 'Launched November 24, 2015. Free access for 92 million citizens. Partners: Oxford, Reuters, Britannica, Cambridge, Elsevier. 1.8 million visitors on launch day.',
            'city': None
        },
        {
            'name': 'Global Egyptian Museum',
            'url': 'https://www.globalegyptianmuseum.org/',
            'description': 'Scope: 2 million objects from 850 collections. Modes: Basic (1,340 highlights), Advanced (14,975 objects). Collaborative international platform.',
            'city': None
        }
    ]

    # One row per group: progress banner, type assigned to every entry, entries.
    # A single loop over this table replaces three copies of the same loop.
    groups = [
        (" Extracting art galleries...", InstitutionType.GALLERY, galleries),
        ("\n Extracting cultural centers...", InstitutionType.RESEARCH_CENTER, centers),
        ("\n Extracting digital platforms...", InstitutionType.OFFICIAL_INSTITUTION, platforms),
    ]

    institutions = []
    for banner, inst_type, entries in groups:
        print(banner)
        for entry in entries:
            inst = _website_institution(entry, inst_type)
            institutions.append(inst)
            print(f"{inst.name}")
    return institutions
def main():
    """Main extraction workflow.

    Runs the Step 4 extraction, prints a per-type summary and the total,
    writes all records as a YAML list to ``data/instances/egypt_step4.yaml``
    under the parent of this script's directory (creating directories as
    needed), and finally prints a numbered list of the institutions.
    """
    from collections import Counter

    print("="*60)
    print("Egyptian GLAM Extraction - Step 4: Galleries & Centers")
    print("="*60)

    # Step 4: Galleries, Cultural Centers, Digital Platforms
    print("\nSTEP 4: Galleries, Cultural Centers, Digital Platforms")
    print("-"*60)
    institutions = extract_institutions_step4()
    print(f"\n → Extracted {len(institutions)} institutions")

    # Summary by type
    type_counts = Counter(inst.institution_type for inst in institutions)
    print("\nBy Type:")
    for inst_type, count in type_counts.items():
        print(f" - {inst_type}: {count}")

    # Summary
    print("\n" + "="*60)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print("="*60)

    # Save to YAML
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step4.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nSaving to: {output_path}")
    # Serialize BEFORE opening the file: mode 'w' truncates on open, so a
    # failure inside model_dump() would otherwise wipe a previously good
    # output file and leave it empty.
    records = [inst.model_dump(exclude_none=True, mode='json') for inst in institutions]
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)
    print(f"✓ Saved to {output_path.name}")

    # Print institution list
    print("\nExtracted Institutions:")
    for i, inst in enumerate(institutions, 1):
        print(f" {i}. {inst.name} ({inst.institution_type})")
# Run the workflow only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()