- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
252 lines
8.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Egyptian GLAM Institution Extraction - Step 4
|
|
Extracts galleries, cultural centers, and digital platforms from Egyptian GLAM conversation.
|
|
"""
|
|
|
|
import sys
|
|
import re
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Optional
|
|
import yaml
|
|
|
|
# Add parent directory to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from glam_extractor.models import (
|
|
HeritageCustodian,
|
|
Location,
|
|
Identifier,
|
|
Provenance,
|
|
InstitutionType,
|
|
DataSource,
|
|
DataTier
|
|
)
|
|
|
|
# Path to cached markdown content
|
|
MARKDOWN_PATH = Path("/tmp/egypt_content.md")
|
|
|
|
def extract_markdown_content() -> str:
    """Return the cached Egyptian GLAM conversation content as markdown.

    Reads the whole cache file at ``MARKDOWN_PATH`` as UTF-8 text.

    Returns:
        The cached markdown document as a single string.

    Raises:
        FileNotFoundError: if the cache file has not been created yet.
    """
    # pathlib's read_text handles open/read/close and encoding in one call.
    return MARKDOWN_PATH.read_text(encoding='utf-8')
|
|
|
|
def create_institution(name: str, inst_type: InstitutionType, description: str,
                       identifiers: List[Identifier], location_city: Optional[str] = None) -> HeritageCustodian:
    """Create a HeritageCustodian instance with standard metadata.

    Args:
        name: Human-readable institution name; also used to derive the ID slug.
        inst_type: Institution classification (gallery, research center, ...).
        description: Free-text description; truncated to 500 characters with an
            ellipsis suffix when longer.
        identifiers: External identifiers (e.g. the institution website).
        location_city: Optional city name; when given, a Location with country
            "EG" is attached (all records in this script are Egyptian).

    Returns:
        A fully populated HeritageCustodian record with provenance metadata.
    """
    # Provenance is identical for every record produced by this extraction step.
    provenance = Provenance(
        data_source=DataSource.CONVERSATION_NLP,
        data_tier=DataTier.TIER_4_INFERRED,
        extraction_date=datetime.now(timezone.utc),
        extraction_method="Python NLP extraction from Egyptian GLAM conversation",
        confidence_score=0.85,
        conversation_id="39e11630-a2af-407c-a365-d485eb8257b0"
    )

    # Derive a stable URI from the name: lowercase, collapse non-alphanumerics
    # to '-' and trim dangling dashes.
    name_slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')
    institution_id = f"https://w3id.org/heritage/custodian/eg/{name_slug}"

    # Optional location; country is fixed to Egypt for this dataset.
    locations = [Location(city=location_city, country="EG")] if location_city else []

    # Keep descriptions compact for downstream exports.
    summary = (description[:500] + "...") if len(description) > 500 else description

    # Construct the record directly; the previous kwargs-dict indirection and
    # the `x if x else []` guards (always no-ops on lists) were removed.
    return HeritageCustodian(
        id=institution_id,
        name=name,
        institution_type=inst_type,
        description=summary,
        locations=locations,
        identifiers=identifiers,
        provenance=provenance
    )
|
|
|
|
def _records_from_entries(entries: List[dict], inst_type: InstitutionType) -> List[HeritageCustodian]:
    """Build HeritageCustodian records from raw entry dicts.

    Each entry dict provides 'name', 'url', 'description' and 'city' (city may
    be None).  The entry URL becomes a "Website" identifier.  Prints a
    checkmark line per record, matching the script's progress output.
    """
    records = []
    for entry in entries:
        identifiers = [Identifier(
            identifier_scheme="Website",
            identifier_value=entry['url'],
            identifier_url=entry['url']
        )]

        inst = create_institution(
            name=entry['name'],
            inst_type=inst_type,
            description=entry['description'],
            identifiers=identifiers,
            location_city=entry['city']
        )
        records.append(inst)
        print(f"      ✓ {inst.name}")
    return records


def extract_institutions_step4() -> List[HeritageCustodian]:
    """Step 4: Extract galleries, cultural centers, and digital platforms.

    The three categories below differ only in their InstitutionType and raw
    data, so record construction is shared via _records_from_entries.
    """
    institutions = []

    # Art Galleries (from Part IV)
    print("  Extracting art galleries...")

    galleries = [
        {
            'name': 'Palace of Arts (Cairo Opera House)',
            'url': 'https://www.cairoopera.org/en/',
            'description': 'Features Youth Salon, Contemporary Art Salon exhibitions. Exhibition information online archive.',
            'city': 'Cairo'
        },
        {
            'name': 'Mashrabia Gallery',
            'url': 'http://www.mashrabiagallery.com/',
            'description': 'Established 1990. Monthly exhibition archives, artist database. Artsy profile available.',
            'city': 'Cairo'
        },
        {
            'name': 'SafarKhan Gallery',
            'url': 'https://www.safarkhan.com/',
            'description': 'Since 1968. Comprehensive online catalog, virtual viewing. Services include authenticity certificates, worldwide shipping.',
            'city': 'Cairo'
        },
        {
            'name': 'Contemporary Image Collective (CiC)',
            'url': 'http://www.ciccairo.com/',
            'description': 'Library with Rufoof digital system (4,000 art books with Townhouse Gallery). Focus on visual culture and photography.',
            'city': 'Cairo'
        },
        {
            'name': 'Darb 1718 Contemporary Art Center',
            'url': 'https://darb1718.com/',
            'description': 'Mission: Comprehensive Egyptian art database development. Facilities include exhibition spaces, theater, artist residency.',
            'city': 'Cairo'
        }
    ]
    institutions.extend(_records_from_entries(galleries, InstitutionType.GALLERY))

    # International Cultural Centers (Research Centers with libraries)
    print("\n  Extracting cultural centers...")

    centers = [
        {
            'name': 'French Institute of Egypt (IFAO)',
            'url': 'https://www.ifao.egnet.net/',
            'description': 'Digital Archives available at https://atom.ifao.egnet.net/. Library holds 90,000+ volumes in Egyptology, papyrology. BIFAO journal on JSTOR.',
            'city': 'Cairo'
        },
        {
            'name': 'German Archaeological Institute Cairo (DAI)',
            'url': 'https://www.dainst.org/en/departments/cairo',
            'description': 'Digital Resources: iDAI.images photo archives, Arachne database (1 million+ scans), MDAIK journal publications.',
            'city': 'Cairo'
        },
        {
            'name': 'Netherlands-Flemish Institute Cairo (NVIC)',
            'url': 'https://www.universiteitleiden.nl/en/nvic',
            'description': 'Library with 13,000 volumes in Arabic & Islamic studies. Egyptology in the Field study program.',
            'city': 'Cairo'
        }
    ]
    institutions.extend(_records_from_entries(centers, InstitutionType.RESEARCH_CENTER))

    # Digital Platforms (Official Institutions)
    print("\n  Extracting digital platforms...")

    platforms = [
        {
            'name': 'Egyptian Knowledge Bank (EKB)',
            'url': 'https://int.ekb.eg/',
            'description': 'Launched November 24, 2015. Free access for 92 million citizens. Partners: Oxford, Reuters, Britannica, Cambridge, Elsevier. 1.8 million visitors on launch day.',
            'city': None
        },
        {
            'name': 'Global Egyptian Museum',
            'url': 'https://www.globalegyptianmuseum.org/',
            'description': 'Scope: 2 million objects from 850 collections. Modes: Basic (1,340 highlights), Advanced (14,975 objects). Collaborative international platform.',
            'city': None
        }
    ]
    institutions.extend(_records_from_entries(platforms, InstitutionType.OFFICIAL_INSTITUTION))

    return institutions
|
|
|
|
def main():
    """Main extraction workflow: run step 4 and save results to YAML."""
    banner = "=" * 60
    print(banner)
    print("Egyptian GLAM Extraction - Step 4: Galleries & Centers")
    print(banner)

    # Step 4: Galleries, Cultural Centers, Digital Platforms
    print("\nSTEP 4: Galleries, Cultural Centers, Digital Platforms")
    print("-" * 60)
    institutions = extract_institutions_step4()
    print(f"\n  → Extracted {len(institutions)} institutions")

    # Tally how many institutions of each type were extracted.
    from collections import Counter
    type_counts = Counter(record.institution_type for record in institutions)
    print("\nBy Type:")
    for kind, total in type_counts.items():
        print(f"  - {kind}: {total}")

    # Overall summary.
    print("\n" + banner)
    print(f"TOTAL: {len(institutions)} institutions extracted")
    print(banner)

    # Persist the records as YAML under data/instances/.
    output_path = Path(__file__).parent.parent / "data" / "instances" / "egypt_step4.yaml"
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nSaving to: {output_path}")

    # Serialize via the models' JSON-mode dump so YAML gets plain scalars.
    records = [record.model_dump(exclude_none=True, mode='json') for record in institutions]
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(records, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved to {output_path.name}")

    # Final numbered listing of everything extracted in this step.
    print("\nExtracted Institutions:")
    for idx, record in enumerate(institutions, 1):
        print(f"  {idx}. {record.name} ({record.institution_type})")


if __name__ == "__main__":
    main()
|