- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
683 lines
28 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Batch extract partnerships from all GLAM conversation files.
|
|
|
|
This script:
|
|
1. Discovers all GLAM-related conversation JSON files
|
|
2. Extracts partnerships using ConversationParser
|
|
3. Deduplicates partners globally
|
|
4. Generates statistics and network data
|
|
5. Exports unified RDF graph with all partnerships
|
|
|
|
Usage:
|
|
python scripts/batch_extract_partnerships.py
|
|
|
|
Outputs:
|
|
- data/exports/global_glam_partnerships.ttl (RDF/Turtle)
|
|
- data/exports/partnership_statistics.json (summary statistics)
|
|
- data/exports/partner_network.json (network graph data)
|
|
- logs/partnership_extraction.log (processing log)
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from collections import Counter, defaultdict
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any, Set, Tuple
|
|
import sys
|
|
|
|
# Add src to path for imports
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from glam_extractor.parsers.conversation import ConversationParser
|
|
from glam_extractor.exporters.rdf_exporter import RDFExporter
|
|
from glam_extractor.models import (
|
|
HeritageCustodian, Partnership, Provenance, InstitutionType,
|
|
DataSource, DataTier, OrganizationStatus
|
|
)
|
|
|
|
|
|
# Configure logging
# Logs live at the repository root ("<repo>/logs"); the directory is
# created eagerly so the FileHandler below can open its file.
LOG_DIR = Path(__file__).parent.parent / "logs"
LOG_DIR.mkdir(exist_ok=True)

# Log to both a persistent file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_DIR / "partnership_extraction.log"),
        logging.StreamHandler()
    ]
)
# Module-level logger used by the extractor class and main().
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PartnershipBatchExtractor:
    """Batch processor for partnership extraction from conversation files.

    Pipeline: discover GLAM-related conversation JSON files, extract
    partnerships from each via ConversationParser, aggregate and
    deduplicate partners globally, then export summary statistics (JSON),
    a partner network graph (JSON), and a unified RDF graph (Turtle).
    """

    # Country/region lookup: lowercase filename fragment (adjective or
    # country name) -> canonical country/region label.
    #
    # Hoisted to a class constant so the table is built ONCE at
    # class-definition time instead of on every call to
    # extract_country_from_filename(). The duplicate 'burmese' entry from
    # the original table (it appeared twice, both mapping to 'Myanmar')
    # has been dropped; insertion order is otherwise preserved so the
    # longest-first tie-breaking below behaves exactly as before.
    _COUNTRY_MAPPING: Dict[str, str] = {
        # Americas
        'brazilian': 'Brazil', 'brazil': 'Brazil',
        'mexican': 'Mexico', 'mexico': 'Mexico',
        'canadian': 'Canada', 'canada': 'Canada',
        'chilean': 'Chile', 'chile': 'Chile',
        'argentine': 'Argentina', 'argentina': 'Argentina',
        'colombian': 'Colombia', 'colombia': 'Colombia',
        'peruvian': 'Peru', 'peru': 'Peru',
        'cuban': 'Cuba', 'cuba': 'Cuba',
        'panamanian': 'Panama', 'panama': 'Panama',
        'nicaraguan': 'Nicaragua', 'nicaragua': 'Nicaragua',
        'suriname': 'Suriname', 'surinamese': 'Suriname',
        'paraguayan': 'Paraguay', 'paraguay': 'Paraguay',
        'honduran': 'Honduras', 'honduras': 'Honduras',
        'united_states': 'United States',
        'mapping_glam_resources_in_the_united': 'United States',

        # Europe
        'dutch': 'Netherlands', 'netherlands': 'Netherlands',
        'zeeland': 'Netherlands (Zeeland)',
        'limburg': 'Netherlands (Limburg)',
        'gelderland': 'Netherlands (Gelderland)',
        'drenthe': 'Netherlands (Drenthe)',
        'groningen': 'Netherlands (Groningen)',
        'turkish': 'Turkey', 'turkey': 'Turkey',
        'polish': 'Poland', 'poland': 'Poland',
        'hungarian': 'Hungary', 'hungary': 'Hungary',
        'norwegian': 'Norway', 'norway': 'Norway',
        'portuguese': 'Portugal', 'portugal': 'Portugal',
        'belgian': 'Belgium', 'belgium': 'Belgium',
        'swedish': 'Sweden', 'sweden': 'Sweden',
        'estonian': 'Estonia', 'estonia': 'Estonia',
        'croatian': 'Croatia', 'croatia': 'Croatia',
        'swiss': 'Switzerland', 'switzerland': 'Switzerland',
        'moldavian': 'Moldova', 'moldova': 'Moldova',
        'bulgarian': 'Bulgaria', 'bulgaria': 'Bulgaria',
        'romanian': 'Romania', 'romania': 'Romania',
        'albanian': 'Albania', 'albania': 'Albania',
        'bosnian': 'Bosnia and Herzegovina', 'bosnia': 'Bosnia and Herzegovina',
        'danish': 'Denmark', 'denmark': 'Denmark',
        'austrian': 'Austria', 'austria': 'Austria',
        'slovakian': 'Slovakia', 'slovakia': 'Slovakia',
        'latvian': 'Latvia', 'latvia': 'Latvia',

        # Asia
        'vietnamese': 'Vietnam', 'vietnam': 'Vietnam',
        'japanese': 'Japan', 'japan': 'Japan',
        'thai': 'Thailand', 'thailand': 'Thailand',
        'taiwan': 'Taiwan', 'taiwanese': 'Taiwan',
        'korean': 'South Korea', 'south_korea': 'South Korea',
        'malaysian': 'Malaysia', 'malaysia': 'Malaysia',
        'pakistani': 'Pakistan', 'pakistan': 'Pakistan',
        'iranian': 'Iran', 'iran': 'Iran',
        'uzbekistan': 'Uzbekistan', 'uzbek': 'Uzbekistan',
        'armenian': 'Armenia', 'armenia': 'Armenia',
        'azerbaijan': 'Azerbaijan', 'azerbaijani': 'Azerbaijan',
        'georgian': 'Georgia', 'georgia': 'Georgia',
        'nepalese': 'Nepal', 'nepal': 'Nepal',
        'myanmar': 'Myanmar', 'burmese': 'Myanmar',
        'cambodian': 'Cambodia', 'cambodia': 'Cambodia',
        'sri_lankan': 'Sri Lanka', 'sri_lanka': 'Sri Lanka',
        'tajikistan': 'Tajikistan', 'tajik': 'Tajikistan',
        'turkmenistan': 'Turkmenistan', 'turkmen': 'Turkmenistan',
        'philippine': 'Philippines', 'philippines': 'Philippines',
        'indonesian': 'Indonesia', 'indonesia': 'Indonesia',
        'bhutan': 'Bhutan', 'bhutanese': 'Bhutan',

        # Middle East
        'iraqi': 'Iraq', 'iraq': 'Iraq',
        'jordanian': 'Jordan', 'jordan': 'Jordan',
        'egyptian': 'Egypt', 'egypt': 'Egypt',
        'saudi': 'Saudi Arabia', 'saudi_arabia': 'Saudi Arabia',
        'qatari': 'Qatar', 'qatar': 'Qatar',
        'omani': 'Oman', 'oman': 'Oman',
        'emirati': 'United Arab Emirates', 'uae': 'United Arab Emirates',
        'kuwaiti': 'Kuwait', 'kuwait': 'Kuwait',
        'lebanese': 'Lebanon', 'lebanon': 'Lebanon',
        'syrian': 'Syria', 'syria': 'Syria',
        'palestinian': 'Palestine', 'palestine': 'Palestine',
        'yemeni': 'Yemen', 'yemen': 'Yemen',

        # Africa
        'algerian': 'Algeria', 'algeria': 'Algeria',
        'moroccan': 'Morocco', 'morocco': 'Morocco',
        'tunisian': 'Tunisia', 'tunisia': 'Tunisia',
        'south_african': 'South Africa', 'south_africa': 'South Africa',
        'namibian': 'Namibia', 'namibia': 'Namibia',
        'ghanaian': 'Ghana', 'ghana': 'Ghana',
        'nigerian': 'Nigeria', 'nigeria': 'Nigeria',
        'somali': 'Somalia', 'somalia': 'Somalia',
        'malian': 'Mali', 'mali': 'Mali',
        'senegalese': 'Senegal', 'senegal': 'Senegal',
        'mauritanian': 'Mauritania', 'mauritania': 'Mauritania',
        'kenyan': 'Kenya', 'kenya': 'Kenya',
        'mozambican': 'Mozambique', 'mozambique': 'Mozambique',
        'eritrean': 'Eritrea', 'eritrea': 'Eritrea',
        'sudanese': 'Sudan', 'sudan': 'Sudan',
        'rwandan': 'Rwanda', 'rwanda': 'Rwanda',
        'zimbabwean': 'Zimbabwe', 'zimbabwe': 'Zimbabwe',
        'congolese': 'Democratic Republic of the Congo',
        'congo': 'Democratic Republic of the Congo',
        'benin': 'Benin', 'beninese': 'Benin',

        # Oceania
        'australian': 'Australia', 'australia': 'Australia',
        'kiribati': 'Kiribati',
        'east_timor': 'Timor-Leste', 'timor': 'Timor-Leste',

        # Russia
        'russian': 'Russia', 'russia': 'Russia',

        # Additional countries from missing mappings
        'libyan': 'Libya', 'libya': 'Libya',
        'greek': 'Greece', 'greece': 'Greece',
        'serbian': 'Serbia', 'serbia': 'Serbia',
        'indian': 'India', 'india': 'India',
        'burma': 'Myanmar',
        'afghan': 'Afghanistan', 'afghanistan': 'Afghanistan',
        'laotian': 'Laos', 'laos': 'Laos',
        'uruguayan': 'Uruguay', 'uruguay': 'Uruguay',
        'finnish': 'Finland', 'finland': 'Finland',
        'israeli': 'Israel', 'israel': 'Israel',
        'cypriot': 'Cyprus', 'cyprus': 'Cyprus',
        'slovak': 'Slovakia',
        'slovenian': 'Slovenia', 'slovenia': 'Slovenia',
        'macedonian': 'North Macedonia', 'north_macedonia': 'North Macedonia',
        'ethiopian': 'Ethiopia', 'ethiopia': 'Ethiopia',
        'malagasy': 'Madagascar', 'madagascar': 'Madagascar',
        'new_zealand': 'New Zealand', 'new_zealander': 'New Zealand',
        'haitian': 'Haiti', 'haiti': 'Haiti',
        'jamaican': 'Jamaica', 'jamaica': 'Jamaica',
        'vatican': 'Vatican City',
        'italian': 'Italy', 'italy': 'Italy',
        'arabic_emirates': 'United Arab Emirates',
        'maldivian': 'Maldives', 'maldives': 'Maldives',
        'burkina': 'Burkina Faso', 'burkina_faso': 'Burkina Faso',
        'togolese': 'Togo', 'togo': 'Togo',
        'liberian': 'Liberia', 'liberia': 'Liberia',

        # Dutch provinces (additional)
        'overijssel': 'Netherlands (Overijssel)',
        'north_brabant': 'Netherlands (North Brabant)', 'brabant': 'Netherlands (North Brabant)',
        'zuid_holland': 'Netherlands (Zuid-Holland)', 'south_holland': 'Netherlands (Zuid-Holland)',
        'noord_holland': 'Netherlands (Noord-Holland)', 'north_holland': 'Netherlands (Noord-Holland)',
        'friesland': 'Netherlands (Friesland)',
        'flevoland': 'Netherlands (Flevoland)',

        # Special cases (broader regions/platforms)
        'archives_du_maroc': 'Morocco',
    }

    # Fragment keys pre-sorted longest-first (computed once) so more
    # specific fragments such as 'south_african' win over shorter ones.
    _COUNTRY_KEYS: List[str] = sorted(_COUNTRY_MAPPING, key=len, reverse=True)

    def __init__(self, conversation_dir: Path):
        """
        Initialize batch extractor.

        Args:
            conversation_dir: Directory containing conversation JSON files
        """
        self.conversation_dir = Path(conversation_dir)
        self.parser = ConversationParser()
        self.exporter = RDFExporter()

        # Aggregate counters updated as files are processed.
        self.stats: Dict[str, Any] = {
            "total_files": 0,
            "processed_files": 0,
            "failed_files": 0,
            "total_partnerships": 0,
            "unique_partners": 0,
            "partnership_types": Counter(),
            "countries": Counter(),
            "errors": []
        }

        # Global partner registry (partner_name -> metadata)
        self.global_partners: Dict[str, Dict[str, Any]] = {}

        # Institution partnerships (institution_name -> list of partnerships)
        self.institution_partnerships: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        # Network graph (edges: institution -> partner)
        self.network_edges: List[Tuple[str, str, str]] = []  # (institution, partner, type)

    def discover_glam_files(self) -> List[Path]:
        """
        Discover all GLAM-related conversation JSON files.

        A file is considered GLAM-related when its (lowercased) name
        contains any of a small set of heritage-domain keywords.

        Returns:
            Sorted list of file paths (sorted for reproducibility)
        """
        logger.info(f"Scanning {self.conversation_dir} for GLAM conversation files...")

        # GLAM-related filename patterns
        glam_patterns = [
            'glam', 'museum', 'library', 'archive', 'heritage',
            'cultural', 'gallery', 'collection', 'catalog'
        ]

        all_json_files = list(self.conversation_dir.glob("*.json"))

        # Filter for GLAM-related files
        glam_files = [
            file_path for file_path in all_json_files
            if any(pattern in file_path.name.lower() for pattern in glam_patterns)
        ]

        logger.info(f"Found {len(glam_files)} GLAM conversation files (out of {len(all_json_files)} total)")

        self.stats["total_files"] = len(glam_files)

        return sorted(glam_files)  # Sort for reproducibility

    def extract_country_from_filename(self, filename: str) -> str:
        """
        Extract country/region name from conversation filename.

        Matches filename fragments against the class-level
        _COUNTRY_MAPPING table, trying longer fragments first.

        Args:
            filename: Conversation filename

        Returns:
            Country name (or "Unknown" if not found)

        Examples:
            - "Brazilian_GLAM_collection_inventories.json" → "Brazil"
            - "Mexican_GLAM_inventories.json" → "Mexico"
            - "Zeeland_GLAM_resources.json" → "Netherlands (Zeeland)"
            - "Turkish_Archives_Digital_Repositories.json" → "Turkey"
        """
        filename_lower = filename.lower()

        # Longest keys first so more specific fragments take priority.
        for key in self._COUNTRY_KEYS:
            if key in filename_lower:
                return self._COUNTRY_MAPPING[key]

        return "Unknown"

    def extract_institution_name_from_conversation(self, conversation_name: str) -> str:
        """
        Generate institution name from conversation title.

        For batch processing, we use conversation name as institution
        identifier since individual institution names aren't always
        explicit.

        Args:
            conversation_name: Conversation title

        Returns:
            Institution name (simplified conversation title)
        """
        # Remove common suffix phrases.
        # NOTE(review): str.replace removes the phrase ANYWHERE in the
        # title, not only at the end — this appears intentional (e.g.
        # " GLAM" often occurs mid-title) and is preserved as-is.
        name = conversation_name
        for suffix in [' conversation', ' inventory', ' resources', ' catalogues',
                       ' and catalogues', ' digital resources', ' GLAM']:
            name = name.replace(suffix, '')

        # Clean up
        name = name.strip()

        # If the result is too short to be meaningful, fall back to the
        # full conversation title.
        if len(name) < 5:
            return conversation_name

        return name

    def process_file(self, file_path: Path) -> Dict[str, Any]:
        """
        Process a single conversation file and extract partnerships.

        Updates all aggregate state (stats, global partner registry,
        institution partnerships, network edges) as a side effect.

        Args:
            file_path: Path to conversation JSON file

        Returns:
            Dictionary with extraction results:
                - success: bool
                - partnerships: list (on success)
                - error: str (if failed)
                - metadata: dict
        """
        logger.info(f"Processing: {file_path.name}")

        try:
            # Parse conversation
            conversation = self.parser.parse_file(file_path)

            # Extract partnerships
            partnerships = self.parser.extract_partnerships(conversation)

            # Extract country from filename
            country = self.extract_country_from_filename(file_path.name)

            # Extract institution name from conversation
            institution_name = self.extract_institution_name_from_conversation(conversation.name)

            # Update statistics (country counted once per file, even when
            # no partnerships were extracted)
            self.stats["processed_files"] += 1
            self.stats["total_partnerships"] += len(partnerships)
            self.stats["countries"][country] += 1

            # Process each partnership
            for partnership in partnerships:
                partner_name = partnership["partner_name"]
                partnership_type = partnership["partnership_type"]

                # Update type counter
                self.stats["partnership_types"][partnership_type] += 1

                # Add to global partner registry (sets deduplicate the
                # per-partner metadata across files/countries)
                if partner_name not in self.global_partners:
                    self.global_partners[partner_name] = {
                        "partner_name": partner_name,
                        "mention_count": 0,
                        "partnership_types": set(),
                        "mentioned_in_countries": set(),
                        "mentioned_in_files": set(),
                    }

                # Update partner metadata
                partner_meta = self.global_partners[partner_name]
                partner_meta["mention_count"] += 1
                partner_meta["partnership_types"].add(partnership_type)
                partner_meta["mentioned_in_countries"].add(country)
                partner_meta["mentioned_in_files"].add(file_path.name)

                # Add to institution partnerships
                self.institution_partnerships[institution_name].append({
                    **partnership,
                    "institution": institution_name,
                    "country": country,
                    "conversation_id": conversation.uuid,
                    "conversation_name": conversation.name,
                })

                # Add to network graph
                self.network_edges.append((institution_name, partner_name, partnership_type))

            logger.info(f"  ✓ Extracted {len(partnerships)} partnerships from {institution_name} ({country})")

            return {
                "success": True,
                "partnerships": partnerships,
                "metadata": {
                    "file_path": str(file_path),
                    "conversation_id": conversation.uuid,
                    "conversation_name": conversation.name,
                    "country": country,
                    "institution_name": institution_name,
                }
            }

        except Exception as e:
            # Deliberate best-effort: a single bad file must not abort the
            # whole batch; the error is logged and recorded in stats.
            logger.error(f"  ✗ Error processing {file_path.name}: {e}")
            self.stats["failed_files"] += 1
            self.stats["errors"].append({
                "file": str(file_path),
                "error": str(e)
            })

            return {
                "success": False,
                "error": str(e),
                "metadata": {
                    "file_path": str(file_path)
                }
            }

    def process_all_files(self, file_paths: List[Path]) -> None:
        """
        Process all conversation files.

        Args:
            file_paths: List of file paths to process
        """
        logger.info(f"Starting batch processing of {len(file_paths)} files...")

        for i, file_path in enumerate(file_paths, 1):
            logger.info(f"[{i}/{len(file_paths)}] {file_path.name}")
            self.process_file(file_path)

        # Calculate unique partners
        self.stats["unique_partners"] = len(self.global_partners)

        logger.info("Batch processing complete!")
        logger.info(f"  Total files: {self.stats['total_files']}")
        logger.info(f"  Processed: {self.stats['processed_files']}")
        logger.info(f"  Failed: {self.stats['failed_files']}")
        logger.info(f"  Total partnerships: {self.stats['total_partnerships']}")
        logger.info(f"  Unique partners: {self.stats['unique_partners']}")

    def export_statistics(self, output_path: Path) -> None:
        """
        Export statistics to JSON file.

        Args:
            output_path: Path to output JSON file
        """
        logger.info(f"Exporting statistics to {output_path}...")

        # Convert sets to lists for JSON serialization
        global_partners_serializable = {
            partner_name: {
                "partner_name": metadata["partner_name"],
                "mention_count": metadata["mention_count"],
                "partnership_types": list(metadata["partnership_types"]),
                "mentioned_in_countries": list(metadata["mentioned_in_countries"]),
                "mentioned_in_files": list(metadata["mentioned_in_files"]),
            }
            for partner_name, metadata in self.global_partners.items()
        }

        # Get top partners by mention count
        top_partners = sorted(
            global_partners_serializable.items(),
            key=lambda x: x[1]["mention_count"],
            reverse=True
        )[:20]

        statistics = {
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "summary": {
                "total_files": self.stats["total_files"],
                "processed_files": self.stats["processed_files"],
                "failed_files": self.stats["failed_files"],
                "total_partnerships": self.stats["total_partnerships"],
                "unique_partners": self.stats["unique_partners"],
                "unique_institutions": len(self.institution_partnerships),
                "network_edges": len(self.network_edges),
            },
            "partnership_types": dict(self.stats["partnership_types"]),
            "countries": dict(self.stats["countries"]),
            "top_partners": [
                {
                    "rank": i + 1,
                    "partner_name": partner_name,
                    **metadata
                }
                for i, (partner_name, metadata) in enumerate(top_partners)
            ],
            "errors": self.stats["errors"][:20]  # First 20 errors only
        }

        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(statistics, f, indent=2, ensure_ascii=False)

        logger.info(f"  ✓ Statistics exported to {output_path}")

    def export_network_graph(self, output_path: Path) -> None:
        """
        Export network graph data (nodes + edges) to JSON file.

        Format compatible with D3.js, Gephi, Cytoscape.

        Args:
            output_path: Path to output JSON file
        """
        logger.info(f"Exporting network graph to {output_path}...")

        # Create nodes (institutions + partners)
        institution_nodes = [
            {
                "id": institution_name,
                "type": "institution",
                "label": institution_name,
                "partnership_count": len(partnerships)
            }
            for institution_name, partnerships in self.institution_partnerships.items()
        ]

        partner_nodes = [
            {
                "id": partner_name,
                "type": "partner",
                "label": partner_name,
                "mention_count": metadata["mention_count"],
                "partnership_types": list(metadata["partnership_types"]),
            }
            for partner_name, metadata in self.global_partners.items()
        ]

        # Create edges
        edges = [
            {
                "source": institution,
                "target": partner,
                "type": partnership_type,
                "label": partnership_type
            }
            for institution, partner, partnership_type in self.network_edges
        ]

        network_graph = {
            "nodes": institution_nodes + partner_nodes,
            "edges": edges,
            "metadata": {
                "node_count": len(institution_nodes) + len(partner_nodes),
                "institution_count": len(institution_nodes),
                "partner_count": len(partner_nodes),
                "edge_count": len(edges),
                "created_at": datetime.now(timezone.utc).isoformat()
            }
        }

        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(network_graph, f, indent=2, ensure_ascii=False)

        logger.info(f"  ✓ Network graph exported to {output_path}")
        logger.info(f"    Nodes: {network_graph['metadata']['node_count']} "
                    f"(institutions: {network_graph['metadata']['institution_count']}, "
                    f"partners: {network_graph['metadata']['partner_count']})")
        logger.info(f"    Edges: {network_graph['metadata']['edge_count']}")

    def export_rdf_graph(self, output_path: Path) -> None:
        """
        Export unified RDF graph with all partnerships.

        Creates HeritageCustodian records for each institution with
        partnerships, then serializes to RDF/Turtle using the W3C ORG
        ontology (via RDFExporter).

        Args:
            output_path: Path to output Turtle file
        """
        logger.info(f"Generating RDF graph with {len(self.institution_partnerships)} institutions...")

        custodians = []

        for institution_name, partnerships in self.institution_partnerships.items():
            # Get country from first partnership
            country = partnerships[0]["country"] if partnerships else "Unknown"

            # Create Partnership objects
            partnership_objects = [
                Partnership(
                    partner_name=p["partner_name"],
                    partnership_type=p["partnership_type"],
                    description=p.get("description"),
                    start_date=p.get("start_date"),
                    end_date=p.get("end_date"),
                )
                for p in partnerships
            ]

            # Create HeritageCustodian
            custodian = HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/batch/{institution_name.lower().replace(' ', '-')}",
                name=institution_name,
                institution_type=InstitutionType.MIXED,  # Batch extraction doesn't determine specific type
                description=f"Heritage institution identified from conversation analysis ({country})",
                partnerships=partnership_objects,
                provenance=Provenance(
                    data_source=DataSource.CONVERSATION_NLP,
                    data_tier=DataTier.TIER_4_INFERRED,
                    extraction_date=datetime.now(timezone.utc),
                    extraction_method="Batch partnership extraction from GLAM conversations",
                    confidence_score=0.7,  # Batch extraction lower confidence than individual
                )
            )

            custodians.append(custodian)

        # Export to RDF/Turtle
        output_path.parent.mkdir(parents=True, exist_ok=True)

        self.exporter.export_to_file(
            custodians=custodians,
            filepath=str(output_path),
            format="turtle"
        )

        logger.info(f"  ✓ RDF graph exported to {output_path}")
        logger.info(f"    Institutions: {len(custodians)}")
        logger.info(f"    Total partnerships: {sum(len(c.partnerships or []) for c in custodians)}")
|
|
|
|
|
|
def main():
    """Main entry point for batch partnership extraction.

    Usage:
        python scripts/batch_extract_partnerships.py [conversation_dir]

    The conversation directory may be supplied as the first command-line
    argument; when omitted, the historical default path is used, so
    existing invocations keep working unchanged.
    """

    logger.info("=" * 80)
    logger.info("GLAM Partnership Batch Extraction")
    logger.info("=" * 80)

    # Configuration: optional CLI override of the conversation directory
    # (previously this path was hard-coded with no override).
    default_dir = "/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations"
    conversation_dir = Path(sys.argv[1]) if len(sys.argv) > 1 else Path(default_dir)
    export_dir = Path(__file__).parent.parent / "data" / "exports"

    # Validate conversation directory
    if not conversation_dir.exists():
        logger.error(f"Conversation directory not found: {conversation_dir}")
        sys.exit(1)

    # Create extractor
    extractor = PartnershipBatchExtractor(conversation_dir)

    # Step 1: Discover GLAM files
    glam_files = extractor.discover_glam_files()

    if not glam_files:
        logger.error("No GLAM conversation files found!")
        sys.exit(1)

    # Step 2: Process all files
    extractor.process_all_files(glam_files)

    # Step 3: Export statistics
    extractor.export_statistics(export_dir / "partnership_statistics.json")

    # Step 4: Export network graph
    extractor.export_network_graph(export_dir / "partner_network.json")

    # Step 5: Export RDF graph
    extractor.export_rdf_graph(export_dir / "global_glam_partnerships.ttl")

    logger.info("=" * 80)
    logger.info("Batch extraction complete!")
    logger.info(f"Results available in: {export_dir}")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
|