glam/scripts/batch_extract_partnerships.py
#!/usr/bin/env python3
"""
Batch extract partnerships from all GLAM conversation files.
This script:
1. Discovers all GLAM-related conversation JSON files
2. Extracts partnerships using ConversationParser
3. Deduplicates partners globally
4. Generates statistics and network data
5. Exports unified RDF graph with all partnerships
Usage:
python scripts/batch_extract_partnerships.py
Outputs:
- data/exports/global_glam_partnerships.ttl (RDF/Turtle)
- data/exports/partnership_statistics.json (summary statistics)
- data/exports/partner_network.json (network graph data)
- logs/partnership_extraction.log (processing log)
"""
import json
import logging
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Tuple
import sys
# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from glam_extractor.parsers.conversation import ConversationParser
from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import (
HeritageCustodian, Partnership, Provenance, InstitutionType,
DataSource, DataTier
)
# Configure logging
LOG_DIR = Path(__file__).parent.parent / "logs"
LOG_DIR.mkdir(exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(LOG_DIR / "partnership_extraction.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
class PartnershipBatchExtractor:
"""Batch processor for partnership extraction from conversation files"""
def __init__(self, conversation_dir: Path):
"""
Initialize batch extractor.
Args:
conversation_dir: Directory containing conversation JSON files
"""
self.conversation_dir = Path(conversation_dir)
self.parser = ConversationParser()
self.exporter = RDFExporter()
# Statistics tracking
self.stats = {
"total_files": 0,
"processed_files": 0,
"failed_files": 0,
"total_partnerships": 0,
"unique_partners": 0,
"partnership_types": Counter(),
"countries": Counter(),
"errors": []
}
# Global partner registry (partner_name → metadata)
self.global_partners: Dict[str, Dict[str, Any]] = {}
# Institution partnerships (institution_name → list of partnerships)
self.institution_partnerships: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
# Network graph (edges: institution → partner)
self.network_edges: List[Tuple[str, str, str]] = [] # (institution, partner, type)
def discover_glam_files(self) -> List[Path]:
"""
Discover all GLAM-related conversation JSON files.
Returns:
List of file paths
"""
logger.info(f"Scanning {self.conversation_dir} for GLAM conversation files...")
# GLAM-related filename patterns
glam_patterns = [
'glam', 'museum', 'library', 'archive', 'heritage',
'cultural', 'gallery', 'collection', 'catalog'
]
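# Filtering is substring-based on the lowercased filename, e.g. (illustrative):
#   "Brazilian_GLAM_collection_inventories.json" → kept (contains 'glam')
#   "Turkish_Archives_Digital_Repositories.json" → kept (contains 'archive')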
all_json_files = list(self.conversation_dir.glob("*.json"))
# Filter for GLAM-related files
glam_files = []
for file_path in all_json_files:
filename_lower = file_path.name.lower()
if any(pattern in filename_lower for pattern in glam_patterns):
glam_files.append(file_path)
logger.info(f"Found {len(glam_files)} GLAM conversation files (out of {len(all_json_files)} total)")
self.stats["total_files"] = len(glam_files)
return sorted(glam_files) # Sort for reproducibility
def extract_country_from_filename(self, filename: str) -> str:
"""
Extract country/region name from conversation filename.
Args:
filename: Conversation filename
Returns:
Country name (or "Unknown" if not found)
Examples:
- "Brazilian_GLAM_collection_inventories.json""Brazil"
- "Mexican_GLAM_inventories.json""Mexico"
- "Zeeland_GLAM_resources.json""Netherlands (Zeeland)"
- "Turkish_Archives_Digital_Repositories.json""Turkey"
"""
# Comprehensive country/region mapping (adjectives + country names)
# Covers 100+ countries from conversation filenames
country_mapping = {
# Americas
'brazilian': 'Brazil', 'brazil': 'Brazil',
'mexican': 'Mexico', 'mexico': 'Mexico',
'canadian': 'Canada', 'canada': 'Canada',
'chilean': 'Chile', 'chile': 'Chile',
'argentine': 'Argentina', 'argentina': 'Argentina',
'colombian': 'Colombia', 'colombia': 'Colombia',
'peruvian': 'Peru', 'peru': 'Peru',
'cuban': 'Cuba', 'cuba': 'Cuba',
'panamanian': 'Panama', 'panama': 'Panama',
'nicaraguan': 'Nicaragua', 'nicaragua': 'Nicaragua',
'suriname': 'Suriname', 'surinamese': 'Suriname',
'paraguayan': 'Paraguay', 'paraguay': 'Paraguay',
'honduran': 'Honduras', 'honduras': 'Honduras',
'united_states': 'United States',
'mapping_glam_resources_in_the_united': 'United States',  # special-case key matching one truncated filename
# Europe
'dutch': 'Netherlands', 'netherlands': 'Netherlands',
'zeeland': 'Netherlands (Zeeland)',
'limburg': 'Netherlands (Limburg)',
'gelderland': 'Netherlands (Gelderland)',
'drenthe': 'Netherlands (Drenthe)',
'groningen': 'Netherlands (Groningen)',
'turkish': 'Turkey', 'turkey': 'Turkey',
'polish': 'Poland', 'poland': 'Poland',
'hungarian': 'Hungary', 'hungary': 'Hungary',
'norwegian': 'Norway', 'norway': 'Norway',
'portuguese': 'Portugal', 'portugal': 'Portugal',
'belgian': 'Belgium', 'belgium': 'Belgium',
'swedish': 'Sweden', 'sweden': 'Sweden',
'estonian': 'Estonia', 'estonia': 'Estonia',
'croatian': 'Croatia', 'croatia': 'Croatia',
'swiss': 'Switzerland', 'switzerland': 'Switzerland',
'moldavian': 'Moldova', 'moldova': 'Moldova',
'bulgarian': 'Bulgaria', 'bulgaria': 'Bulgaria',
'romanian': 'Romania', 'romania': 'Romania',
'albanian': 'Albania', 'albania': 'Albania',
'bosnian': 'Bosnia and Herzegovina', 'bosnia': 'Bosnia and Herzegovina',
'danish': 'Denmark', 'denmark': 'Denmark',
'austrian': 'Austria', 'austria': 'Austria',
'slovakian': 'Slovakia', 'slovakia': 'Slovakia',
'latvian': 'Latvia', 'latvia': 'Latvia',
# Asia
'vietnamese': 'Vietnam', 'vietnam': 'Vietnam',
'japanese': 'Japan', 'japan': 'Japan',
'thai': 'Thailand', 'thailand': 'Thailand',
'taiwan': 'Taiwan', 'taiwanese': 'Taiwan',
'korean': 'South Korea', 'south_korea': 'South Korea',
'malaysian': 'Malaysia', 'malaysia': 'Malaysia',
'pakistani': 'Pakistan', 'pakistan': 'Pakistan',
'iranian': 'Iran', 'iran': 'Iran',
'uzbekistan': 'Uzbekistan', 'uzbek': 'Uzbekistan',
'armenian': 'Armenia', 'armenia': 'Armenia',
'azerbaijan': 'Azerbaijan', 'azerbaijani': 'Azerbaijan',
'georgian': 'Georgia', 'georgia': 'Georgia',
'nepalese': 'Nepal', 'nepal': 'Nepal',
'myanmar': 'Myanmar', 'burmese': 'Myanmar',
'cambodian': 'Cambodia', 'cambodia': 'Cambodia',
'sri_lankan': 'Sri Lanka', 'sri_lanka': 'Sri Lanka',
'tajikistan': 'Tajikistan', 'tajik': 'Tajikistan',
'turkmenistan': 'Turkmenistan', 'turkmen': 'Turkmenistan',
'philippine': 'Philippines', 'philippines': 'Philippines',
'indonesian': 'Indonesia', 'indonesia': 'Indonesia',
'bhutan': 'Bhutan', 'bhutanese': 'Bhutan',
# Middle East
'iraqi': 'Iraq', 'iraq': 'Iraq',
'jordanian': 'Jordan', 'jordan': 'Jordan',
'egyptian': 'Egypt', 'egypt': 'Egypt',
'saudi': 'Saudi Arabia', 'saudi_arabia': 'Saudi Arabia',
'qatari': 'Qatar', 'qatar': 'Qatar',
'omani': 'Oman', 'oman': 'Oman',
'emirati': 'United Arab Emirates', 'uae': 'United Arab Emirates',
'kuwaiti': 'Kuwait', 'kuwait': 'Kuwait',
'lebanese': 'Lebanon', 'lebanon': 'Lebanon',
'syrian': 'Syria', 'syria': 'Syria',
'palestinian': 'Palestine', 'palestine': 'Palestine',
'yemeni': 'Yemen', 'yemen': 'Yemen',
# Africa
'algerian': 'Algeria', 'algeria': 'Algeria',
'moroccan': 'Morocco', 'morocco': 'Morocco',
'tunisian': 'Tunisia', 'tunisia': 'Tunisia',
'south_african': 'South Africa', 'south_africa': 'South Africa',
'namibian': 'Namibia', 'namibia': 'Namibia',
'ghanaian': 'Ghana', 'ghana': 'Ghana',
'nigerian': 'Nigeria', 'nigeria': 'Nigeria',
'somali': 'Somalia', 'somalia': 'Somalia',
'malian': 'Mali', 'mali': 'Mali',
'senegalese': 'Senegal', 'senegal': 'Senegal',
'mauritanian': 'Mauritania', 'mauritania': 'Mauritania',
'kenyan': 'Kenya', 'kenya': 'Kenya',
'mozambican': 'Mozambique', 'mozambique': 'Mozambique',
'eritrean': 'Eritrea', 'eritrea': 'Eritrea',
'sudanese': 'Sudan', 'sudan': 'Sudan',
'rwandan': 'Rwanda', 'rwanda': 'Rwanda',
'zimbabwean': 'Zimbabwe', 'zimbabwe': 'Zimbabwe',
'congolese': 'Democratic Republic of the Congo',
'congo': 'Democratic Republic of the Congo',
'benin': 'Benin', 'beninese': 'Benin',
# Oceania
'australian': 'Australia', 'australia': 'Australia',
'kiribati': 'Kiribati',
'east_timor': 'Timor-Leste', 'timor': 'Timor-Leste',
# Russia
'russian': 'Russia', 'russia': 'Russia',
# Additional countries not covered above
'libyan': 'Libya', 'libya': 'Libya',
'greek': 'Greece', 'greece': 'Greece',
'serbian': 'Serbia', 'serbia': 'Serbia',
'indian': 'India', 'india': 'India',
'burma': 'Myanmar',  # 'burmese' is already mapped above
'afghan': 'Afghanistan', 'afghanistan': 'Afghanistan',
'laotian': 'Laos', 'laos': 'Laos',
'uruguayan': 'Uruguay', 'uruguay': 'Uruguay',
'finnish': 'Finland', 'finland': 'Finland',
'israeli': 'Israel', 'israel': 'Israel',
'cypriot': 'Cyprus', 'cyprus': 'Cyprus',
'slovak': 'Slovakia',
'slovenian': 'Slovenia', 'slovenia': 'Slovenia',
'macedonian': 'North Macedonia', 'north_macedonia': 'North Macedonia',
'ethiopian': 'Ethiopia', 'ethiopia': 'Ethiopia',
'malagasy': 'Madagascar', 'madagascar': 'Madagascar',
'new_zealand': 'New Zealand', 'new_zealander': 'New Zealand',
'haitian': 'Haiti', 'haiti': 'Haiti',
'jamaican': 'Jamaica', 'jamaica': 'Jamaica',
'vatican': 'Vatican City',
'italian': 'Italy', 'italy': 'Italy',
'arabic_emirates': 'United Arab Emirates',
'maldivian': 'Maldives', 'maldives': 'Maldives',
'burkina': 'Burkina Faso', 'burkina_faso': 'Burkina Faso',
'togolese': 'Togo', 'togo': 'Togo',
'liberian': 'Liberia', 'liberia': 'Liberia',
# Dutch provinces (additional)
'overijssel': 'Netherlands (Overijssel)',
'north_brabant': 'Netherlands (North Brabant)', 'brabant': 'Netherlands (North Brabant)',
'zuid_holland': 'Netherlands (Zuid-Holland)', 'south_holland': 'Netherlands (Zuid-Holland)',
'noord_holland': 'Netherlands (Noord-Holland)', 'north_holland': 'Netherlands (Noord-Holland)',
'friesland': 'Netherlands (Friesland)',
'flevoland': 'Netherlands (Flevoland)',
# Special cases (broader regions/platforms)
'archives_du_maroc': 'Morocco',
}
filename_lower = filename.lower()
# Substring-match country keys against the filename, longest keys first
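# e.g. without longest-first ordering, 'oman' (a substring of 'romanian')
# would match "Romanian_..." filenames and wrongly return "Oman";
# likewise 'mali' is a substring of 'somali'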
for key in sorted(country_mapping.keys(), key=len, reverse=True):
if key in filename_lower:
return country_mapping[key]
return "Unknown"
def extract_institution_name_from_conversation(self, conversation_name: str) -> str:
"""
Generate institution name from conversation title.
For batch processing, the conversation name serves as the institution identifier,
since individual institution names aren't always explicit.
Args:
conversation_name: Conversation title
Returns:
Institution name (simplified conversation title)
"""
# Remove common filler tokens; longest first, since str.replace() removes
# the token anywhere in the title, not only as a trailing suffix
name = conversation_name
for suffix in [' digital resources', ' and catalogues', ' conversation',
' catalogues', ' inventory', ' resources', ' GLAM']:
name = name.replace(suffix, '')
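# Example (illustrative): "Zeeland GLAM resources" → " resources" stripped
# → "Zeeland GLAM" → " GLAM" stripped → "Zeeland"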
# Clean up
name = name.strip()
# If the cleaned name is too short to be meaningful, fall back to the full title
if len(name) < 5:
return conversation_name
return name
def process_file(self, file_path: Path) -> Dict[str, Any]:
"""
Process a single conversation file and extract partnerships.
Args:
file_path: Path to conversation JSON file
Returns:
Dictionary with extraction results:
- success: bool
- partnerships: list
- error: str (if failed)
- metadata: dict
"""
logger.info(f"Processing: {file_path.name}")
try:
# Parse conversation
conversation = self.parser.parse_file(file_path)
# Extract partnerships
partnerships = self.parser.extract_partnerships(conversation)
# Extract country from filename
country = self.extract_country_from_filename(file_path.name)
# Extract institution name from conversation
institution_name = self.extract_institution_name_from_conversation(conversation.name)
# Update statistics
self.stats["processed_files"] += 1
self.stats["total_partnerships"] += len(partnerships)
self.stats["countries"][country] += 1
# Process each partnership
for partnership in partnerships:
partner_name = partnership["partner_name"]
partnership_type = partnership["partnership_type"]
# Update type counter
self.stats["partnership_types"][partnership_type] += 1
# Add to global partner registry
if partner_name not in self.global_partners:
self.global_partners[partner_name] = {
"partner_name": partner_name,
"mention_count": 0,
"partnership_types": set(),
"mentioned_in_countries": set(),
"mentioned_in_files": set(),
}
# Update partner metadata
self.global_partners[partner_name]["mention_count"] += 1
self.global_partners[partner_name]["partnership_types"].add(partnership_type)
self.global_partners[partner_name]["mentioned_in_countries"].add(country)
self.global_partners[partner_name]["mentioned_in_files"].add(file_path.name)
# Add to institution partnerships
self.institution_partnerships[institution_name].append({
**partnership,
"institution": institution_name,
"country": country,
"conversation_id": conversation.uuid,
"conversation_name": conversation.name,
})
# Add to network graph
self.network_edges.append((institution_name, partner_name, partnership_type))
logger.info(f" ✓ Extracted {len(partnerships)} partnerships from {institution_name} ({country})")
return {
"success": True,
"partnerships": partnerships,
"metadata": {
"file_path": str(file_path),
"conversation_id": conversation.uuid,
"conversation_name": conversation.name,
"country": country,
"institution_name": institution_name,
}
}
except Exception as e:
logger.error(f" ✗ Error processing {file_path.name}: {e}")
self.stats["failed_files"] += 1
self.stats["errors"].append({
"file": str(file_path),
"error": str(e)
})
return {
"success": False,
"error": str(e),
"metadata": {
"file_path": str(file_path)
}
}
def process_all_files(self, file_paths: List[Path]) -> None:
"""
Process all conversation files.
Args:
file_paths: List of file paths to process
"""
logger.info(f"Starting batch processing of {len(file_paths)} files...")
for i, file_path in enumerate(file_paths, 1):
logger.info(f"[{i}/{len(file_paths)}] {file_path.name}")
self.process_file(file_path)
# Calculate unique partners
self.stats["unique_partners"] = len(self.global_partners)
logger.info("Batch processing complete!")
logger.info(f" Total files: {self.stats['total_files']}")
logger.info(f" Processed: {self.stats['processed_files']}")
logger.info(f" Failed: {self.stats['failed_files']}")
logger.info(f" Total partnerships: {self.stats['total_partnerships']}")
logger.info(f" Unique partners: {self.stats['unique_partners']}")
def export_statistics(self, output_path: Path) -> None:
"""
Export statistics to JSON file.
Args:
output_path: Path to output JSON file
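The exported JSON contains: extraction_date, summary (file and
partnership counts), partnership_types, countries, top_partners
(top 20 by mention count), and errors (first 20 only).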
"""
logger.info(f"Exporting statistics to {output_path}...")
# Convert sets to lists for JSON serialization
global_partners_serializable = {}
for partner_name, metadata in self.global_partners.items():
global_partners_serializable[partner_name] = {
"partner_name": metadata["partner_name"],
"mention_count": metadata["mention_count"],
"partnership_types": list(metadata["partnership_types"]),
"mentioned_in_countries": list(metadata["mentioned_in_countries"]),
"mentioned_in_files": list(metadata["mentioned_in_files"]),
}
# Get top partners by mention count
top_partners = sorted(
global_partners_serializable.items(),
key=lambda x: x[1]["mention_count"],
reverse=True
)[:20]
statistics = {
"extraction_date": datetime.now(timezone.utc).isoformat(),
"summary": {
"total_files": self.stats["total_files"],
"processed_files": self.stats["processed_files"],
"failed_files": self.stats["failed_files"],
"total_partnerships": self.stats["total_partnerships"],
"unique_partners": self.stats["unique_partners"],
"unique_institutions": len(self.institution_partnerships),
"network_edges": len(self.network_edges),
},
"partnership_types": dict(self.stats["partnership_types"]),
"countries": dict(self.stats["countries"]),
"top_partners": [
{
"rank": i + 1,
"partner_name": partner_name,
**metadata
}
for i, (partner_name, metadata) in enumerate(top_partners)
],
"errors": self.stats["errors"][:20] # First 20 errors only
}
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(statistics, f, indent=2, ensure_ascii=False)
logger.info(f" ✓ Statistics exported to {output_path}")
def export_network_graph(self, output_path: Path) -> None:
"""
Export network graph data (nodes + edges) to JSON file.
Format compatible with D3.js, Gephi, Cytoscape.
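Example records (illustrative; names are placeholders):
node: {"id": "Europeana", "type": "partner", "label": "Europeana",
"mention_count": 3, "partnership_types": ["aggregation"]}
edge: {"source": "Zeeland GLAM", "target": "Europeana",
"type": "aggregation", "label": "aggregation"}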
Args:
output_path: Path to output JSON file
"""
logger.info(f"Exporting network graph to {output_path}...")
# Create nodes (institutions + partners)
institution_nodes = [
{
"id": institution_name,
"type": "institution",
"label": institution_name,
"partnership_count": len(partnerships)
}
for institution_name, partnerships in self.institution_partnerships.items()
]
partner_nodes = [
{
"id": partner_name,
"type": "partner",
"label": partner_name,
"mention_count": metadata["mention_count"],
"partnership_types": list(metadata["partnership_types"]),
}
for partner_name, metadata in self.global_partners.items()
]
# Create edges
edges = [
{
"source": institution,
"target": partner,
"type": partnership_type,
"label": partnership_type
}
for institution, partner, partnership_type in self.network_edges
]
network_graph = {
"nodes": institution_nodes + partner_nodes,
"edges": edges,
"metadata": {
"node_count": len(institution_nodes) + len(partner_nodes),
"institution_count": len(institution_nodes),
"partner_count": len(partner_nodes),
"edge_count": len(edges),
"created_at": datetime.now(timezone.utc).isoformat()
}
}
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(network_graph, f, indent=2, ensure_ascii=False)
logger.info(f" ✓ Network graph exported to {output_path}")
logger.info(f" Nodes: {network_graph['metadata']['node_count']} "
f"(institutions: {network_graph['metadata']['institution_count']}, "
f"partners: {network_graph['metadata']['partner_count']})")
logger.info(f" Edges: {network_graph['metadata']['edge_count']}")
def export_rdf_graph(self, output_path: Path) -> None:
"""
Export unified RDF graph with all partnerships.
Creates HeritageCustodian records for each institution with partnerships,
then serializes to RDF/Turtle using the W3C Organization (ORG) ontology.
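Illustrative Turtle shape (the exact vocabulary is determined by
RDFExporter and is not guaranteed here):
<https://w3id.org/heritage/custodian/batch/example-institution>
a org:Organization ;
rdfs:label "Example institution" ;
org:linkedTo <partner-uri> .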
Args:
output_path: Path to output Turtle file
"""
logger.info(f"Generating RDF graph with {len(self.institution_partnerships)} institutions...")
custodians = []
for institution_name, partnerships in self.institution_partnerships.items():
# Get country from first partnership
country = partnerships[0]["country"] if partnerships else "Unknown"
# Create Partnership objects
partnership_objects = []
for p in partnerships:
partnership_obj = Partnership(
partner_name=p["partner_name"],
partnership_type=p["partnership_type"],
description=p.get("description"),
start_date=p.get("start_date"),
end_date=p.get("end_date"),
)
partnership_objects.append(partnership_obj)
# Create HeritageCustodian
custodian = HeritageCustodian(
id=f"https://w3id.org/heritage/custodian/batch/{institution_name.lower().replace(' ', '-')}",
name=institution_name,
institution_type=InstitutionType.MIXED, # Batch extraction doesn't determine specific type
description=f"Heritage institution identified from conversation analysis ({country})",
partnerships=partnership_objects,
provenance=Provenance(
data_source=DataSource.CONVERSATION_NLP,
data_tier=DataTier.TIER_4_INFERRED,
extraction_date=datetime.now(timezone.utc),
extraction_method="Batch partnership extraction from GLAM conversations",
confidence_score=0.7,  # batch extraction is scored lower than individually reviewed records
)
)
custodians.append(custodian)
# Export to RDF/Turtle
output_path.parent.mkdir(parents=True, exist_ok=True)
self.exporter.export_to_file(
custodians=custodians,
filepath=str(output_path),
format="turtle"
)
logger.info(f" ✓ RDF graph exported to {output_path}")
logger.info(f" Institutions: {len(custodians)}")
logger.info(f" Total partnerships: {sum(len(c.partnerships or []) for c in custodians)}")
def main():
"""Main entry point for batch partnership extraction"""
logger.info("=" * 80)
logger.info("GLAM Partnership Batch Extraction")
logger.info("=" * 80)
# Configuration (CONVERSATION_DIR is a machine-specific absolute path; adjust for your environment)
CONVERSATION_DIR = Path("/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations")
EXPORT_DIR = Path(__file__).parent.parent / "data" / "exports"
# Validate conversation directory
if not CONVERSATION_DIR.exists():
logger.error(f"Conversation directory not found: {CONVERSATION_DIR}")
sys.exit(1)
# Create extractor
extractor = PartnershipBatchExtractor(CONVERSATION_DIR)
# Step 1: Discover GLAM files
glam_files = extractor.discover_glam_files()
if not glam_files:
logger.error("No GLAM conversation files found!")
sys.exit(1)
# Step 2: Process all files
extractor.process_all_files(glam_files)
# Step 3: Export statistics
extractor.export_statistics(EXPORT_DIR / "partnership_statistics.json")
# Step 4: Export network graph
extractor.export_network_graph(EXPORT_DIR / "partner_network.json")
# Step 5: Export RDF graph
extractor.export_rdf_graph(EXPORT_DIR / "global_glam_partnerships.ttl")
logger.info("=" * 80)
logger.info("Batch extraction complete!")
logger.info(f"Results available in: {EXPORT_DIR}")
logger.info("=" * 80)
if __name__ == "__main__":
main()