#!/usr/bin/env python3
"""
Batch extract partnerships from all GLAM conversation files.

This script:
1. Discovers all GLAM-related conversation JSON files
2. Extracts partnerships using ConversationParser
3. Deduplicates partners globally
4. Generates statistics and network data
5. Exports unified RDF graph with all partnerships

Usage:
    python scripts/batch_extract_partnerships.py

Outputs:
    - data/exports/global_glam_partnerships.ttl (RDF/Turtle)
    - data/exports/partnership_statistics.json (summary statistics)
    - data/exports/partner_network.json (network graph data)
    - logs/partnership_extraction.log (processing log)
"""

import json
import logging
import sys
from collections import Counter, defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any, Set, Tuple

# Add src to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from glam_extractor.parsers.conversation import ConversationParser
from glam_extractor.exporters.rdf_exporter import RDFExporter
from glam_extractor.models import (
    HeritageCustodian,
    Partnership,
    Provenance,
    InstitutionType,
    DataSource,
    DataTier,
    OrganizationStatus,
)

# Configure logging
LOG_DIR = Path(__file__).parent.parent / "logs"
LOG_DIR.mkdir(exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_DIR / "partnership_extraction.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class PartnershipBatchExtractor:
    """Batch processor for partnership extraction from conversation files"""

    def __init__(self, conversation_dir: Path):
        """
        Initialize batch extractor.

        Args:
            conversation_dir: Directory containing conversation JSON files
        """
        self.conversation_dir = Path(conversation_dir)
        self.parser = ConversationParser()
        self.exporter = RDFExporter()

        # Statistics tracking
        self.stats = {
            "total_files": 0,
            "processed_files": 0,
            "failed_files": 0,
            "total_partnerships": 0,
            "unique_partners": 0,
            "partnership_types": Counter(),
            "countries": Counter(),
            "errors": []
        }

        # Global partner registry (partner_name → metadata)
        self.global_partners: Dict[str, Dict[str, Any]] = {}

        # Institution partnerships (institution_name → list of partnerships)
        self.institution_partnerships: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

        # Network graph (edges: institution → partner)
        self.network_edges: List[Tuple[str, str, str]] = []  # (institution, partner, type)

    def discover_glam_files(self) -> List[Path]:
        """
        Discover all GLAM-related conversation JSON files.

        Returns:
            List of file paths
        """
        logger.info(f"Scanning {self.conversation_dir} for GLAM conversation files...")

        # GLAM-related filename patterns
        glam_patterns = [
            'glam', 'museum', 'library', 'archive', 'heritage',
            'cultural', 'gallery', 'collection', 'catalog'
        ]

        all_json_files = list(self.conversation_dir.glob("*.json"))

        # Filter for GLAM-related files
        glam_files = []
        for file_path in all_json_files:
            filename_lower = file_path.name.lower()
            if any(pattern in filename_lower for pattern in glam_patterns):
                glam_files.append(file_path)

        logger.info(f"Found {len(glam_files)} GLAM conversation files "
                    f"(out of {len(all_json_files)} total)")
        self.stats["total_files"] = len(glam_files)

        return sorted(glam_files)  # Sort for reproducibility
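    # A minimal usage sketch for the discovery step (the directory path here is
    # hypothetical; see main() below for the configured location):
    #
    #   extractor = PartnershipBatchExtractor(Path("data/conversations"))
    #   files = extractor.discover_glam_files()
    #   # -> e.g. [Path('data/conversations/Brazilian_GLAM_collection_inventories.json'), ...]
    #
    # Matching is substring-based, so a file like "Mapping_cultural_heritage.json"
    # qualifies through both the 'cultural' and 'heritage' patterns.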
    def extract_country_from_filename(self, filename: str) -> str:
        """
        Extract country/region name from conversation filename.

        Args:
            filename: Conversation filename

        Returns:
            Country name (or "Unknown" if not found)

        Examples:
            - "Brazilian_GLAM_collection_inventories.json" → "Brazil"
            - "Mexican_GLAM_inventories.json" → "Mexico"
            - "Zeeland_GLAM_resources.json" → "Netherlands (Zeeland)"
            - "Turkish_Archives_Digital_Repositories.json" → "Turkey"
        """
        # Comprehensive country/region mapping (adjectives + country names)
        # Covers 100+ countries from conversation filenames
        country_mapping = {
            # Americas
            'brazilian': 'Brazil', 'brazil': 'Brazil',
            'mexican': 'Mexico', 'mexico': 'Mexico',
            'canadian': 'Canada', 'canada': 'Canada',
            'chilean': 'Chile', 'chile': 'Chile',
            'argentine': 'Argentina', 'argentina': 'Argentina',
            'colombian': 'Colombia', 'colombia': 'Colombia',
            'peruvian': 'Peru', 'peru': 'Peru',
            'cuban': 'Cuba', 'cuba': 'Cuba',
            'panamanian': 'Panama', 'panama': 'Panama',
            'nicaraguan': 'Nicaragua', 'nicaragua': 'Nicaragua',
            'suriname': 'Suriname', 'surinamese': 'Suriname',
            'paraguayan': 'Paraguay', 'paraguay': 'Paraguay',
            'honduran': 'Honduras', 'honduras': 'Honduras',
            'united_states': 'United States',
            'mapping_glam_resources_in_the_united': 'United States',

            # Europe
            'dutch': 'Netherlands', 'netherlands': 'Netherlands',
            'zeeland': 'Netherlands (Zeeland)',
            'limburg': 'Netherlands (Limburg)',
            'gelderland': 'Netherlands (Gelderland)',
            'drenthe': 'Netherlands (Drenthe)',
            'groningen': 'Netherlands (Groningen)',
            'turkish': 'Turkey', 'turkey': 'Turkey',
            'polish': 'Poland', 'poland': 'Poland',
            'hungarian': 'Hungary', 'hungary': 'Hungary',
            'norwegian': 'Norway', 'norway': 'Norway',
            'portuguese': 'Portugal', 'portugal': 'Portugal',
            'belgian': 'Belgium', 'belgium': 'Belgium',
            'swedish': 'Sweden', 'sweden': 'Sweden',
            'estonian': 'Estonia', 'estonia': 'Estonia',
            'croatian': 'Croatia', 'croatia': 'Croatia',
            'swiss': 'Switzerland', 'switzerland': 'Switzerland',
            'moldavian': 'Moldova', 'moldova': 'Moldova',
            'bulgarian': 'Bulgaria', 'bulgaria': 'Bulgaria',
            'romanian': 'Romania', 'romania': 'Romania',
            'albanian': 'Albania', 'albania': 'Albania',
            'bosnian': 'Bosnia and Herzegovina', 'bosnia': 'Bosnia and Herzegovina',
            'danish': 'Denmark', 'denmark': 'Denmark',
            'austrian': 'Austria', 'austria': 'Austria',
            'slovakian': 'Slovakia', 'slovakia': 'Slovakia',
            'latvian': 'Latvia', 'latvia': 'Latvia',

            # Asia
            'vietnamese': 'Vietnam', 'vietnam': 'Vietnam',
            'japanese': 'Japan', 'japan': 'Japan',
            'thai': 'Thailand', 'thailand': 'Thailand',
            'taiwan': 'Taiwan', 'taiwanese': 'Taiwan',
            'korean': 'South Korea', 'south_korea': 'South Korea',
            'malaysian': 'Malaysia', 'malaysia': 'Malaysia',
            'pakistani': 'Pakistan', 'pakistan': 'Pakistan',
            'iranian': 'Iran', 'iran': 'Iran',
            'uzbekistan': 'Uzbekistan', 'uzbek': 'Uzbekistan',
            'armenian': 'Armenia', 'armenia': 'Armenia',
            'azerbaijan': 'Azerbaijan', 'azerbaijani': 'Azerbaijan',
            'georgian': 'Georgia', 'georgia': 'Georgia',
            'nepalese': 'Nepal', 'nepal': 'Nepal',
            'myanmar': 'Myanmar', 'burmese': 'Myanmar',
            'cambodian': 'Cambodia', 'cambodia': 'Cambodia',
            'sri_lankan': 'Sri Lanka', 'sri_lanka': 'Sri Lanka',
            'tajikistan': 'Tajikistan', 'tajik': 'Tajikistan',
            'turkmenistan': 'Turkmenistan', 'turkmen': 'Turkmenistan',
            'philippine': 'Philippines', 'philippines': 'Philippines',
            'indonesian': 'Indonesia', 'indonesia': 'Indonesia',
            'bhutan': 'Bhutan', 'bhutanese': 'Bhutan',

            # Middle East
            'iraqi': 'Iraq', 'iraq': 'Iraq',
            'jordanian': 'Jordan', 'jordan': 'Jordan',
            'egyptian': 'Egypt', 'egypt': 'Egypt',
            'saudi': 'Saudi Arabia', 'saudi_arabia': 'Saudi Arabia',
            'qatari': 'Qatar', 'qatar': 'Qatar',
            'omani': 'Oman', 'oman': 'Oman',
            'emirati': 'United Arab Emirates', 'uae': 'United Arab Emirates',
            'kuwaiti': 'Kuwait', 'kuwait': 'Kuwait',
            'lebanese': 'Lebanon', 'lebanon': 'Lebanon',
            'syrian': 'Syria', 'syria': 'Syria',
            'palestinian': 'Palestine', 'palestine': 'Palestine',
            'yemeni': 'Yemen', 'yemen': 'Yemen',

            # Africa
            'algerian': 'Algeria', 'algeria': 'Algeria',
            'moroccan': 'Morocco', 'morocco': 'Morocco',
            'tunisian': 'Tunisia', 'tunisia': 'Tunisia',
            'south_african': 'South Africa', 'south_africa': 'South Africa',
            'namibian': 'Namibia', 'namibia': 'Namibia',
            'ghanaian': 'Ghana', 'ghana': 'Ghana',
            'nigerian': 'Nigeria', 'nigeria': 'Nigeria',
            'somali': 'Somalia', 'somalia': 'Somalia',
            'malian': 'Mali', 'mali': 'Mali',
            'senegalese': 'Senegal', 'senegal': 'Senegal',
            'mauritanian': 'Mauritania', 'mauritania': 'Mauritania',
            'kenyan': 'Kenya', 'kenya': 'Kenya',
            'mozambican': 'Mozambique', 'mozambique': 'Mozambique',
            'eritrean': 'Eritrea', 'eritrea': 'Eritrea',
            'sudanese': 'Sudan', 'sudan': 'Sudan',
            'rwandan': 'Rwanda', 'rwanda': 'Rwanda',
            'zimbabwean': 'Zimbabwe', 'zimbabwe': 'Zimbabwe',
            'congolese': 'Democratic Republic of the Congo', 'congo': 'Democratic Republic of the Congo',
            'benin': 'Benin', 'beninese': 'Benin',

            # Oceania
            'australian': 'Australia', 'australia': 'Australia',
            'kiribati': 'Kiribati',
            'east_timor': 'Timor-Leste', 'timor': 'Timor-Leste',

            # Russia
            'russian': 'Russia', 'russia': 'Russia',

            # Additional countries from missing mappings
            'libyan': 'Libya', 'libya': 'Libya',
            'greek': 'Greece', 'greece': 'Greece',
            'serbian': 'Serbia', 'serbia': 'Serbia',
            'indian': 'India', 'india': 'India',
            'burma': 'Myanmar',  # 'burmese' already mapped under Asia
            'afghan': 'Afghanistan', 'afghanistan': 'Afghanistan',
            'laotian': 'Laos', 'laos': 'Laos',
            'uruguayan': 'Uruguay', 'uruguay': 'Uruguay',
            'finnish': 'Finland', 'finland': 'Finland',
            'israeli': 'Israel', 'israel': 'Israel',
            'cypriot': 'Cyprus', 'cyprus': 'Cyprus',
            'slovak': 'Slovakia',
            'slovenian': 'Slovenia', 'slovenia': 'Slovenia',
            'macedonian': 'North Macedonia', 'north_macedonia': 'North Macedonia',
            'ethiopian': 'Ethiopia', 'ethiopia': 'Ethiopia',
            'malagasy': 'Madagascar', 'madagascar': 'Madagascar',
            'new_zealand': 'New Zealand', 'new_zealander': 'New Zealand',
            'haitian': 'Haiti', 'haiti': 'Haiti',
            'jamaican': 'Jamaica', 'jamaica': 'Jamaica',
            'vatican': 'Vatican City',
            'italian': 'Italy', 'italy': 'Italy',
            'arabic_emirates': 'United Arab Emirates',
            'maldivian': 'Maldives', 'maldives': 'Maldives',
            'burkina': 'Burkina Faso', 'burkina_faso': 'Burkina Faso',
            'togolese': 'Togo', 'togo': 'Togo',
            'liberian': 'Liberia', 'liberia': 'Liberia',

            # Dutch provinces (additional)
            'overijssel': 'Netherlands (Overijssel)',
            'north_brabant': 'Netherlands (North Brabant)', 'brabant': 'Netherlands (North Brabant)',
            'zuid_holland': 'Netherlands (Zuid-Holland)', 'south_holland': 'Netherlands (Zuid-Holland)',
            'noord_holland': 'Netherlands (Noord-Holland)', 'north_holland': 'Netherlands (Noord-Holland)',
            'friesland': 'Netherlands (Friesland)',
            'flevoland': 'Netherlands (Flevoland)',

            # Special cases (broader regions/platforms)
            'archives_du_maroc': 'Morocco',
        }

        filename_lower = filename.lower()

        # Substring match against the filename, trying longer keys first
        # so more specific keys win over keys that are their substrings
        for key in sorted(country_mapping.keys(), key=len, reverse=True):
            if key in filename_lower:
                return country_mapping[key]

        return "Unknown"
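    # Longest-key-first matching matters because some keys are substrings of
    # others. Illustration (hypothetical filenames):
    #
    #   extractor.extract_country_from_filename("Romanian_GLAM_inventories.json")
    #   # -> "Romania"   ('romanian', 8 chars, is tried before 'oman', 4 chars,
    #   #                 which is also a substring of "romanian")
    #   extractor.extract_country_from_filename("Somali_heritage_archives.json")
    #   # -> "Somalia"   ('somali' is tried before 'mali')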
    def extract_institution_name_from_conversation(self, conversation_name: str) -> str:
        """
        Generate institution name from conversation title.

        For batch processing, we use conversation name as institution identifier
        since individual institution names aren't always explicit.

        Args:
            conversation_name: Conversation title

        Returns:
            Institution name (simplified conversation title)
        """
        # Strip common suffix phrases (note: str.replace removes them wherever
        # they occur in the title, not only at the end)
        name = conversation_name
        for suffix in [' conversation', ' inventory', ' resources', ' catalogues',
                       ' and catalogues', ' digital resources', ' GLAM']:
            name = name.replace(suffix, '')

        # Clean up
        name = name.strip()

        # If the result is too short to be meaningful, fall back to the full title
        if len(name) < 5:
            return conversation_name

        return name

    def process_file(self, file_path: Path) -> Dict[str, Any]:
        """
        Process a single conversation file and extract partnerships.

        Args:
            file_path: Path to conversation JSON file

        Returns:
            Dictionary with extraction results:
            - success: bool
            - partnerships: list
            - error: str (if failed)
            - metadata: dict
        """
        logger.info(f"Processing: {file_path.name}")

        try:
            # Parse conversation
            conversation = self.parser.parse_file(file_path)

            # Extract partnerships
            partnerships = self.parser.extract_partnerships(conversation)

            # Extract country from filename
            country = self.extract_country_from_filename(file_path.name)

            # Extract institution name from conversation
            institution_name = self.extract_institution_name_from_conversation(conversation.name)

            # Update statistics
            self.stats["processed_files"] += 1
            self.stats["total_partnerships"] += len(partnerships)
            self.stats["countries"][country] += 1

            # Process each partnership
            for partnership in partnerships:
                partner_name = partnership["partner_name"]
                partnership_type = partnership["partnership_type"]

                # Update type counter
                self.stats["partnership_types"][partnership_type] += 1

                # Add to global partner registry
                if partner_name not in self.global_partners:
                    self.global_partners[partner_name] = {
                        "partner_name": partner_name,
                        "mention_count": 0,
                        "partnership_types": set(),
                        "mentioned_in_countries": set(),
                        "mentioned_in_files": set(),
                    }

                # Update partner metadata
                self.global_partners[partner_name]["mention_count"] += 1
                self.global_partners[partner_name]["partnership_types"].add(partnership_type)
                self.global_partners[partner_name]["mentioned_in_countries"].add(country)
                self.global_partners[partner_name]["mentioned_in_files"].add(file_path.name)

                # Add to institution partnerships
                self.institution_partnerships[institution_name].append({
                    **partnership,
                    "institution": institution_name,
                    "country": country,
                    "conversation_id": conversation.uuid,
                    "conversation_name": conversation.name,
                })

                # Add to network graph
                self.network_edges.append((institution_name, partner_name, partnership_type))

            logger.info(f"  ✓ Extracted {len(partnerships)} partnerships "
                        f"from {institution_name} ({country})")

            return {
                "success": True,
                "partnerships": partnerships,
                "metadata": {
                    "file_path": str(file_path),
                    "conversation_id": conversation.uuid,
                    "conversation_name": conversation.name,
                    "country": country,
                    "institution_name": institution_name,
                }
            }

        except Exception as e:
            logger.error(f"  ✗ Error processing {file_path.name}: {e}")
            self.stats["failed_files"] += 1
            self.stats["errors"].append({
                "file": str(file_path),
                "error": str(e)
            })
            return {
                "success": False,
                "error": str(e),
                "metadata": {
                    "file_path": str(file_path)
                }
            }
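    # Shape of a successful process_file result (values are illustrative; the
    # partnership keys shown are the ones this script reads — extra keys from
    # ConversationParser.extract_partnerships pass through via **partnership):
    #
    #   {
    #       "success": True,
    #       "partnerships": [
    #           {"partner_name": "UNESCO", "partnership_type": "funding",
    #            "description": None, "start_date": None, "end_date": None},
    #       ],
    #       "metadata": {
    #           "file_path": "data/conversations/Brazilian_GLAM_collection_inventories.json",
    #           "conversation_id": "…", "conversation_name": "…",
    #           "country": "Brazil", "institution_name": "Brazilian collection inventories",
    #       },
    #   }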
    def process_all_files(self, file_paths: List[Path]) -> None:
        """
        Process all conversation files.

        Args:
            file_paths: List of file paths to process
        """
        logger.info(f"Starting batch processing of {len(file_paths)} files...")

        for i, file_path in enumerate(file_paths, 1):
            logger.info(f"[{i}/{len(file_paths)}] {file_path.name}")
            self.process_file(file_path)

        # Calculate unique partners
        self.stats["unique_partners"] = len(self.global_partners)

        logger.info("Batch processing complete!")
        logger.info(f"  Total files: {self.stats['total_files']}")
        logger.info(f"  Processed: {self.stats['processed_files']}")
        logger.info(f"  Failed: {self.stats['failed_files']}")
        logger.info(f"  Total partnerships: {self.stats['total_partnerships']}")
        logger.info(f"  Unique partners: {self.stats['unique_partners']}")

    def export_statistics(self, output_path: Path) -> None:
        """
        Export statistics to JSON file.

        Args:
            output_path: Path to output JSON file
        """
        logger.info(f"Exporting statistics to {output_path}...")

        # Convert sets to lists for JSON serialization
        global_partners_serializable = {}
        for partner_name, metadata in self.global_partners.items():
            global_partners_serializable[partner_name] = {
                "partner_name": metadata["partner_name"],
                "mention_count": metadata["mention_count"],
                "partnership_types": list(metadata["partnership_types"]),
                "mentioned_in_countries": list(metadata["mentioned_in_countries"]),
                "mentioned_in_files": list(metadata["mentioned_in_files"]),
            }

        # Get top partners by mention count
        top_partners = sorted(
            global_partners_serializable.items(),
            key=lambda x: x[1]["mention_count"],
            reverse=True
        )[:20]

        statistics = {
            "extraction_date": datetime.now(timezone.utc).isoformat(),
            "summary": {
                "total_files": self.stats["total_files"],
                "processed_files": self.stats["processed_files"],
                "failed_files": self.stats["failed_files"],
                "total_partnerships": self.stats["total_partnerships"],
                "unique_partners": self.stats["unique_partners"],
                "unique_institutions": len(self.institution_partnerships),
                "network_edges": len(self.network_edges),
            },
            "partnership_types": dict(self.stats["partnership_types"]),
            "countries": dict(self.stats["countries"]),
            "top_partners": [
                {
                    "rank": i + 1,
                    "partner_name": partner_name,
                    **metadata
                }
                for i, (partner_name, metadata) in enumerate(top_partners)
            ],
            "errors": self.stats["errors"][:20]  # First 20 errors only
        }

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(statistics, f, indent=2, ensure_ascii=False)

        logger.info(f"  ✓ Statistics exported to {output_path}")
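    # Reading the exported statistics back (a minimal sketch; the path matches
    # the default used in main() below, relative to the repository root):
    #
    #   with open("data/exports/partnership_statistics.json", encoding="utf-8") as f:
    #       stats = json.load(f)
    #   for entry in stats["top_partners"][:5]:
    #       print(entry["rank"], entry["partner_name"], entry["mention_count"])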
    def export_network_graph(self, output_path: Path) -> None:
        """
        Export network graph data (nodes + edges) to JSON file.

        Format compatible with D3.js, Gephi, Cytoscape.

        Args:
            output_path: Path to output JSON file
        """
        logger.info(f"Exporting network graph to {output_path}...")

        # Create nodes (institutions + partners)
        institution_nodes = [
            {
                "id": institution_name,
                "type": "institution",
                "label": institution_name,
                "partnership_count": len(partnerships)
            }
            for institution_name, partnerships in self.institution_partnerships.items()
        ]

        partner_nodes = [
            {
                "id": partner_name,
                "type": "partner",
                "label": partner_name,
                "mention_count": metadata["mention_count"],
                "partnership_types": list(metadata["partnership_types"]),
            }
            for partner_name, metadata in self.global_partners.items()
        ]

        # Create edges
        edges = [
            {
                "source": institution,
                "target": partner,
                "type": partnership_type,
                "label": partnership_type
            }
            for institution, partner, partnership_type in self.network_edges
        ]

        network_graph = {
            "nodes": institution_nodes + partner_nodes,
            "edges": edges,
            "metadata": {
                "node_count": len(institution_nodes) + len(partner_nodes),
                "institution_count": len(institution_nodes),
                "partner_count": len(partner_nodes),
                "edge_count": len(edges),
                "created_at": datetime.now(timezone.utc).isoformat()
            }
        }

        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(network_graph, f, indent=2, ensure_ascii=False)

        logger.info(f"  ✓ Network graph exported to {output_path}")
        logger.info(f"    Nodes: {network_graph['metadata']['node_count']} "
                    f"(institutions: {network_graph['metadata']['institution_count']}, "
                    f"partners: {network_graph['metadata']['partner_count']})")
        logger.info(f"    Edges: {network_graph['metadata']['edge_count']}")
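    # The node/edge lists load directly into common graph tools. A sketch for
    # networkx (an assumption — networkx is not a dependency of this script;
    # MultiDiGraph preserves parallel edges when the same institution/partner
    # pair appears with several partnership types):
    #
    #   import networkx as nx
    #   with open("data/exports/partner_network.json", encoding="utf-8") as f:
    #       data = json.load(f)
    #   G = nx.MultiDiGraph()
    #   for node in data["nodes"]:
    #       G.add_node(node["id"], **{k: v for k, v in node.items() if k != "id"})
    #   for edge in data["edges"]:
    #       G.add_edge(edge["source"], edge["target"], type=edge["type"])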
    def export_rdf_graph(self, output_path: Path) -> None:
        """
        Export unified RDF graph with all partnerships.

        Creates HeritageCustodian records for each institution with partnerships,
        then serializes to RDF/Turtle using W3C ORG ontology.

        Args:
            output_path: Path to output Turtle file
        """
        logger.info(f"Generating RDF graph with {len(self.institution_partnerships)} institutions...")

        custodians = []
        for institution_name, partnerships in self.institution_partnerships.items():
            # Get country from first partnership
            country = partnerships[0]["country"] if partnerships else "Unknown"

            # Create Partnership objects
            partnership_objects = []
            for p in partnerships:
                partnership_obj = Partnership(
                    partner_name=p["partner_name"],
                    partnership_type=p["partnership_type"],
                    description=p.get("description"),
                    start_date=p.get("start_date"),
                    end_date=p.get("end_date"),
                )
                partnership_objects.append(partnership_obj)

            # Create HeritageCustodian
            custodian = HeritageCustodian(
                id=f"https://w3id.org/heritage/custodian/batch/{institution_name.lower().replace(' ', '-')}",
                name=institution_name,
                institution_type=InstitutionType.MIXED,  # Batch extraction doesn't determine specific type
                description=f"Heritage institution identified from conversation analysis ({country})",
                partnerships=partnership_objects,
                provenance=Provenance(
                    data_source=DataSource.CONVERSATION_NLP,
                    data_tier=DataTier.TIER_4_INFERRED,
                    extraction_date=datetime.now(timezone.utc),
                    extraction_method="Batch partnership extraction from GLAM conversations",
                    confidence_score=0.7,  # Batch extraction: lower confidence than individual review
                )
            )
            custodians.append(custodian)

        # Export to RDF/Turtle
        output_path.parent.mkdir(parents=True, exist_ok=True)
        self.exporter.export_to_file(
            custodians=custodians,
            filepath=str(output_path),
            format="turtle"
        )

        logger.info(f"  ✓ RDF graph exported to {output_path}")
        logger.info(f"    Institutions: {len(custodians)}")
        logger.info(f"    Total partnerships: {sum(len(c.partnerships or []) for c in custodians)}")


def main():
    """Main entry point for batch partnership extraction"""
    logger.info("=" * 80)
    logger.info("GLAM Partnership Batch Extraction")
    logger.info("=" * 80)

    # Configuration
    CONVERSATION_DIR = Path("/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations")
    EXPORT_DIR = Path(__file__).parent.parent / "data" / "exports"

    # Validate conversation directory
    if not CONVERSATION_DIR.exists():
        logger.error(f"Conversation directory not found: {CONVERSATION_DIR}")
        sys.exit(1)

    # Create extractor
    extractor = PartnershipBatchExtractor(CONVERSATION_DIR)

    # Step 1: Discover GLAM files
    glam_files = extractor.discover_glam_files()
    if not glam_files:
        logger.error("No GLAM conversation files found!")
        sys.exit(1)

    # Step 2: Process all files
    extractor.process_all_files(glam_files)

    # Step 3: Export statistics
    extractor.export_statistics(EXPORT_DIR / "partnership_statistics.json")

    # Step 4: Export network graph
    extractor.export_network_graph(EXPORT_DIR / "partner_network.json")

    # Step 5: Export RDF graph
    extractor.export_rdf_graph(EXPORT_DIR / "global_glam_partnerships.ttl")

    logger.info("=" * 80)
    logger.info("Batch extraction complete!")
    logger.info(f"Results available in: {EXPORT_DIR}")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()
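# A quick sanity check of the Turtle export (a sketch; assumes rdflib is
# available, which typically backs Turtle-producing exporters like RDFExporter,
# though that is not confirmed here):
#
#   from rdflib import Graph
#   g = Graph()
#   g.parse("data/exports/global_glam_partnerships.ttl", format="turtle")
#   print(len(g), "triples")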