#!/usr/bin/env python3 """ Batch extract heritage institutions from conversation JSON files. This script processes all conversation files in docs/reflection/ and extracts heritage institution data (TIER_4) using NLP pattern matching. Usage: python extract_conversations_batch.py [--limit N] [--country CODE] """ import json import re import yaml from pathlib import Path from datetime import datetime, timezone from typing import List, Dict, Any, Set, Tuple from collections import defaultdict import argparse # Institution type keywords (from AGENTS.md taxonomy) INSTITUTION_KEYWORDS = { 'MUSEUM': [ 'museum', 'museo', 'museu', 'musée', 'muzeum', 'muzej', 'art gallery', 'kunstmuseum', 'kunsthal' ], 'LIBRARY': [ 'library', 'biblioteca', 'bibliothek', 'bibliotheek', 'bibliothèque', 'biblioteka', 'national library', 'public library', 'university library' ], 'ARCHIVE': [ 'archive', 'archiv', 'archivo', 'arquivo', 'archief', 'national archive', 'state archive', 'regional archive' ], 'GALLERY': [ 'gallery', 'galerie', 'galería', 'kunsthal', 'art center' ], 'RESEARCH_CENTER': [ 'research center', 'research centre', 'research institute', 'documentation center', 'knowledge center' ], 'BOTANICAL_ZOO': [ 'botanical garden', 'botanic garden', 'arboretum', 'zoo', 'zoological garden', 'zoological park' ], 'EDUCATION_PROVIDER': [ 'university', 'universidad', 'universiteit', 'université', 'college', 'school', 'institute', 'academy' ], 'HOLY_SITES': [ 'church', 'cathedral', 'mosque', 'temple', 'synagogue', 'monastery', 'abbey', 'shrine' ] } # ISIL code pattern (e.g., NL-AsdRM, US-MBMM, BR-RjBN) ISIL_PATTERN = re.compile(r'\b([A-Z]{2}-[A-Za-z0-9]+)\b') # Website URL pattern URL_PATTERN = re.compile(r'https?://[^\s<>"]+') # Country codes (ISO 3166-1 alpha-2) COUNTRY_CODES = { 'NL', 'US', 'BR', 'GB', 'FR', 'DE', 'ES', 'IT', 'PT', 'BE', 'AR', 'MX', 'CL', 'CO', 'PE', 'VE', 'EC', 'BO', 'PY', 'UY', 'JP', 'CN', 'IN', 'KR', 'TH', 'VN', 'ID', 'PH', 'MY', 'SG', 'EG', 'MA', 'DZ', 'TN', 'LY', 'ZA', 'NG', 'KE', 'GH', 'ET', 'AU', 'NZ', 'CA', 'RU', 'TR', 'SA', 'AE', 'QA', 'KW', 'OM', 'PL', 'CZ', 'HU', 'RO', 'BG', 'GR', 'HR', 'RS', 'UA', 'BY' } class ConversationExtractor: """Extract heritage institutions from Claude conversation files.""" def __init__(self, verbose: bool = True): self.verbose = verbose self.stats = defaultdict(int) self.extracted_institutions = [] self.seen_names = set() def log(self, message: str): """Print log message if verbose.""" if self.verbose: print(message) def extract_country_from_filename(self, filename: str) -> str: """Extract country name from conversation filename.""" # Examples: # - Brazilian_GLAM_collection_inventories.json → Brazil # - Mexican_GLAM_inventories_and_catalogues.json → Mexico # - Panamanian_cultural_heritage_resources.json → Panama country_map = { 'brazilian': 'BR', 'brazil': 'BR', 'mexican': 'MX', 'mexico': 'MX', 'panamanian': 'PA', 'panama': 'PA', 'argentine': 'AR', 'argentina': 'AR', 'argentinian': 'AR', 'chilean': 'CL', 'chile': 'CL', 'colombian': 'CO', 'colombia': 'CO', 'canadian': 'CA', 'canada': 'CA', 'american': 'US', 'united states': 'US', 'dutch': 'NL', 'netherlands': 'NL', 'holland': 'NL', 'german': 'DE', 'germany': 'DE', 'french': 'FR', 'france': 'FR', 'spanish': 'ES', 'spain': 'ES', 'italian': 'IT', 'italy': 'IT', 'portuguese': 'PT', 'portugal': 'PT', 'belgian': 'BE', 'belgium': 'BE', 'austrian': 'AT', 'austria': 'AT', 'japanese': 'JP', 'japan': 'JP', 'chinese': 'CN', 'china': 'CN', 'indian': 'IN', 'india': 'IN', 'egyptian': 'EG', 'egypt': 'EG', 'moroccan': 'MA', 'morocco': 'MA', 'algerian': 'DZ', 'algeria': 'DZ', 'tunisian': 'TN', 'tunisia': 'TN', 'libyan': 'LY', 'libya': 'LY', 'south african': 'ZA', 'south africa': 'ZA', 'nigerian': 'NG', 'nigeria': 'NG', 'kenyan': 'KE', 'kenya': 'KE', 'ghanaian': 'GH', 'ghana': 'GH', 'ethiopian': 'ET', 'ethiopia': 'ET', 'pakistani': 'PK', 'pakistan': 'PK', 'afghan': 'AF', 'afghanistan': 'AF', 'iraqi': 'IQ', 'iraq': 'IQ', 'hungarian': 'HU', 'hungary': 'HU', 'polish': 'PL', 'poland': 'PL', 'czech': 'CZ', 'czech republic': 'CZ', 'romanian': 'RO', 'romania': 'RO', 'bulgarian': 'BG', 'bulgaria': 'BG', 'greek': 'GR', 'greece': 'GR', 'croatian': 'HR', 'croatia': 'HR', 'serbian': 'RS', 'serbia': 'RS', 'ukrainian': 'UA', 'ukraine': 'UA', 'belarusian': 'BY', 'belarus': 'BY', 'thai': 'TH', 'thailand': 'TH', 'vietnamese': 'VN', 'vietnam': 'VN', 'indonesian': 'ID', 'indonesia': 'ID', 'filipino': 'PH', 'philippines': 'PH', 'malaysian': 'MY', 'malaysia': 'MY', 'singaporean': 'SG', 'singapore': 'SG', 'australian': 'AU', 'australia': 'AU', 'new zealand': 'NZ', 'russian': 'RU', 'russia': 'RU', 'turkish': 'TR', 'turkey': 'TR', 'saudi': 'SA', 'saudi arabia': 'SA', 'emirati': 'AE', 'uae': 'AE', 'emirates': 'AE', 'qatari': 'QA', 'qatar': 'QA', 'kuwaiti': 'KW', 'kuwait': 'KW', 'omani': 'OM', 'oman': 'OM', 'cuban': 'CU', 'cuba': 'CU', 'madagascan': 'MG', 'madagascar': 'MG', 'togolese': 'TG', 'togo': 'TG', 'zeeland': 'NL', # Dutch province 'limburg': 'NL', # Dutch province } filename_lower = filename.lower() for country_name, code in country_map.items(): if country_name in filename_lower: return code return 'XX' # Unknown country def classify_institution_type(self, text: str) -> str: """Classify institution type based on keywords in text.""" text_lower = text.lower() # Count keyword matches per type scores = defaultdict(int) for inst_type, keywords in INSTITUTION_KEYWORDS.items(): for keyword in keywords: if keyword in text_lower: scores[inst_type] += 1 # Return type with most matches if scores: return max(scores.items(), key=lambda x: x[1])[0] return 'UNKNOWN' def extract_identifiers(self, text: str) -> List[Dict[str, str]]: """Extract identifiers (ISIL, URLs) from text.""" identifiers = [] # Extract ISIL codes for match in ISIL_PATTERN.finditer(text): isil_code = match.group(1) # Verify it's a real ISIL (starts with valid country code) country_prefix = isil_code.split('-')[0] if country_prefix in COUNTRY_CODES: identifiers.append({ 'identifier_scheme': 'ISIL', 'identifier_value': isil_code, 'identifier_url': f'https://isil.org/{isil_code}' }) # Extract website URLs for match in URL_PATTERN.finditer(text): url = match.group(0) # Clean trailing punctuation url = url.rstrip('.,;:)') identifiers.append({ 'identifier_scheme': 'Website', 'identifier_value': url, 'identifier_url': url }) return identifiers def extract_location(self, text: str, country_code: str) -> Dict[str, str]: """Extract location information from text.""" # Simple city extraction (look for "in CITY", "located in CITY", etc.) location_patterns = [ r'(?:in|at|located in|based in)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),\s*(?:[A-Z]{2}|[A-Z][a-z]+)' ] for pattern in location_patterns: match = re.search(pattern, text) if match: city = match.group(1).strip() return { 'city': city, 'country': country_code } return {'country': country_code} def extract_institutions_from_text(self, text: str, country_code: str) -> List[Dict[str, Any]]: """Extract institution mentions from a text block.""" institutions = [] # Split text into sentences sentences = re.split(r'[.!?]\s+', text) for sentence in sentences: # Look for institution mentions (patterns like "The X Museum", "X Library", etc.) # This is a simplified approach - could be enhanced with NER # Pattern 1: "The [Institution Name]" followed by institution keyword pattern1 = r'(?:The|the)\s+([A-Z][^.!?]{5,80}?(?:' + '|'.join( [kw.title() for keywords in INSTITUTION_KEYWORDS.values() for kw in keywords] ) + r'))' for match in re.finditer(pattern1, sentence): name = match.group(1).strip() # Skip if already seen (deduplicate within file) name_normalized = name.lower() if name_normalized in self.seen_names: continue # Extract additional info inst_type = self.classify_institution_type(sentence) identifiers = self.extract_identifiers(sentence) location = self.extract_location(sentence, country_code) # Only add if we have a meaningful name if len(name) > 5 and inst_type != 'UNKNOWN': self.seen_names.add(name_normalized) institutions.append({ 'name': name, 'institution_type': inst_type, 'identifiers': identifiers, 'location': location, 'source_text': sentence[:200] # Keep snippet for verification }) return institutions def process_conversation(self, filepath: Path) -> List[Dict[str, Any]]: """Process a single conversation file and extract institutions.""" self.log(f"📄 Processing: {filepath.name}") try: with open(filepath, 'r', encoding='utf-8') as f: conversation = json.load(f) except Exception as e: self.log(f" ❌ Error reading file: {e}") self.stats['errors'] += 1 return [] # Extract country from filename country_code = self.extract_country_from_filename(filepath.name) # Collect all text from assistant messages full_text = [] for message in conversation.get('chat_messages', []): if message.get('sender') == 'assistant': text = message.get('text', '') if text: full_text.append(text) if not full_text: self.log(f" ⚠️ No assistant messages found") self.stats['empty'] += 1 return [] # Extract institutions from combined text combined_text = '\n\n'.join(full_text) institutions = self.extract_institutions_from_text(combined_text, country_code) # Add provenance metadata for inst in institutions: inst['provenance'] = { 'data_source': 'CONVERSATION_NLP', 'data_tier': 'TIER_4_INFERRED', 'extraction_date': datetime.now(timezone.utc).isoformat(), 'extraction_method': 'Pattern-based NLP extraction from conversation', 'confidence_score': 0.6, # Lower confidence for conversational data 'conversation_id': conversation.get('uuid', 'unknown'), 'conversation_name': conversation.get('name', ''), 'source_file': filepath.name } self.log(f" ✅ Extracted {len(institutions)} institutions") self.stats['processed'] += 1 self.stats['institutions_found'] += len(institutions) return institutions def process_all_conversations(self, conversations_dir: Path, limit: int | None = None, country_filter: str | None = None) -> List[Dict[str, Any]]: """Process all conversation files in a directory.""" conversation_files = sorted(conversations_dir.glob('*.json')) if country_filter: conversation_files = [f for f in conversation_files if country_filter.lower() in f.name.lower()] if limit: conversation_files = conversation_files[:limit] self.log(f"\n🔍 Found {len(conversation_files)} conversation files to process") if country_filter: self.log(f" Filtering by country: {country_filter}") if limit: self.log(f" Limited to first {limit} files") self.log("") all_institutions = [] for filepath in conversation_files: institutions = self.process_conversation(filepath) all_institutions.extend(institutions) # Brief progress update every 10 files if self.stats['processed'] % 10 == 0: self.log(f" Progress: {self.stats['processed']}/{len(conversation_files)} files, " f"{self.stats['institutions_found']} institutions") return all_institutions def convert_to_linkml(self, institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: """Convert extracted institutions to LinkML format.""" linkml_institutions = [] for idx, inst in enumerate(institutions, 1): # Generate ID name_slug = re.sub(r'[^a-z0-9]+', '-', inst['name'].lower())[:50] country = inst['location'].get('country', 'xx') inst_id = f"https://w3id.org/heritage/custodian/{country.lower()}/{name_slug}-conv{idx}" linkml_inst = { 'id': inst_id, 'name': inst['name'], 'institution_type': inst['institution_type'], 'provenance': inst['provenance'] } # Add identifiers if present if inst.get('identifiers'): linkml_inst['identifiers'] = inst['identifiers'] # Add location if present if inst.get('location'): linkml_inst['locations'] = [inst['location']] # Add description (source text snippet) if inst.get('source_text'): linkml_inst['description'] = f"Extracted from conversation: {inst['source_text'][:150]}..." linkml_institutions.append(linkml_inst) return linkml_institutions def main(): parser = argparse.ArgumentParser(description='Extract institutions from conversation files') parser.add_argument('--limit', type=int, help='Limit number of files to process') parser.add_argument('--country', type=str, help='Filter by country name (e.g., "Brazil")') parser.add_argument('--output', type=str, default='data/instances/conversations_extracted.yaml', help='Output YAML file path') parser.add_argument('--quiet', action='store_true', help='Suppress progress messages') args = parser.parse_args() # Setup paths project_root = Path(__file__).parent conversations_dir = project_root / 'docs' / 'reflection' output_path = project_root / args.output # Create output directory if needed output_path.parent.mkdir(parents=True, exist_ok=True) # Extract institutions extractor = ConversationExtractor(verbose=not args.quiet) institutions = extractor.process_all_conversations( conversations_dir, limit=args.limit, country_filter=args.country ) # Convert to LinkML format print(f"\n📝 Converting to LinkML format...") linkml_institutions = extractor.convert_to_linkml(institutions) # Save to YAML print(f"💾 Saving to {output_path}...") with open(output_path, 'w', encoding='utf-8') as f: yaml.dump(linkml_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False) # Print statistics print(f"\n📊 Extraction Summary:") print(f" Files processed: {extractor.stats['processed']}") print(f" Files with errors: {extractor.stats['errors']}") print(f" Empty files: {extractor.stats['empty']}") print(f" Total institutions extracted: {len(linkml_institutions)}") print(f" Unique institution names: {len(extractor.seen_names)}") print(f" Output file: {output_path}") print(f" File size: {output_path.stat().st_size / 1024:.1f} KB") # Country distribution country_dist = defaultdict(int) for inst in linkml_institutions: country = inst.get('locations', [{}])[0].get('country', 'XX') country_dist[country] += 1 print(f"\n🌍 Country Distribution (Top 10):") for country, count in sorted(country_dist.items(), key=lambda x: -x[1])[:10]: print(f" {country}: {count}") print(f"\n✅ Extraction complete!") if __name__ == '__main__': main()