# glam/extract_conversations_batch.py
# Snapshot: 2025-11-19 23:25:22 +01:00 — 449 lines, 18 KiB, Python
#!/usr/bin/env python3
"""
Batch extract heritage institutions from conversation JSON files.
This script processes all conversation files in docs/reflection/ and extracts
heritage institution data (TIER_4) using NLP pattern matching.
Usage:
python extract_conversations_batch.py [--limit N] [--country CODE]
"""
import json
import re
import yaml
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Set, Tuple
from collections import defaultdict
import argparse
# Institution type keywords (from AGENTS.md taxonomy).
# Maps each institution-type code to lowercase keyword variants in several
# languages; matched as plain substrings against lowercased text, and (via
# .title()) used to anchor the "The <Name> <Keyword>" mention regex.
INSTITUTION_KEYWORDS = {
    'MUSEUM': [
        'museum', 'museo', 'museu', 'musée', 'muzeum', 'muzej',
        'art gallery', 'kunstmuseum', 'kunsthal'
    ],
    'LIBRARY': [
        'library', 'biblioteca', 'bibliothek', 'bibliotheek', 'bibliothèque',
        'biblioteka', 'national library', 'public library', 'university library'
    ],
    'ARCHIVE': [
        'archive', 'archiv', 'archivo', 'arquivo', 'archief',
        'national archive', 'state archive', 'regional archive'
    ],
    # NOTE: 'kunsthal' appears under both MUSEUM and GALLERY, so it scores
    # one point for each type during classification.
    'GALLERY': [
        'gallery', 'galerie', 'galería', 'kunsthal', 'art center'
    ],
    'RESEARCH_CENTER': [
        'research center', 'research centre', 'research institute',
        'documentation center', 'knowledge center'
    ],
    'BOTANICAL_ZOO': [
        'botanical garden', 'botanic garden', 'arboretum',
        'zoo', 'zoological garden', 'zoological park'
    ],
    'EDUCATION_PROVIDER': [
        'university', 'universidad', 'universiteit', 'université',
        'college', 'school', 'institute', 'academy'
    ],
    'HOLY_SITES': [
        'church', 'cathedral', 'mosque', 'temple', 'synagogue',
        'monastery', 'abbey', 'shrine'
    ]
}
# ISIL code pattern (e.g., NL-AsdRM, US-MBMM, BR-RjBN): two uppercase letters,
# a hyphen, then an alphanumeric suffix.
# NOTE(review): real ISIL suffixes may also contain '/' and '-' — TODO confirm
# whether this narrower pattern is intentional.
ISIL_PATTERN = re.compile(r'\b([A-Z]{2}-[A-Za-z0-9]+)\b')
# Website URL pattern: http(s) scheme up to the next whitespace, '<', '>' or '"'.
URL_PATTERN = re.compile(r'https?://[^\s<>"]+')
# Country codes (ISO 3166-1 alpha-2). Used to validate the two-letter prefix
# of candidate ISIL codes; a prefix outside this set is discarded.
COUNTRY_CODES = {
    'NL', 'US', 'BR', 'GB', 'FR', 'DE', 'ES', 'IT', 'PT', 'BE',
    'AR', 'MX', 'CL', 'CO', 'PE', 'VE', 'EC', 'BO', 'PY', 'UY',
    'JP', 'CN', 'IN', 'KR', 'TH', 'VN', 'ID', 'PH', 'MY', 'SG',
    'EG', 'MA', 'DZ', 'TN', 'LY', 'ZA', 'NG', 'KE', 'GH', 'ET',
    'AU', 'NZ', 'CA', 'RU', 'TR', 'SA', 'AE', 'QA', 'KW', 'OM',
    'PL', 'CZ', 'HU', 'RO', 'BG', 'GR', 'HR', 'RS', 'UA', 'BY'
}
class ConversationExtractor:
"""Extract heritage institutions from Claude conversation files."""
def __init__(self, verbose: bool = True):
self.verbose = verbose
self.stats = defaultdict(int)
self.extracted_institutions = []
self.seen_names = set()
def log(self, message: str):
"""Print log message if verbose."""
if self.verbose:
print(message)
def extract_country_from_filename(self, filename: str) -> str:
"""Extract country name from conversation filename."""
# Examples:
# - Brazilian_GLAM_collection_inventories.json → Brazil
# - Mexican_GLAM_inventories_and_catalogues.json → Mexico
# - Panamanian_cultural_heritage_resources.json → Panama
country_map = {
'brazilian': 'BR', 'brazil': 'BR',
'mexican': 'MX', 'mexico': 'MX',
'panamanian': 'PA', 'panama': 'PA',
'argentine': 'AR', 'argentina': 'AR', 'argentinian': 'AR',
'chilean': 'CL', 'chile': 'CL',
'colombian': 'CO', 'colombia': 'CO',
'canadian': 'CA', 'canada': 'CA',
'american': 'US', 'united states': 'US',
'dutch': 'NL', 'netherlands': 'NL', 'holland': 'NL',
'german': 'DE', 'germany': 'DE',
'french': 'FR', 'france': 'FR',
'spanish': 'ES', 'spain': 'ES',
'italian': 'IT', 'italy': 'IT',
'portuguese': 'PT', 'portugal': 'PT',
'belgian': 'BE', 'belgium': 'BE',
'austrian': 'AT', 'austria': 'AT',
'japanese': 'JP', 'japan': 'JP',
'chinese': 'CN', 'china': 'CN',
'indian': 'IN', 'india': 'IN',
'egyptian': 'EG', 'egypt': 'EG',
'moroccan': 'MA', 'morocco': 'MA',
'algerian': 'DZ', 'algeria': 'DZ',
'tunisian': 'TN', 'tunisia': 'TN',
'libyan': 'LY', 'libya': 'LY',
'south african': 'ZA', 'south africa': 'ZA',
'nigerian': 'NG', 'nigeria': 'NG',
'kenyan': 'KE', 'kenya': 'KE',
'ghanaian': 'GH', 'ghana': 'GH',
'ethiopian': 'ET', 'ethiopia': 'ET',
'pakistani': 'PK', 'pakistan': 'PK',
'afghan': 'AF', 'afghanistan': 'AF',
'iraqi': 'IQ', 'iraq': 'IQ',
'hungarian': 'HU', 'hungary': 'HU',
'polish': 'PL', 'poland': 'PL',
'czech': 'CZ', 'czech republic': 'CZ',
'romanian': 'RO', 'romania': 'RO',
'bulgarian': 'BG', 'bulgaria': 'BG',
'greek': 'GR', 'greece': 'GR',
'croatian': 'HR', 'croatia': 'HR',
'serbian': 'RS', 'serbia': 'RS',
'ukrainian': 'UA', 'ukraine': 'UA',
'belarusian': 'BY', 'belarus': 'BY',
'thai': 'TH', 'thailand': 'TH',
'vietnamese': 'VN', 'vietnam': 'VN',
'indonesian': 'ID', 'indonesia': 'ID',
'filipino': 'PH', 'philippines': 'PH',
'malaysian': 'MY', 'malaysia': 'MY',
'singaporean': 'SG', 'singapore': 'SG',
'australian': 'AU', 'australia': 'AU',
'new zealand': 'NZ',
'russian': 'RU', 'russia': 'RU',
'turkish': 'TR', 'turkey': 'TR',
'saudi': 'SA', 'saudi arabia': 'SA',
'emirati': 'AE', 'uae': 'AE', 'emirates': 'AE',
'qatari': 'QA', 'qatar': 'QA',
'kuwaiti': 'KW', 'kuwait': 'KW',
'omani': 'OM', 'oman': 'OM',
'cuban': 'CU', 'cuba': 'CU',
'madagascan': 'MG', 'madagascar': 'MG',
'togolese': 'TG', 'togo': 'TG',
'zeeland': 'NL', # Dutch province
'limburg': 'NL', # Dutch province
}
filename_lower = filename.lower()
for country_name, code in country_map.items():
if country_name in filename_lower:
return code
return 'XX' # Unknown country
def classify_institution_type(self, text: str) -> str:
"""Classify institution type based on keywords in text."""
text_lower = text.lower()
# Count keyword matches per type
scores = defaultdict(int)
for inst_type, keywords in INSTITUTION_KEYWORDS.items():
for keyword in keywords:
if keyword in text_lower:
scores[inst_type] += 1
# Return type with most matches
if scores:
return max(scores.items(), key=lambda x: x[1])[0]
return 'UNKNOWN'
def extract_identifiers(self, text: str) -> List[Dict[str, str]]:
"""Extract identifiers (ISIL, URLs) from text."""
identifiers = []
# Extract ISIL codes
for match in ISIL_PATTERN.finditer(text):
isil_code = match.group(1)
# Verify it's a real ISIL (starts with valid country code)
country_prefix = isil_code.split('-')[0]
if country_prefix in COUNTRY_CODES:
identifiers.append({
'identifier_scheme': 'ISIL',
'identifier_value': isil_code,
'identifier_url': f'https://isil.org/{isil_code}'
})
# Extract website URLs
for match in URL_PATTERN.finditer(text):
url = match.group(0)
# Clean trailing punctuation
url = url.rstrip('.,;:)')
identifiers.append({
'identifier_scheme': 'Website',
'identifier_value': url,
'identifier_url': url
})
return identifiers
def extract_location(self, text: str, country_code: str) -> Dict[str, str]:
"""Extract location information from text."""
# Simple city extraction (look for "in CITY", "located in CITY", etc.)
location_patterns = [
r'(?:in|at|located in|based in)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),\s*(?:[A-Z]{2}|[A-Z][a-z]+)'
]
for pattern in location_patterns:
match = re.search(pattern, text)
if match:
city = match.group(1).strip()
return {
'city': city,
'country': country_code
}
return {'country': country_code}
def extract_institutions_from_text(self, text: str, country_code: str) -> List[Dict[str, Any]]:
"""Extract institution mentions from a text block."""
institutions = []
# Split text into sentences
sentences = re.split(r'[.!?]\s+', text)
for sentence in sentences:
# Look for institution mentions (patterns like "The X Museum", "X Library", etc.)
# This is a simplified approach - could be enhanced with NER
# Pattern 1: "The [Institution Name]" followed by institution keyword
pattern1 = r'(?:The|the)\s+([A-Z][^.!?]{5,80}?(?:' + '|'.join(
[kw.title() for keywords in INSTITUTION_KEYWORDS.values() for kw in keywords]
) + r'))'
for match in re.finditer(pattern1, sentence):
name = match.group(1).strip()
# Skip if already seen (deduplicate within file)
name_normalized = name.lower()
if name_normalized in self.seen_names:
continue
# Extract additional info
inst_type = self.classify_institution_type(sentence)
identifiers = self.extract_identifiers(sentence)
location = self.extract_location(sentence, country_code)
# Only add if we have a meaningful name
if len(name) > 5 and inst_type != 'UNKNOWN':
self.seen_names.add(name_normalized)
institutions.append({
'name': name,
'institution_type': inst_type,
'identifiers': identifiers,
'location': location,
'source_text': sentence[:200] # Keep snippet for verification
})
return institutions
def process_conversation(self, filepath: Path) -> List[Dict[str, Any]]:
"""Process a single conversation file and extract institutions."""
self.log(f"📄 Processing: {filepath.name}")
try:
with open(filepath, 'r', encoding='utf-8') as f:
conversation = json.load(f)
except Exception as e:
self.log(f" ❌ Error reading file: {e}")
self.stats['errors'] += 1
return []
# Extract country from filename
country_code = self.extract_country_from_filename(filepath.name)
# Collect all text from assistant messages
full_text = []
for message in conversation.get('chat_messages', []):
if message.get('sender') == 'assistant':
text = message.get('text', '')
if text:
full_text.append(text)
if not full_text:
self.log(f" ⚠️ No assistant messages found")
self.stats['empty'] += 1
return []
# Extract institutions from combined text
combined_text = '\n\n'.join(full_text)
institutions = self.extract_institutions_from_text(combined_text, country_code)
# Add provenance metadata
for inst in institutions:
inst['provenance'] = {
'data_source': 'CONVERSATION_NLP',
'data_tier': 'TIER_4_INFERRED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Pattern-based NLP extraction from conversation',
'confidence_score': 0.6, # Lower confidence for conversational data
'conversation_id': conversation.get('uuid', 'unknown'),
'conversation_name': conversation.get('name', ''),
'source_file': filepath.name
}
self.log(f" ✅ Extracted {len(institutions)} institutions")
self.stats['processed'] += 1
self.stats['institutions_found'] += len(institutions)
return institutions
def process_all_conversations(self, conversations_dir: Path, limit: int | None = None,
country_filter: str | None = None) -> List[Dict[str, Any]]:
"""Process all conversation files in a directory."""
conversation_files = sorted(conversations_dir.glob('*.json'))
if country_filter:
conversation_files = [f for f in conversation_files
if country_filter.lower() in f.name.lower()]
if limit:
conversation_files = conversation_files[:limit]
self.log(f"\n🔍 Found {len(conversation_files)} conversation files to process")
if country_filter:
self.log(f" Filtering by country: {country_filter}")
if limit:
self.log(f" Limited to first {limit} files")
self.log("")
all_institutions = []
for filepath in conversation_files:
institutions = self.process_conversation(filepath)
all_institutions.extend(institutions)
# Brief progress update every 10 files
if self.stats['processed'] % 10 == 0:
self.log(f" Progress: {self.stats['processed']}/{len(conversation_files)} files, "
f"{self.stats['institutions_found']} institutions")
return all_institutions
def convert_to_linkml(self, institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
"""Convert extracted institutions to LinkML format."""
linkml_institutions = []
for idx, inst in enumerate(institutions, 1):
# Generate ID
name_slug = re.sub(r'[^a-z0-9]+', '-', inst['name'].lower())[:50]
country = inst['location'].get('country', 'xx')
inst_id = f"https://w3id.org/heritage/custodian/{country.lower()}/{name_slug}-conv{idx}"
linkml_inst = {
'id': inst_id,
'name': inst['name'],
'institution_type': inst['institution_type'],
'provenance': inst['provenance']
}
# Add identifiers if present
if inst.get('identifiers'):
linkml_inst['identifiers'] = inst['identifiers']
# Add location if present
if inst.get('location'):
linkml_inst['locations'] = [inst['location']]
# Add description (source text snippet)
if inst.get('source_text'):
linkml_inst['description'] = f"Extracted from conversation: {inst['source_text'][:150]}..."
linkml_institutions.append(linkml_inst)
return linkml_institutions
def main():
    """CLI entry point: extract institutions from conversations to a YAML file."""
    parser = argparse.ArgumentParser(description='Extract institutions from conversation files')
    parser.add_argument('--limit', type=int, help='Limit number of files to process')
    parser.add_argument('--country', type=str, help='Filter by country name (e.g., "Brazil")')
    parser.add_argument('--output', type=str, default='data/instances/conversations_extracted.yaml',
                        help='Output YAML file path')
    parser.add_argument('--quiet', action='store_true', help='Suppress progress messages')
    args = parser.parse_args()

    # All paths are resolved relative to this script's directory.
    project_root = Path(__file__).parent
    conversations_dir = project_root / 'docs' / 'reflection'
    output_path = project_root / args.output

    # Fail fast instead of silently writing an empty YAML file when the
    # conversations directory does not exist.
    if not conversations_dir.is_dir():
        parser.error(f'Conversations directory not found: {conversations_dir}')

    output_path.parent.mkdir(parents=True, exist_ok=True)

    extractor = ConversationExtractor(verbose=not args.quiet)
    institutions = extractor.process_all_conversations(
        conversations_dir,
        limit=args.limit,
        country_filter=args.country
    )

    print("\n📝 Converting to LinkML format...")
    linkml_institutions = extractor.convert_to_linkml(institutions)

    print(f"💾 Saving to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)

    # Summary statistics.
    print("\n📊 Extraction Summary:")
    print(f" Files processed: {extractor.stats['processed']}")
    print(f" Files with errors: {extractor.stats['errors']}")
    print(f" Empty files: {extractor.stats['empty']}")
    print(f" Total institutions extracted: {len(linkml_institutions)}")
    print(f" Unique institution names: {len(extractor.seen_names)}")
    print(f" Output file: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024:.1f} KB")

    # Per-country distribution of the extracted records.
    country_dist = defaultdict(int)
    for inst in linkml_institutions:
        country = inst.get('locations', [{}])[0].get('country', 'XX')
        country_dist[country] += 1
    print("\n🌍 Country Distribution (Top 10):")
    for country, count in sorted(country_dist.items(), key=lambda x: -x[1])[:10]:
        print(f" {country}: {count}")

    print("\n✅ Extraction complete!")


if __name__ == '__main__':
    main()