#!/usr/bin/env python3
|
|
"""
|
|
Batch extract heritage institutions from conversation JSON files.
|
|
|
|
This script processes all conversation files in docs/reflection/ and extracts
|
|
heritage institution data (TIER_4) using NLP pattern matching.
|
|
|
|
Usage:
|
|
python extract_conversations_batch.py [--limit N] [--country CODE]
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Any, Set, Tuple
|
|
from collections import defaultdict
|
|
import argparse
|
|
|
|
# Institution type keywords (from AGENTS.md taxonomy).
# Matched as lowercase substrings, so multilingual variants sit alongside the
# English terms. NOTE(review): 'kunsthal' is listed under both MUSEUM and
# GALLERY, so one occurrence scores both types — confirm that is intended.
INSTITUTION_KEYWORDS = {
    'MUSEUM': [
        'museum', 'museo', 'museu', 'musée', 'muzeum', 'muzej',
        'art gallery', 'kunstmuseum', 'kunsthal'
    ],
    'LIBRARY': [
        'library', 'biblioteca', 'bibliothek', 'bibliotheek', 'bibliothèque',
        'biblioteka', 'national library', 'public library', 'university library'
    ],
    'ARCHIVE': [
        'archive', 'archiv', 'archivo', 'arquivo', 'archief',
        'national archive', 'state archive', 'regional archive'
    ],
    'GALLERY': [
        'gallery', 'galerie', 'galería', 'kunsthal', 'art center'
    ],
    'RESEARCH_CENTER': [
        'research center', 'research centre', 'research institute',
        'documentation center', 'knowledge center'
    ],
    'BOTANICAL_ZOO': [
        'botanical garden', 'botanic garden', 'arboretum',
        'zoo', 'zoological garden', 'zoological park'
    ],
    'EDUCATION_PROVIDER': [
        'university', 'universidad', 'universiteit', 'université',
        'college', 'school', 'institute', 'academy'
    ],
    'HOLY_SITES': [
        'church', 'cathedral', 'mosque', 'temple', 'synagogue',
        'monastery', 'abbey', 'shrine'
    ]
}

# ISIL code pattern (e.g., NL-AsdRM, US-MBMM, BR-RjBN): two uppercase letters,
# a hyphen, then an alphanumeric local part. This over-matches strings such as
# "US-based"; callers filter candidates by the country-code prefix.
ISIL_PATTERN = re.compile(r'\b([A-Z]{2}-[A-Za-z0-9]+)\b')

# Website URL pattern: http(s) up to the next whitespace, angle bracket, or
# double quote. Trailing sentence punctuation is stripped by the caller.
URL_PATTERN = re.compile(r'https?://[^\s<>"]+')

# Country codes (ISO 3166-1 alpha-2) accepted as valid ISIL prefixes.
COUNTRY_CODES = {
    'NL', 'US', 'BR', 'GB', 'FR', 'DE', 'ES', 'IT', 'PT', 'BE',
    'AR', 'MX', 'CL', 'CO', 'PE', 'VE', 'EC', 'BO', 'PY', 'UY',
    'JP', 'CN', 'IN', 'KR', 'TH', 'VN', 'ID', 'PH', 'MY', 'SG',
    'EG', 'MA', 'DZ', 'TN', 'LY', 'ZA', 'NG', 'KE', 'GH', 'ET',
    'AU', 'NZ', 'CA', 'RU', 'TR', 'SA', 'AE', 'QA', 'KW', 'OM',
    'PL', 'CZ', 'HU', 'RO', 'BG', 'GR', 'HR', 'RS', 'UA', 'BY'
}

class ConversationExtractor:
    """Extract heritage institutions from Claude conversation files."""

    def __init__(self, verbose: bool = True):
        """Set up counters and dedup state for a fresh extraction run."""
        # Running counters: processed / errors / empty / institutions_found.
        self.stats = defaultdict(int)
        # Accumulated results and the normalized names already emitted.
        self.extracted_institutions = []
        self.seen_names = set()
        self.verbose = verbose

def log(self, message: str):
|
|
"""Print log message if verbose."""
|
|
if self.verbose:
|
|
print(message)
|
|
|
|
def extract_country_from_filename(self, filename: str) -> str:
|
|
"""Extract country name from conversation filename."""
|
|
# Examples:
|
|
# - Brazilian_GLAM_collection_inventories.json → Brazil
|
|
# - Mexican_GLAM_inventories_and_catalogues.json → Mexico
|
|
# - Panamanian_cultural_heritage_resources.json → Panama
|
|
|
|
country_map = {
|
|
'brazilian': 'BR', 'brazil': 'BR',
|
|
'mexican': 'MX', 'mexico': 'MX',
|
|
'panamanian': 'PA', 'panama': 'PA',
|
|
'argentine': 'AR', 'argentina': 'AR', 'argentinian': 'AR',
|
|
'chilean': 'CL', 'chile': 'CL',
|
|
'colombian': 'CO', 'colombia': 'CO',
|
|
'canadian': 'CA', 'canada': 'CA',
|
|
'american': 'US', 'united states': 'US',
|
|
'dutch': 'NL', 'netherlands': 'NL', 'holland': 'NL',
|
|
'german': 'DE', 'germany': 'DE',
|
|
'french': 'FR', 'france': 'FR',
|
|
'spanish': 'ES', 'spain': 'ES',
|
|
'italian': 'IT', 'italy': 'IT',
|
|
'portuguese': 'PT', 'portugal': 'PT',
|
|
'belgian': 'BE', 'belgium': 'BE',
|
|
'austrian': 'AT', 'austria': 'AT',
|
|
'japanese': 'JP', 'japan': 'JP',
|
|
'chinese': 'CN', 'china': 'CN',
|
|
'indian': 'IN', 'india': 'IN',
|
|
'egyptian': 'EG', 'egypt': 'EG',
|
|
'moroccan': 'MA', 'morocco': 'MA',
|
|
'algerian': 'DZ', 'algeria': 'DZ',
|
|
'tunisian': 'TN', 'tunisia': 'TN',
|
|
'libyan': 'LY', 'libya': 'LY',
|
|
'south african': 'ZA', 'south africa': 'ZA',
|
|
'nigerian': 'NG', 'nigeria': 'NG',
|
|
'kenyan': 'KE', 'kenya': 'KE',
|
|
'ghanaian': 'GH', 'ghana': 'GH',
|
|
'ethiopian': 'ET', 'ethiopia': 'ET',
|
|
'pakistani': 'PK', 'pakistan': 'PK',
|
|
'afghan': 'AF', 'afghanistan': 'AF',
|
|
'iraqi': 'IQ', 'iraq': 'IQ',
|
|
'hungarian': 'HU', 'hungary': 'HU',
|
|
'polish': 'PL', 'poland': 'PL',
|
|
'czech': 'CZ', 'czech republic': 'CZ',
|
|
'romanian': 'RO', 'romania': 'RO',
|
|
'bulgarian': 'BG', 'bulgaria': 'BG',
|
|
'greek': 'GR', 'greece': 'GR',
|
|
'croatian': 'HR', 'croatia': 'HR',
|
|
'serbian': 'RS', 'serbia': 'RS',
|
|
'ukrainian': 'UA', 'ukraine': 'UA',
|
|
'belarusian': 'BY', 'belarus': 'BY',
|
|
'thai': 'TH', 'thailand': 'TH',
|
|
'vietnamese': 'VN', 'vietnam': 'VN',
|
|
'indonesian': 'ID', 'indonesia': 'ID',
|
|
'filipino': 'PH', 'philippines': 'PH',
|
|
'malaysian': 'MY', 'malaysia': 'MY',
|
|
'singaporean': 'SG', 'singapore': 'SG',
|
|
'australian': 'AU', 'australia': 'AU',
|
|
'new zealand': 'NZ',
|
|
'russian': 'RU', 'russia': 'RU',
|
|
'turkish': 'TR', 'turkey': 'TR',
|
|
'saudi': 'SA', 'saudi arabia': 'SA',
|
|
'emirati': 'AE', 'uae': 'AE', 'emirates': 'AE',
|
|
'qatari': 'QA', 'qatar': 'QA',
|
|
'kuwaiti': 'KW', 'kuwait': 'KW',
|
|
'omani': 'OM', 'oman': 'OM',
|
|
'cuban': 'CU', 'cuba': 'CU',
|
|
'madagascan': 'MG', 'madagascar': 'MG',
|
|
'togolese': 'TG', 'togo': 'TG',
|
|
'zeeland': 'NL', # Dutch province
|
|
'limburg': 'NL', # Dutch province
|
|
}
|
|
|
|
filename_lower = filename.lower()
|
|
for country_name, code in country_map.items():
|
|
if country_name in filename_lower:
|
|
return code
|
|
|
|
return 'XX' # Unknown country
|
|
|
|
def classify_institution_type(self, text: str) -> str:
|
|
"""Classify institution type based on keywords in text."""
|
|
text_lower = text.lower()
|
|
|
|
# Count keyword matches per type
|
|
scores = defaultdict(int)
|
|
for inst_type, keywords in INSTITUTION_KEYWORDS.items():
|
|
for keyword in keywords:
|
|
if keyword in text_lower:
|
|
scores[inst_type] += 1
|
|
|
|
# Return type with most matches
|
|
if scores:
|
|
return max(scores.items(), key=lambda x: x[1])[0]
|
|
|
|
return 'UNKNOWN'
|
|
|
|
def extract_identifiers(self, text: str) -> List[Dict[str, str]]:
|
|
"""Extract identifiers (ISIL, URLs) from text."""
|
|
identifiers = []
|
|
|
|
# Extract ISIL codes
|
|
for match in ISIL_PATTERN.finditer(text):
|
|
isil_code = match.group(1)
|
|
# Verify it's a real ISIL (starts with valid country code)
|
|
country_prefix = isil_code.split('-')[0]
|
|
if country_prefix in COUNTRY_CODES:
|
|
identifiers.append({
|
|
'identifier_scheme': 'ISIL',
|
|
'identifier_value': isil_code,
|
|
'identifier_url': f'https://isil.org/{isil_code}'
|
|
})
|
|
|
|
# Extract website URLs
|
|
for match in URL_PATTERN.finditer(text):
|
|
url = match.group(0)
|
|
# Clean trailing punctuation
|
|
url = url.rstrip('.,;:)')
|
|
identifiers.append({
|
|
'identifier_scheme': 'Website',
|
|
'identifier_value': url,
|
|
'identifier_url': url
|
|
})
|
|
|
|
return identifiers
|
|
|
|
def extract_location(self, text: str, country_code: str) -> Dict[str, str]:
|
|
"""Extract location information from text."""
|
|
# Simple city extraction (look for "in CITY", "located in CITY", etc.)
|
|
location_patterns = [
|
|
r'(?:in|at|located in|based in)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)',
|
|
r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),\s*(?:[A-Z]{2}|[A-Z][a-z]+)'
|
|
]
|
|
|
|
for pattern in location_patterns:
|
|
match = re.search(pattern, text)
|
|
if match:
|
|
city = match.group(1).strip()
|
|
return {
|
|
'city': city,
|
|
'country': country_code
|
|
}
|
|
|
|
return {'country': country_code}
|
|
|
|
def extract_institutions_from_text(self, text: str, country_code: str) -> List[Dict[str, Any]]:
|
|
"""Extract institution mentions from a text block."""
|
|
institutions = []
|
|
|
|
# Split text into sentences
|
|
sentences = re.split(r'[.!?]\s+', text)
|
|
|
|
for sentence in sentences:
|
|
# Look for institution mentions (patterns like "The X Museum", "X Library", etc.)
|
|
# This is a simplified approach - could be enhanced with NER
|
|
|
|
# Pattern 1: "The [Institution Name]" followed by institution keyword
|
|
pattern1 = r'(?:The|the)\s+([A-Z][^.!?]{5,80}?(?:' + '|'.join(
|
|
[kw.title() for keywords in INSTITUTION_KEYWORDS.values() for kw in keywords]
|
|
) + r'))'
|
|
|
|
for match in re.finditer(pattern1, sentence):
|
|
name = match.group(1).strip()
|
|
|
|
# Skip if already seen (deduplicate within file)
|
|
name_normalized = name.lower()
|
|
if name_normalized in self.seen_names:
|
|
continue
|
|
|
|
# Extract additional info
|
|
inst_type = self.classify_institution_type(sentence)
|
|
identifiers = self.extract_identifiers(sentence)
|
|
location = self.extract_location(sentence, country_code)
|
|
|
|
# Only add if we have a meaningful name
|
|
if len(name) > 5 and inst_type != 'UNKNOWN':
|
|
self.seen_names.add(name_normalized)
|
|
institutions.append({
|
|
'name': name,
|
|
'institution_type': inst_type,
|
|
'identifiers': identifiers,
|
|
'location': location,
|
|
'source_text': sentence[:200] # Keep snippet for verification
|
|
})
|
|
|
|
return institutions
|
|
|
|
def process_conversation(self, filepath: Path) -> List[Dict[str, Any]]:
|
|
"""Process a single conversation file and extract institutions."""
|
|
self.log(f"📄 Processing: {filepath.name}")
|
|
|
|
try:
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
conversation = json.load(f)
|
|
except Exception as e:
|
|
self.log(f" ❌ Error reading file: {e}")
|
|
self.stats['errors'] += 1
|
|
return []
|
|
|
|
# Extract country from filename
|
|
country_code = self.extract_country_from_filename(filepath.name)
|
|
|
|
# Collect all text from assistant messages
|
|
full_text = []
|
|
for message in conversation.get('chat_messages', []):
|
|
if message.get('sender') == 'assistant':
|
|
text = message.get('text', '')
|
|
if text:
|
|
full_text.append(text)
|
|
|
|
if not full_text:
|
|
self.log(f" ⚠️ No assistant messages found")
|
|
self.stats['empty'] += 1
|
|
return []
|
|
|
|
# Extract institutions from combined text
|
|
combined_text = '\n\n'.join(full_text)
|
|
institutions = self.extract_institutions_from_text(combined_text, country_code)
|
|
|
|
# Add provenance metadata
|
|
for inst in institutions:
|
|
inst['provenance'] = {
|
|
'data_source': 'CONVERSATION_NLP',
|
|
'data_tier': 'TIER_4_INFERRED',
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'extraction_method': 'Pattern-based NLP extraction from conversation',
|
|
'confidence_score': 0.6, # Lower confidence for conversational data
|
|
'conversation_id': conversation.get('uuid', 'unknown'),
|
|
'conversation_name': conversation.get('name', ''),
|
|
'source_file': filepath.name
|
|
}
|
|
|
|
self.log(f" ✅ Extracted {len(institutions)} institutions")
|
|
self.stats['processed'] += 1
|
|
self.stats['institutions_found'] += len(institutions)
|
|
|
|
return institutions
|
|
|
|
def process_all_conversations(self, conversations_dir: Path, limit: int | None = None,
|
|
country_filter: str | None = None) -> List[Dict[str, Any]]:
|
|
"""Process all conversation files in a directory."""
|
|
conversation_files = sorted(conversations_dir.glob('*.json'))
|
|
|
|
if country_filter:
|
|
conversation_files = [f for f in conversation_files
|
|
if country_filter.lower() in f.name.lower()]
|
|
|
|
if limit:
|
|
conversation_files = conversation_files[:limit]
|
|
|
|
self.log(f"\n🔍 Found {len(conversation_files)} conversation files to process")
|
|
if country_filter:
|
|
self.log(f" Filtering by country: {country_filter}")
|
|
if limit:
|
|
self.log(f" Limited to first {limit} files")
|
|
self.log("")
|
|
|
|
all_institutions = []
|
|
|
|
for filepath in conversation_files:
|
|
institutions = self.process_conversation(filepath)
|
|
all_institutions.extend(institutions)
|
|
|
|
# Brief progress update every 10 files
|
|
if self.stats['processed'] % 10 == 0:
|
|
self.log(f" Progress: {self.stats['processed']}/{len(conversation_files)} files, "
|
|
f"{self.stats['institutions_found']} institutions")
|
|
|
|
return all_institutions
|
|
|
|
def convert_to_linkml(self, institutions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
|
|
"""Convert extracted institutions to LinkML format."""
|
|
linkml_institutions = []
|
|
|
|
for idx, inst in enumerate(institutions, 1):
|
|
# Generate ID
|
|
name_slug = re.sub(r'[^a-z0-9]+', '-', inst['name'].lower())[:50]
|
|
country = inst['location'].get('country', 'xx')
|
|
inst_id = f"https://w3id.org/heritage/custodian/{country.lower()}/{name_slug}-conv{idx}"
|
|
|
|
linkml_inst = {
|
|
'id': inst_id,
|
|
'name': inst['name'],
|
|
'institution_type': inst['institution_type'],
|
|
'provenance': inst['provenance']
|
|
}
|
|
|
|
# Add identifiers if present
|
|
if inst.get('identifiers'):
|
|
linkml_inst['identifiers'] = inst['identifiers']
|
|
|
|
# Add location if present
|
|
if inst.get('location'):
|
|
linkml_inst['locations'] = [inst['location']]
|
|
|
|
# Add description (source text snippet)
|
|
if inst.get('source_text'):
|
|
linkml_inst['description'] = f"Extracted from conversation: {inst['source_text'][:150]}..."
|
|
|
|
linkml_institutions.append(linkml_inst)
|
|
|
|
return linkml_institutions
|
|
|
|
|
|
def main():
    """Command-line entry point: extract institutions and write them as YAML."""
    arg_parser = argparse.ArgumentParser(description='Extract institutions from conversation files')
    arg_parser.add_argument('--limit', type=int, help='Limit number of files to process')
    arg_parser.add_argument('--country', type=str, help='Filter by country name (e.g., "Brazil")')
    arg_parser.add_argument('--output', type=str, default='data/instances/conversations_extracted.yaml',
                            help='Output YAML file path')
    arg_parser.add_argument('--quiet', action='store_true', help='Suppress progress messages')
    args = arg_parser.parse_args()

    # Paths are resolved relative to the directory holding this script.
    project_root = Path(__file__).parent
    conversations_dir = project_root / 'docs' / 'reflection'
    output_path = project_root / args.output
    output_path.parent.mkdir(parents=True, exist_ok=True)  # ensure target dir exists

    # Run the NLP extraction pass over the conversation dumps.
    extractor = ConversationExtractor(verbose=not args.quiet)
    institutions = extractor.process_all_conversations(
        conversations_dir,
        limit=args.limit,
        country_filter=args.country
    )

    # Reshape into LinkML instance records.
    print(f"\n📝 Converting to LinkML format...")
    linkml_institutions = extractor.convert_to_linkml(institutions)

    # Persist as YAML (unicode preserved, insertion order kept).
    print(f"💾 Saving to {output_path}...")
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(linkml_institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False)

    # Summary statistics for the whole run.
    print(f"\n📊 Extraction Summary:")
    print(f" Files processed: {extractor.stats['processed']}")
    print(f" Files with errors: {extractor.stats['errors']}")
    print(f" Empty files: {extractor.stats['empty']}")
    print(f" Total institutions extracted: {len(linkml_institutions)}")
    print(f" Unique institution names: {len(extractor.seen_names)}")
    print(f" Output file: {output_path}")
    print(f" File size: {output_path.stat().st_size / 1024:.1f} KB")

    # Per-country counts of the extracted records.
    country_dist = defaultdict(int)
    for record in linkml_institutions:
        country_dist[record.get('locations', [{}])[0].get('country', 'XX')] += 1

    print(f"\n🌍 Country Distribution (Top 10):")
    for country, count in sorted(country_dist.items(), key=lambda x: -x[1])[:10]:
        print(f" {country}: {count}")

    print(f"\n✅ Extraction complete!")


# Run the CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|