#!/usr/bin/env python3
"""
Process Chilean GLAM institutions from conversation file.

Extracts institutions from comprehensive provincial directory format.
"""

import json
import re
import sys
import unicodedata
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

# Reuse non-institution filter from Mexican script
NON_INSTITUTION_TERMS = {
    "dublin core", "mods", "mets", "vra core", "object id",
    "marc21", "marc 21", "handle system", "orcid", "doi",
    "master formats", "documents", "audio/video", "storage",
    "climate control", "digital objects", "photography",
    "tiff", "jpeg", "pdf/a", "api", "oai-pmh", "rest", "iiif",
    "current status", "museums", "archives", "libraries",
    "archaeological sites", "access", "platform", "network",
    "system", "standard", "format", "preservation", "digitization",
    "metadata",
}

# Chilean-specific filter terms (in addition to NON_INSTITUTION_TERMS)
_CHILEAN_SKIP_TERMS = frozenset({
    'province', 'region', 'unesco', 'serpat', 'dibam',
    'national monuments', 'cultural heritage', 'digital preservation',
    'atacama province', 'copiapó province', 'huasco province',
    'chañaral province', 'antofagasta region', 'metropolitan region',
    'maule region', 'valparaíso region', 'biobío region',
    'los lagos region', 'araucanía region', 'tarapacá region',
    'arica and parinacota region', 'aysén region', 'magallanes region',
    'coquimbo region', 'atacama region', "o'higgins region",
    'los ríos region', 'ñuble region',
})

# Institution keywords (Spanish and English); a candidate must contain one.
_INSTITUTION_KEYWORDS = (
    'museo', 'museum', 'biblioteca', 'library', 'archivo', 'archiv',
    'universidad', 'university', 'instituto', 'institute',
    'fundación', 'foundation', 'centro', 'center',
    'galería', 'gallery', 'servicio', 'consejo',
    'dirección', 'departamento', 'academia', 'sociedad',
)

# Ordered classification rules: the first rule with a matching substring wins.
# GALLERY is last so it only affects names that would otherwise stay MIXED.
_TYPE_RULES = (
    ('MUSEUM', ('museo', 'museum')),
    ('ARCHIVE', ('archiv',)),  # prefix also covers "archivo" / "archive"
    ('LIBRARY', ('biblioteca', 'library')),
    ('EDUCATION_PROVIDER', ('universidad', 'university')),
    ('OFFICIAL_INSTITUTION', ('servicio nacional', 'consejo', 'dirección')),
    ('RESEARCH_CENTER', ('fundación', 'foundation')),
    ('GALLERY', ('galería', 'gallery')),
)

# Bold markdown spans: **something**
_BOLD_PATTERN = re.compile(r'\*\*([^*]+?)\*\*')
# Whole-word match so that "Museo Regional ..." is NOT mistaken for a
# "... Region" section header (a plain substring test would drop it).
_HEADER_PATTERN = re.compile(r'\b(?:province|region)\b')
_PROVINCE_WORD = re.compile(r'\bprovince\b')
_PROVINCE_SUFFIX = re.compile(r'\s+province\b', re.IGNORECASE)
_URL_PATTERN = re.compile(r'https?://[^\s\)\],]+')
_EMAIL_PATTERN = re.compile(
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
)

# Number of characters, starting at a mention, scanned for URLs/emails.
_CONTEXT_WINDOW = 500

# Default locations used when main() is called without arguments.
DEFAULT_INPUT_PATH = "/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-43-14-edc75d66-ee42-4199-8e22-65b0d2347922-Chilean_GLAM_inventories_research.json"
DEFAULT_YAML_PATH = '/Users/kempersc/apps/glam/data/instances/chilean_institutions.yaml'
DEFAULT_JSON_PATH = '/tmp/chilean_institutions_final.json'


def is_real_institution(name: str) -> bool:
    """Check if extracted name is a real institution.

    Returns False when the name is a known non-institution term, consists
    only of such terms, or is shorter than five characters.
    """
    name_lower = name.lower().strip()
    if name_lower in NON_INSTITUTION_TERMS:
        return False
    # Every individual word is a generic term (also true for an empty name).
    if set(name_lower.split()) <= NON_INSTITUTION_TERMS:
        return False
    return len(name_lower) >= 5


def _classify_institution(name_lower: str) -> str:
    """Return the institution type for a lower-cased name, or 'MIXED'."""
    for inst_type, needles in _TYPE_RULES:
        if any(needle in name_lower for needle in needles):
            return inst_type
    return 'MIXED'


def _extract_contacts(context: str):
    """Return (urls, emails) found in *context*, de-duplicated in order."""
    urls = [url.rstrip(',.;:') for url in _URL_PATTERN.findall(context)]
    emails = _EMAIL_PATTERN.findall(context)
    return list(dict.fromkeys(urls)), list(dict.fromkeys(emails))


def extract_institutions_from_report(report_text: str) -> List[Dict]:
    """Extract Chilean institutions from the provincial directory report.

    Chilean format uses inline bold text within paragraphs:
    Example: "**Copiapó Province** anticipates... the new
    **Museo Regional de Atacama**..."

    Args:
        report_text: Markdown text of the report.

    Returns:
        One dict per accepted bold mention, in document order, with keys
        ``name``, ``urls``, ``emails``, ``province`` and
        ``institution_type``. Repeated mentions are NOT merged here.
    """
    institutions = []
    # Most recent "... Province" header seen; attached to later institutions.
    current_province = None

    for match in _BOLD_PATTERN.finditer(report_text):
        candidate = match.group(1).strip()
        candidate_lower = candidate.lower()

        # Skip empty or very short
        if len(candidate) < 5:
            continue

        # Skip section headers (Province, Region), tracking province context
        if _HEADER_PATTERN.search(candidate_lower):
            if _PROVINCE_WORD.search(candidate_lower):
                current_province = _PROVINCE_SUFFIX.sub('', candidate).strip()
            continue

        # Skip generic Chilean terms
        if candidate_lower in _CHILEAN_SKIP_TERMS:
            continue

        # Skip metadata standards and technical terms
        if not is_real_institution(candidate):
            continue

        # Must contain at least one institution keyword ...
        if not any(kw in candidate_lower for kw in _INSTITUTION_KEYWORDS):
            continue
        # ... but must not be just a keyword by itself
        if candidate_lower in _INSTITUTION_KEYWORDS:
            continue

        # Scan the text starting at THIS mention (not the first occurrence
        # of the same name) so repeated mentions get their own context.
        start = match.start()
        urls, emails = _extract_contacts(
            report_text[start:start + _CONTEXT_WINDOW]
        )

        institutions.append({
            'name': candidate,
            'urls': urls,
            'emails': emails,
            'province': current_province,
            'institution_type': _classify_institution(candidate_lower),
        })

    return institutions


def normalize_name(name: str) -> str:
    """Normalize institution name for deduplication.

    Drops parenthesised parts, lower-cases, and collapses whitespace.
    """
    name = re.sub(r'\([^)]*\)', '', name)
    name = name.lower().strip()
    return re.sub(r'\s+', ' ', name)


def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """Deduplicate by normalized name, merging metadata.

    The first record for each normalized name is kept (as a shallow copy, so
    the caller's dicts are not modified). URLs and emails of later duplicates
    are appended in first-seen order, a 'MIXED' type is replaced by a more
    specific one, and a missing province is filled in.
    """
    name_map = {}

    for inst in institutions:
        norm_name = normalize_name(inst['name'])
        existing = name_map.get(norm_name)
        if existing is None:
            name_map[norm_name] = dict(inst)
            continue

        # Merge URLs and emails; dict.fromkeys keeps a deterministic order.
        for field in ('urls', 'emails'):
            combined = list(existing.get(field, [])) + list(inst.get(field, []))
            existing[field] = list(dict.fromkeys(combined))

        # Prefer more specific type
        if (existing.get('institution_type') == 'MIXED'
                and inst.get('institution_type') != 'MIXED'):
            existing['institution_type'] = inst['institution_type']

        # Keep location information if only a later mention carried it.
        if not existing.get('province') and inst.get('province'):
            existing['province'] = inst['province']

    return list(name_map.values())


def _slugify(name: str, max_length: int = 30) -> str:
    """Return an ASCII, hyphen-separated slug of at most *max_length* chars."""
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII encode then drops the marks, so "Ñuble" -> "Nuble", not "uble".
    ascii_name = (
        unicodedata.normalize('NFKD', name)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    slug = re.sub(r'[^a-z0-9]+', '-', ascii_name.lower()).strip('-')
    # Truncation may cut right after a separator; trim it again.
    return slug[:max_length].rstrip('-')


def generate_ghcid(country_code: str, inst_type: str, name: str, index: int) -> str:
    """Generate GHCID identifier.

    Args:
        country_code: Country code; lower-cased in the result.
        inst_type: Institution type name; unknown types map to 'U'.
        name: Institution name, turned into an ASCII slug.
        index: Sequence number, zero-padded to four digits.

    Returns:
        A ``https://w3id.org/heritage/custodian/...`` identifier URL.
    """
    type_codes = {
        'MUSEUM': 'M',
        'ARCHIVE': 'A',
        'LIBRARY': 'L',
        'GALLERY': 'G',
        'OFFICIAL_INSTITUTION': 'O',
        'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E',
        # NOTE(review): MIXED shares 'M' with MUSEUM - confirm this is intended.
        'MIXED': 'M',
        'UNKNOWN': 'U',
    }
    type_code = type_codes.get(inst_type, 'U')
    slug = _slugify(name)
    return (
        f"https://w3id.org/heritage/custodian/"
        f"{country_code.lower()}/{type_code.lower()}-{slug}-{index:04d}"
    )


def _yaml_str(value) -> str:
    """Return *value* as a double-quoted YAML scalar with proper escaping."""
    # A JSON string literal is a valid YAML double-quoted scalar.
    return json.dumps(str(value), ensure_ascii=False)


def convert_to_linkml_yaml(institutions: List[Dict], conversation_id: str, source_file_path: str) -> str:
    """Convert to LinkML YAML format.

    Args:
        institutions: Records as produced by the extraction/dedup steps.
        conversation_id: Identifier written into each provenance section.
        source_file_path: Path of the source file, written into the header
            comment and as a ``file://`` provenance URL.

    Returns:
        YAML text: a header followed by one list item per institution.
    """
    yaml_lines = [
        "---",
        "# Chilean GLAM Institutions",
        f"# Extracted from: {source_file_path}",
        f"# Conversation ID: {conversation_id}",
        "",
    ]

    # One timestamp for the whole run so all records agree.
    extraction_date = datetime.now(timezone.utc).isoformat()

    for i, inst in enumerate(institutions, 1):
        inst_type = inst.get('institution_type', 'MIXED')
        ghcid = generate_ghcid('CL', inst_type, inst['name'], i)

        yaml_lines.append(f"- id: {ghcid}")
        yaml_lines.append(f"  name: {_yaml_str(inst['name'])}")
        yaml_lines.append(f"  institution_type: {inst_type}")

        # Locations
        if inst.get('province'):
            yaml_lines.append("  locations:")
            yaml_lines.append(f"    - region: {_yaml_str(inst['province'])}")
            yaml_lines.append("      country: CL")

        # Identifiers
        if inst.get('urls') or inst.get('emails'):
            yaml_lines.append("  identifiers:")
            for url in inst.get('urls', []):
                yaml_lines.append("    - identifier_scheme: Website")
                yaml_lines.append(f"      identifier_value: {url}")
                yaml_lines.append(f"      identifier_url: {url}")
            for email in inst.get('emails', []):
                yaml_lines.append("    - identifier_scheme: Email")
                yaml_lines.append(f"      identifier_value: {email}")

        # Provenance with source reference
        yaml_lines.append("  provenance:")
        yaml_lines.append("    data_source: CONVERSATION_NLP")
        yaml_lines.append("    data_tier: TIER_4_INFERRED")
        yaml_lines.append(f"    extraction_date: \"{extraction_date}\"")
        yaml_lines.append("    extraction_method: \"Inline bold text extraction from provincial directory\"")
        yaml_lines.append("    confidence_score: 0.85")
        yaml_lines.append(f"    conversation_id: {_yaml_str(conversation_id)}")
        yaml_lines.append(f"    source_url: {_yaml_str('file://' + str(source_file_path))}")
        yaml_lines.append("")

    return '\n'.join(yaml_lines)


def _find_longest_artifact(data: Dict) -> str:
    """Return the longest 'artifacts' tool_use content in *data*, or ''."""
    longest = ''
    for msg in data.get('chat_messages', []):
        for content in msg.get('content') or []:
            if content.get('type') != 'tool_use' or content.get('name') != 'artifacts':
                continue
            artifact = (content.get('input') or {}).get('content')
            # Content may be missing or null; only strings are candidates.
            if isinstance(artifact, str) and len(artifact) > len(longest):
                longest = artifact
    return longest


def main(input_path: str = DEFAULT_INPUT_PATH,
         yaml_path: str = DEFAULT_YAML_PATH,
         json_path: str = DEFAULT_JSON_PATH) -> int:
    """Run the extraction pipeline and write the YAML and JSON outputs.

    Args:
        input_path: Conversation JSON file to read.
        yaml_path: Destination of the LinkML YAML output.
        json_path: Destination of the JSON output (records + statistics).

    Returns:
        0 on success, 1 when the conversation contains no artifact.
    """
    # Load conversation file
    file_path = input_path

    print("Loading Chilean GLAM conversation...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversation_id = data['uuid']

    # Find the comprehensive provincial directory (usually the longest artifact)
    report_text = _find_longest_artifact(data)
    if not report_text:
        print("ERROR: No artifact found in conversation!")
        return 1

    print(f"Found report with {len(report_text)} characters")

    # Extract institutions
    print("\nExtracting institutions from provincial directory...")
    institutions = extract_institutions_from_report(report_text)
    print(f"Extracted: {len(institutions)} institutions")

    # Deduplicate
    print("\nDeduplicating by normalized name...")
    final_institutions = deduplicate_institutions(institutions)
    print(f"Final: {len(final_institutions)} unique institutions")

    # Statistics
    print("\n" + "="*60)
    print("FINAL STATISTICS")
    print("="*60)

    type_counts = defaultdict(int)
    province_counts = defaultdict(int)
    with_urls = 0
    with_emails = 0

    for inst in final_institutions:
        type_counts[inst.get('institution_type', 'MIXED')] += 1
        if inst.get('province'):
            province_counts[inst['province']] += 1
        if inst.get('urls'):
            with_urls += 1
        if inst.get('emails'):
            with_emails += 1

    total = len(final_institutions)
    # Avoid ZeroDivisionError when nothing was extracted.
    urls_pct = with_urls / total * 100 if total else 0.0
    emails_pct = with_emails / total * 100 if total else 0.0

    print(f"\nTotal institutions: {total}")
    print("\nBy type:")
    for itype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {itype}: {count}")

    print("\nTop 15 provinces by institution count:")
    for province, count in sorted(province_counts.items(), key=lambda x: -x[1])[:15]:
        print(f" {province}: {count}")

    print("\nIdentifiers:")
    print(f" Institutions with URLs: {with_urls} ({urls_pct:.1f}%)")
    print(f" Institutions with emails: {with_emails} ({emails_pct:.1f}%)")

    # Convert to LinkML YAML
    print("\nConverting to LinkML YAML format...")
    yaml_output = convert_to_linkml_yaml(final_institutions, conversation_id, file_path)

    # Write output
    output_path = Path(yaml_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(yaml_output)

    print(f"\nOutput written to: {output_path}")

    # Save JSON
    json_output = {
        'institutions': final_institutions,
        'statistics': {
            'total': total,
            'by_type': dict(type_counts),
            'by_province': dict(province_counts),
            'with_urls': with_urls,
            'with_emails': with_emails,
        },
    }

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False)

    print(f"JSON version saved to: {json_path}")
    return 0


if __name__ == '__main__':
    # Optional CLI overrides: [input_json [output_yaml [output_json]]]
    sys.exit(main(*sys.argv[1:4]))