#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor

Extracts ALL heritage institutions from Brazilian conversation JSON
and creates LinkML-compliant YAML records following schema v0.2.0.

Expected output: 200+ institutions covering all 27 Brazilian federative units.

Usage:
    script.py [INPUT_JSON] [OUTPUT_YAML]

Both arguments are optional and fall back to DEFAULT_INPUT_FILE and
DEFAULT_OUTPUT_FILE.
"""

import argparse
import json
import re
import unicodedata
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# NOTE: PyYAML is imported inside main() so that the pure extraction helpers
# below can be imported (and tested) on a machine without PyYAML installed.

DEFAULT_INPUT_FILE = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
DEFAULT_OUTPUT_FILE = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_comprehensive.yaml')

# Institution types in priority order: the first type whose keyword occurs in
# the examined text wins.
_TYPE_KEYWORDS = (
    ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
    ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
    ('ARCHIVE', ('arquivo', 'archive', 'archiv')),
    ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                              'fundação de cultura', 'instituto brasileiro')),
    ('EDUCATION_PROVIDER', ('universidade', 'university', 'usp', 'ufmg',
                            'unicamp', 'ufrj', 'ufba')),
    ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                         'laboratório', 'documentation')),
)

# A markdown bold span: **text**. finditer() consumes delimiters pairwise, so
# the closing ** of one span is never reused as the opening ** of the next.
_BOLD_SPAN = re.compile(r'\*\*([^*]+)\*\*')

# (keyword regex, exclusive minimum name length) in extraction order:
# federal bodies, museums, libraries, archives, universities, digital platforms.
_NAME_PATTERNS = (
    (re.compile(r'Instituto Brasileiro de Museus|IBRAM|IPHAN|Biblioteca Nacional'
                r'|Arquivo Nacional|Fundação Cultural Palmares', re.IGNORECASE), 0),
    (re.compile(r'Museu|Museum|Memorial|Pinacoteca', re.IGNORECASE), 5),
    (re.compile(r'Biblioteca|Library', re.IGNORECASE), 5),
    (re.compile(r'Arquivo|Archive', re.IGNORECASE), 5),
    (re.compile(r'Universidade|University|USP|UFMG|UNICAMP|UFRJ|UFBA|UNIFAP|UFAC',
                re.IGNORECASE), 3),
    (re.compile(r'Digital|Brasiliana|Hemeroteca|Tainacan|BNDigital', re.IGNORECASE), 5),
)

# Bold text that starts with one of these is treated as a section heading.
# A prefix test (not a substring test) is used so that real names such as
# "Universidade Federal do Acre" are not discarded for containing "Federal".
_HEADING_PREFIXES = ('Executive Summary', 'State Infrastructure', 'Digital Systems',
                     'Collections', 'Federal', 'Contact')

_URL_PATTERN = re.compile(r'https?://[^\s<>"\')]+(?:\.[^\s<>"\')\]]+)+')

# Brazilian cities commonly mentioned
_CITIES = (
    'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
    'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
    'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
    'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
    'Aracaju', 'Palmas', 'Boa Vista', 'Vitória',
)

# Brazilian states. Lookup is by substring in iteration order, so
# 'Mato Grosso do Sul' must come BEFORE 'Mato Grosso' or it can never match.
_STATES = {
    'Acre': 'AC', 'Alagoas': 'AL', 'Amapá': 'AP', 'Amazonas': 'AM',
    'Bahia': 'BA', 'Ceará': 'CE', 'Distrito Federal': 'DF',
    'Espírito Santo': 'ES', 'Goiás': 'GO', 'Maranhão': 'MA',
    'Mato Grosso do Sul': 'MS', 'Mato Grosso': 'MT',
    'Minas Gerais': 'MG', 'Pará': 'PA', 'Paraíba': 'PB', 'Paraná': 'PR',
    'Pernambuco': 'PE', 'Piauí': 'PI', 'Rio de Janeiro': 'RJ',
    'Rio Grande do Norte': 'RN', 'Rio Grande do Sul': 'RS',
    'Rondônia': 'RO', 'Roraima': 'RR', 'Santa Catarina': 'SC',
    'São Paulo': 'SP', 'Sergipe': 'SE', 'Tocantins': 'TO',
}


def slugify(text: str) -> str:
    """Create a URL-safe slug (at most 50 characters) from an institution name.

    Accented letters are folded to their base letter via Unicode NFKD
    decomposition, every run of characters outside ``[a-z0-9]`` becomes a
    single hyphen, and the result never starts or ends with a hyphen, even
    after truncation.
    """
    decomposed = unicodedata.normalize('NFKD', text.lower())
    base_chars = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    slug = re.sub(r'[^a-z0-9]+', '-', base_chars).strip('-')
    # Truncation can cut right after a separator; drop the dangling hyphen.
    return slug[:50].rstrip('-')


def _match_type(text: str) -> Optional[str]:
    """Return the first institution type whose keyword occurs in lowercase *text*."""
    for inst_type, keywords in _TYPE_KEYWORDS:
        if any(keyword in text for keyword in keywords):
            return inst_type
    return None


def classify_institution(name: str, description: str = "") -> str:
    """Classify institution type from name and description.

    The name is examined on its own first so that an unrelated keyword in the
    description (e.g. "museum" in a library's description) cannot override
    what the name says. The description is consulted only when the name
    matches no keyword. Returns 'MIXED' when nothing matches.
    """
    by_name = _match_type(name.lower())
    if by_name:
        return by_name
    combined = (name + " " + (description or "")).lower()
    return _match_type(combined) or 'MIXED'  # Default when unclear


def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Extract all institutions from Brazilian GLAM conversation JSON.

    Reads ``chat_messages[*].content[*]`` items of type ``tool_use`` and
    collects their ``input.content`` (markdown artifacts) and
    ``input.md_citations``. Artifacts are parsed first, then citations; names
    are deduplicated across both.

    Raises whatever ``open`` / ``json.load`` raise for a missing or invalid file.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversation_id = data.get('uuid', '')
    extraction_date = datetime.now(timezone.utc).isoformat()
    seen_names = set()  # Deduplicate

    # Extract markdown content from artifacts
    markdown_content = []
    citations = []
    for message in data.get('chat_messages') or []:
        for content_item in message.get('content') or []:
            if content_item.get('type') != 'tool_use':
                continue
            tool_input = content_item.get('input') or {}
            artifact = tool_input.get('content')
            # Only string payloads can be scanned for markdown.
            if isinstance(artifact, str):
                markdown_content.append(artifact)
            citations.extend(tool_input.get('md_citations') or [])

    institutions = []
    # Parse markdown artifacts for institutions
    for artifact in markdown_content:
        institutions.extend(
            parse_artifact(artifact, conversation_id, extraction_date, seen_names))

    # Parse citations to create digital platform records
    institutions.extend(
        parse_citations(citations, conversation_id, extraction_date, seen_names))
    return institutions


def _normalize_name(name: str) -> str:
    """Collapse all whitespace runs in *name* to single spaces and trim it."""
    return re.sub(r'\s+', ' ', name).strip()


def parse_artifact(content: str, conversation_id: str, extraction_date: str,
                   seen_names: set) -> List[Dict[str, Any]]:
    """Parse markdown artifact content to extract institutions.

    Candidate names are the bold (``**...**``) spans of *content*. Each span
    is tested against the keyword patterns in ``_NAME_PATTERNS``; results are
    grouped by pattern in that order. *seen_names* is updated in place with
    every name that produced a record, so repeated calls deduplicate across
    artifacts.
    """
    # Pair the ** delimiters once, up front. Searching the raw text with
    # "\*\*[^*]*KEYWORD[^*]*\*\*" can instead capture the prose BETWEEN two
    # bold spans as if it were a name.
    candidates = [_normalize_name(m.group(1)) for m in _BOLD_SPAN.finditer(content)]

    institutions = []
    for keyword_re, min_length in _NAME_PATTERNS:
        for name in candidates:
            if len(name) <= min_length or name in seen_names:
                continue
            if not keyword_re.search(name):
                continue
            inst = create_institution_record(name, content, conversation_id,
                                             extraction_date)
            if inst:
                institutions.append(inst)
                seen_names.add(name)
    return institutions


def create_institution_record(name: str, context: str, conversation_id: str,
                              extraction_date: str) -> Optional[Dict[str, Any]]:
    """Create a complete LinkML-compliant institution record.

    Returns ``None`` for names shorter than 4 characters (after whitespace
    normalization) and for names that look like section headings. Otherwise
    returns a dict with ``id``, ``name``, ``institution_type``,
    ``description`` and ``provenance``, plus ``locations`` / ``identifiers``
    when a location / URLs were found in *context*.
    """
    # Clean up name
    name = _normalize_name(name)
    if len(name) < 4:
        return None

    # Skip generic headings
    if name.startswith(_HEADING_PREFIXES):
        return None

    slug = slugify(name)
    # Extract description from nearby context
    description = extract_description(name, context)
    # Classify from the name and its LOCAL description only; using the whole
    # artifact would let any keyword anywhere in the document decide the type.
    inst_type = classify_institution(name, description or "")
    # Extract URL if mentioned
    urls = extract_urls_for_institution(name, context)
    # Extract location
    location = extract_location(name, context)
    # Determine confidence score
    confidence = calculate_confidence(name, context, urls, location)

    record = {
        'id': f'https://w3id.org/heritage/custodian/br/{slug}',
        'name': name,
        'institution_type': inst_type,
        'description': description if description else f'Brazilian heritage institution: {name}',
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': extraction_date,
            'extraction_method': 'Python NLP extraction from Brazilian GLAM conversation artifact',
            'confidence_score': confidence,
            'conversation_id': conversation_id,
        },
    }

    # Add location if found
    if location:
        record['locations'] = [location]

    # Add identifiers if URLs found
    if urls:
        record['identifiers'] = [
            {
                'identifier_scheme': 'Website',
                'identifier_value': url,
                'identifier_url': url,
            }
            for url in urls[:5]  # Limit to 5 URLs
        ]
    return record


def extract_description(name: str, context: str) -> Optional[str]:
    """Extract descriptive text about the institution from context.

    Scans blank-line-separated paragraphs that contain the first 20
    characters of *name*, considers the first 3 sentences of each, keeps
    those between 31 and 499 characters, and returns the first two distinct
    ones joined by a space (bold markers removed). Returns ``None`` when
    nothing qualifies.
    """
    needle = name[:20]  # Use partial name for matching
    if not needle:
        # An empty needle is "in" every paragraph; there is nothing to describe.
        return None

    sentences = []
    for paragraph in context.split('\n\n'):
        if needle not in paragraph:
            continue
        for sent in re.split(r'[.!?]+\s+', paragraph)[:3]:  # Up to 3 sentences
            if 30 < len(sent) < 500:
                clean = sent.replace('**', '').strip()
                if clean and clean not in sentences:
                    sentences.append(clean)

    if sentences:
        return ' '.join(sentences[:2])  # Combine up to 2 sentences
    return None


def extract_urls_for_institution(name: str, context: str) -> List[str]:
    """Extract URLs mentioned near institution name.

    A line is "near" when it contains the first 15 characters of *name* or
    its first whitespace-separated word. URLs are returned deduplicated in
    order of first appearance, with trailing sentence punctuation removed.
    """
    words = name.split()
    if not words:
        return []
    prefix, first_word = name[:15], words[0]

    urls = []
    for line in context.split('\n'):
        if prefix in line or first_word in line:
            # The pattern is greedy and swallows a sentence-final '.' or ','.
            urls.extend(url.rstrip('.,;:!?') for url in _URL_PATTERN.findall(line))

    # dict.fromkeys deduplicates while keeping document order; set() order
    # varies between runs and made the later "first 5 URLs" cut arbitrary.
    return list(dict.fromkeys(urls))


def extract_location(name: str, context: str) -> Optional[Dict[str, Any]]:
    """Extract location information for institution.

    Looks at each line containing the first 15 characters of *name* and
    returns ``{'city', 'country': 'BR'[, 'region']}`` for the first known
    city found there. ``region`` is the first state (in ``_STATES`` order)
    whose name or parenthesised code, e.g. ``(AC)``, occurs on the same line.
    Returns ``None`` when no known city is found.
    """
    prefix = name[:15]
    # Search for city mentions near institution name
    for paragraph in context.split('\n'):
        if prefix not in paragraph:
            continue
        for city in _CITIES:
            if city not in paragraph:
                continue
            location = {'city': city, 'country': 'BR'}
            # Try to find state too
            for state_name, state_code in _STATES.items():
                if state_name in paragraph or f'({state_code})' in paragraph:
                    location['region'] = state_name
                    break
            return location
    return None


def calculate_confidence(name: str, context: str, urls: List[str],
                         location: Optional[Dict]) -> float:
    """Calculate confidence score for extraction.

    Starts at 0.5 and adds: 0.1 for a name of 11-99 characters, 0.15 when
    URLs were found, 0.1 when a location was found, 0.1 when the full name
    occurs in a context longer than 100 characters, and 0.05 when the first
    20 characters of the name occur more than twice (case-insensitive).
    The result is capped at 0.95 and rounded to two decimals.
    """
    score = 0.5  # Base score
    # Name quality
    if 10 < len(name) < 100:
        score += 0.1
    # Has URL
    if urls:
        score += 0.15
    # Has location
    if location:
        score += 0.1
    # Has description in context
    if name in context and len(context) > 100:
        score += 0.1
    # Mentioned multiple times
    if context.lower().count(name[:20].lower()) > 2:
        score += 0.05
    # Round away binary float noise (e.g. 0.7999999999999999) before output.
    return round(min(score, 0.95), 2)  # Cap at 0.95


def parse_citations(citations: List[Dict], conversation_id: str,
                    extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Parse citation metadata to extract additional institutions.

    Every ``sources[*].source`` longer than 5 characters and not already in
    *seen_names* becomes a minimal record with a fixed confidence of 0.6.
    A Website identifier is attached only when the citation has a URL.
    *seen_names* is updated in place.
    """
    institutions = []
    for citation in citations:
        # Extract from citation titles and sources
        title = citation.get('title', '')
        url = citation.get('url', '')

        # Look for institutional sources
        for source_item in citation.get('sources') or []:
            source = source_item.get('source', '')
            if not source or source in seen_names or len(source) <= 5:
                continue

            # Create minimal record for citation sources
            record = {
                'id': f'https://w3id.org/heritage/custodian/br/{slugify(source)}',
                'name': source,
                'institution_type': classify_institution(source, title),
                'description': f'Institution identified from web citation: {title}',
            }
            if url:
                # An identifier with an empty value carries no information.
                record['identifiers'] = [{
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url,
                }]
            record['locations'] = [{'country': 'BR'}]
            record['provenance'] = {
                'data_source': 'CONVERSATION_NLP',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': extraction_date,
                'extraction_method': 'Python extraction from conversation citation metadata',
                'confidence_score': 0.6,  # Lower confidence for citation-only
                'conversation_id': conversation_id,
            }
            institutions.append(record)
            seen_names.add(source)
    return institutions


def main():
    """Main extraction workflow.

    Parses the optional INPUT_JSON / OUTPUT_YAML command-line arguments,
    runs the extraction, prints a per-type summary and writes the records
    as YAML (creating the output directory if needed).
    """
    # Imported first so a missing PyYAML fails before any work is done.
    import yaml

    parser = argparse.ArgumentParser(
        description='Extract Brazilian GLAM institutions from a conversation JSON export.')
    parser.add_argument('input_file', nargs='?', type=Path, default=DEFAULT_INPUT_FILE,
                        help='conversation JSON export (default: %(default)s)')
    parser.add_argument('output_file', nargs='?', type=Path, default=DEFAULT_OUTPUT_FILE,
                        help='YAML file to write (default: %(default)s)')
    args = parser.parse_args()
    input_file, output_file = args.input_file, args.output_file

    print(f"Extracting Brazilian institutions from: {input_file.name}")
    print(f"Output file: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Count by type
    type_counts = Counter(inst['institution_type'] for inst in institutions)
    print("\nInstitution type breakdown:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=100)
    print(f"\n✓ Wrote {len(institutions)} records to {output_file}")

    print("\nSample institutions:")
    for inst in institutions[:5]:
        print(f" - {inst['name']} ({inst['institution_type']})")


if __name__ == '__main__':
    main()