#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor v2.0

Improved extraction focusing on structured bullet-point lists from the
state-by-state artifact section. Extracts actual institution names (not
sentence fragments).

Expected output: 150-200 quality institutions with proper names, types,
and metadata.

Source file: /Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json
"""

import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

# ---------------------------------------------------------------------------
# Module-level constants (regexes hoisted so they compile once, not per call)
# ---------------------------------------------------------------------------

# ASCII fold for the Portuguese accented characters seen in institution names.
_ACCENT_MAP = str.maketrans(
    'àáâãäåèéêëìíîïòóôõöùúûüç',
    'aaaaaa' + 'eeee' + 'iiii' + 'ooooo' + 'uuuu' + 'c',
)

# ``## STATE NAME (XX)`` section headers in the markdown artifact.
_STATE_HEADER_RE = re.compile(
    r'## ([A-ZÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜ\s]+) \(([A-Z]{2})\)'
)

# Bullet lines of the form ``- **Institution Name**: details``; an optional
# parenthesised acronym right after the name is dropped.
_BULLET_RE = re.compile(
    r'^[\s]*[-•]\s*\*\*([^*:]+?)(?:\s*\([^)]+\))?\*\*:?\s*(.*)$'
)

_URL_RE = re.compile(r'https?://[^\s,]+')

# Brazilian phone prefix such as ``(61) 3...`` — used to skip contact lines.
_PHONE_RE = re.compile(r'\(\d{2}\)\s*\d')

# Generic section labels that look like bullet names but are not institutions.
_SKIP_WORDS = (
    'State Infrastructure', 'Digital Systems', 'Collections',
    'Digital Initiatives', 'Contact', 'Federal', 'Technical', 'Metadata',
    'Notable', 'Key Features', 'Preservation', 'Ongoing', 'Major',
    'Systems', 'Coverage', 'Database',
)

# Common Brazilian cities, checked in order; the first substring match wins.
_CITIES = (
    'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
    'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
    'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
    'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
    'Aracaju', 'Palmas', 'Boa Vista', 'Vitória', 'São Luís', 'Campinas',
    'Santos', 'Niterói', 'Ouro Preto', 'Petrópolis', 'Paraty', 'Olinda',
    'Tiradentes', 'Diamantina', 'Cachoeira', 'São Cristóvão',
)


def slugify(text: str) -> str:
    """Create a URL-safe slug (max 50 chars) from an institution name."""
    text = text.lower().translate(_ACCENT_MAP)
    text = re.sub(r'[^a-z0-9]+', '-', text).strip('-')
    # Strip again AFTER truncation: the original stripped first, so a name
    # longer than 50 chars could yield a slug ending in '-'.
    return text[:50].rstrip('-')


def classify_institution(name: str, description: str = "") -> str:
    """Classify institution type from name and description.

    Priority order:
    1. Check the name first (most reliable).
    2. Then the combined name + description text (less reliable, may contain
       misleading keywords).
    """
    name_lower = name.lower()

    # Strong indicators in the NAME itself.
    if any(w in name_lower for w in ('museu', 'museum', 'memorial', 'pinacoteca')):
        return 'MUSEUM'
    if 'arquivo' in name_lower or 'archiv' in name_lower:
        return 'ARCHIVE'
    if any(w in name_lower for w in ('biblioteca', 'library', 'bibliotheca')):
        return 'LIBRARY'

    # For acronyms and ambiguous names, fall back to keyword matching over
    # the combined name + description text.
    text = name_lower + " " + description.lower()
    if any(w in text for w in ('museu', 'museum', 'memorial', 'pinacoteca',
                               'casa de cultura', 'mam-', 'mam ', 'marco',
                               'musear')):
        return 'MUSEUM'
    if 'arquivo' in text or 'archiv' in text:
        return 'ARCHIVE'
    if 'biblioteca' in text or 'bce' in text:
        return 'LIBRARY'
    if any(w in text for w in ('ibram', 'iphan', 'secult', 'secretaria',
                               'fundação de cultura', 'instituto brasileiro',
                               'fpc/', 'ipac', 'unesco')):
        return 'OFFICIAL_INSTITUTION'
    if any(w in text for w in ('universidade', 'university', 'ufac', 'ufal',
                               'unifap', 'ufpa', 'usp', 'ufmg', 'unicamp',
                               'ufrj', 'ufba', 'ufam', 'ufc', 'unb', 'ufg',
                               'ufma', 'ufmt', 'ufms', 'ufpe', 'ufpi',
                               'ufrn', 'ufpb', 'ufrgs', 'ufpr', 'ufsc',
                               'ufes', 'repository', 'repositories')):
        return 'EDUCATION_PROVIDER'
    if any(w in text for w in ('centro de pesquisa', 'research center',
                               'laboratório', 'documentation', 'cepap',
                               'instituto histórico', 'instituto geográfico')):
        return 'RESEARCH_CENTER'
    # Cultural centres, theatres, etc. and everything unclassified fall
    # through to MIXED (the original had a redundant elif branch returning
    # the same value).
    return 'MIXED'


def parse_state_sections(content: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse state-by-state sections to extract institution records.

    ``re.split`` with a two-group header pattern yields a flat list shaped
    ``[preamble, name, code, body, name, code, body, ...]``: state names sit
    at indices 1, 4, 7, ..., state codes at 2, 5, 8, ..., and section bodies
    at 3, 6, 9, ...
    """
    institutions: List[Dict[str, Any]] = []
    current_state: Optional[str] = None
    current_state_code: Optional[str] = None

    for i, chunk in enumerate(_STATE_HEADER_RE.split(content)):
        if i % 3 == 1:          # state name capture group
            current_state = chunk.strip()
        elif i % 3 == 2:        # two-letter state code capture group
            current_state_code = chunk.strip()
        elif i > 0:             # section body (index 0 is the preamble)
            if current_state and current_state_code:
                institutions.extend(parse_institutions_from_section(
                    chunk, current_state, current_state_code, source_file_path
                ))
    return institutions


def parse_institutions_from_section(section: str, state_name: str,
                                    state_code: str,
                                    source_file_path: str) -> List[Dict[str, Any]]:
    """Parse individual institutions from one state's markdown section.

    Expects bullet points of the form ``- **Institution Name**: details``.
    """
    institutions: List[Dict[str, Any]] = []
    for line in section.split('\n'):
        match = _BULLET_RE.match(line)
        if not match:
            continue
        name = match.group(1).strip()
        details = match.group(2).strip()

        # Skip section headers / generic labels masquerading as bullets.
        if any(skip in name for skip in _SKIP_WORDS):
            continue
        # Skip contact entries (Brazilian phone format).
        if _PHONE_RE.search(name):
            continue
        # Skip implausibly short or long names.
        if not 3 <= len(name) <= 150:
            continue

        url_match = _URL_RE.search(details)
        url = url_match.group(0) if url_match else None
        city = extract_city_from_details(details, name)

        inst = create_institution_record_v2(
            name=name,
            description=details,
            state_name=state_name,
            state_code=state_code,
            city=city,
            url=url,
            source_file_path=source_file_path,
        )
        if inst:
            institutions.append(inst)
    return institutions


def extract_city_from_details(details: str, name: str) -> Optional[str]:
    """Return the first known Brazilian city mentioned in details or name."""
    for city in _CITIES:
        if city in details or city in name:
            return city
    return None


def create_institution_record_v2(
    name: str,
    description: str,
    state_name: str,
    state_code: str,
    city: Optional[str],
    url: Optional[str],
    source_file_path: str
) -> Optional[Dict[str, Any]]:
    """Create a LinkML-compliant institution record from structured data.

    Confidence starts at 0.7 (structured extraction) and is boosted by a URL
    (+0.1), a city (+0.05) and a >50-char description (+0.1), capped at 0.95.
    ``state_code`` is accepted for interface compatibility but is not stored
    in the record (matching the original behaviour).
    """
    name = re.sub(r'\s+', ' ', name).strip()
    slug = slugify(name)
    inst_type = classify_institution(name, description)

    confidence = 0.7  # base for structured extraction
    if url:
        confidence += 0.1
    if city:
        confidence += 0.05
    if len(description) > 50:
        confidence += 0.1
    confidence = min(confidence, 0.95)

    location: Dict[str, str] = {'country': 'BR', 'region': state_name}
    if city:
        location['city'] = city

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slug}',
        'name': name,
        'institution_type': inst_type,
        'locations': [location],
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Python structured extraction from Brazilian state-by-state artifact v2.0',
            'confidence_score': confidence,
            'conversation_id': '2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5',
            'source_url': f'file://{source_file_path}'
        }
    }

    # Attach a cleaned description when it carries real content.
    if description and len(description) > 10:
        desc = _URL_RE.sub('', description).strip()
        if len(desc) > 500:
            desc = desc[:497] + '...'
        if desc:
            record['description'] = desc

    # Add the URL as a Website identifier.
    if url:
        record['identifiers'] = [{
            'identifier_scheme': 'Website',
            'identifier_value': url,
            'identifier_url': url
        }]
    return record


def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load the conversation JSON and extract deduplicated institutions."""
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Markdown artifacts live in the ``content`` field of tool_use inputs.
    markdown_artifacts: List[str] = []
    for message in data.get('chat_messages', []):
        for content_item in message.get('content', []):
            if content_item.get('type') != 'tool_use':
                continue
            tool_input = content_item.get('input', {})
            if 'content' in tool_input:
                markdown_artifacts.append(tool_input['content'])

    # Absolute source path goes into each record's provenance.source_url.
    source_file_path = str(json_path.resolve())

    # Deduplicate by institution name across all artifacts.
    all_institutions: List[Dict[str, Any]] = []
    seen_names: set = set()
    for artifact in markdown_artifacts:
        for inst in parse_state_sections(artifact, source_file_path):
            if inst['name'] not in seen_names:
                seen_names.add(inst['name'])
                all_institutions.append(inst)
    return all_institutions


def main() -> None:
    """Main extraction workflow: extract, report statistics, write YAML."""
    # Imported here so the module stays importable (e.g. to reuse the parsing
    # helpers) even when PyYAML is not installed; only main() needs it.
    import yaml

    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml')

    print("Brazilian GLAM Institution Extractor v2.0")
    print("=" * 60)
    print(f"Input: {input_file.name}")
    print(f"Output: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")
    print()

    # Guard against an empty result: the original crashed with
    # ZeroDivisionError when computing percentages over zero institutions.
    if not institutions:
        print("No institutions extracted; nothing to write.")
        return

    # Type breakdown
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print(f"Institution Type Distribution:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {inst_type:25s}: {count:3d}")
    print()

    # Geographic coverage
    states_covered = {
        inst['locations'][0].get('region')
        for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    }
    print(f"Geographic Coverage:")
    print(f"  States covered: {len(states_covered)}/27")
    print()

    # URL coverage
    with_urls = sum(1 for inst in institutions if inst.get('identifiers'))
    print(f"Data Quality:")
    print(f"  Institutions with URLs: {with_urls}/{len(institutions)} ({100*with_urls/len(institutions):.1f}%)")

    # Average confidence
    avg_confidence = sum(inst['provenance']['confidence_score'] for inst in institutions) / len(institutions)
    print(f"  Average confidence: {avg_confidence:.3f}")
    print()

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=120)

    print(f"✓ Wrote {len(institutions)} records to {output_file}")
    print()
    print(f"Sample Institutions:")
    for inst in institutions[:10]:
        city = inst['locations'][0].get('city', '(state level)')
        print(f"  - {inst['name'][:50]:50s} | {inst['institution_type']:12s} | {city}")


if __name__ == '__main__':
    main()