#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.

Version 2: Improved markdown parsing for structured institution lists.
Follows GLAM Data Extraction project specifications.
"""

import json
import re
from typing import List, Dict, Any, Tuple
from datetime import datetime, timezone
from collections import defaultdict


class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts.

    Accumulates results across multiple conversation files in
    ``self.extracted_institutions`` (keyed by normalized name), so repeated
    mentions of the same institution are merged rather than duplicated.
    """

    # URL patterns (stops at whitespace or a closing paren so markdown links work)
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')
    # Email patterns.
    # FIX: the original character class `[A-Z|a-z]{2,}` contained a literal
    # '|', so TLDs containing a pipe were (wrongly) accepted.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    # Phone patterns — captures digits/spaces/dashes/parens after a label
    PHONE_PATTERN = re.compile(r'(?:Phone|Tel|Teléfono):\s*([0-9\s\-\(\)]+)')
    # Address patterns — captures the rest of the "Address:" line
    ADDRESS_PATTERN = re.compile(r'Address:\s*([^\n]+)')

    # Mexican states for location matching (uppercase for case-insensitive compare)
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    # Common Mexican cities (used for substring-based city detection)
    MEXICAN_CITIES = {
        'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca', 'Tijuana',
        'León', 'Ciudad Juárez', 'Zapopan', 'Mérida', 'Aguascalientes',
        'Querétaro', 'Morelia', 'Hermosillo', 'Saltillo', 'Mexicali',
        'Culiacán', 'Chihuahua', 'Oaxaca', 'Veracruz', 'Acapulco', 'Cancún',
        'Cuernavaca', 'Pachuca', 'Durango', 'Tepic', 'Tuxtla Gutiérrez',
        'Villahermosa', 'Campeche', 'Chetumal', 'Zacatecas', 'Colima',
        'Guanajuato', 'San Luis Potosí'
    }

    def __init__(self):
        # normalized name -> institution record dict
        self.extracted_institutions = {}
        # monotonically increasing counter used to mint "mx-glam-NNNN" ids
        self.institution_id_counter = 1

    def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
        """Extract institutions from a single conversation JSON file.

        Walks every assistant message looking for 'artifacts' tool calls and
        feeds their text content to the extraction strategies.

        Returns a summary dict; ``institutions_found`` is the RUNNING total
        across all files processed by this extractor instance, not just this
        file's count.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        conversation_id = data.get('uuid', '')
        conversation_name = data.get('name', '')

        # Extract from all artifact content in the conversation
        for msg in data.get('chat_messages', []):
            for content in msg.get('content', []):
                if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                    artifact_text = content.get('input', {}).get('content', '')
                    if artifact_text:
                        self._extract_from_artifact(artifact_text, conversation_id, conversation_name)

        return {
            'conversation_id': conversation_id,
            'conversation_name': conversation_name,
            'institutions_found': len(self.extracted_institutions)
        }

    def _extract_from_artifact(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions from artifact text content.

        Runs three complementary strategies; duplicates are merged downstream
        in _add_institution via name normalization.
        """
        # Strategy 1: Extract from state-by-state sections
        self._extract_from_state_sections(text, conversation_id, conversation_name)

        # Strategy 2: Extract from national platforms section
        self._extract_national_platforms(text, conversation_id, conversation_name)

        # Strategy 3: Extract inline mentions with URLs
        self._extract_inline_mentions(text, conversation_id, conversation_name)

    def _extract_from_state_sections(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions from state-by-state directory sections."""
        # Find all state sections (### STATE_NAME); each section runs until the
        # next uppercase "### " heading or end of text.
        state_pattern = re.compile(
            r'###\s+([A-ZÁÉÍÓÚÑ\s]+)\n\n(.*?)(?=\n###\s+[A-ZÁÉÍÓÚÑ]|\Z)',
            re.DOTALL
        )

        for match in state_pattern.finditer(text):
            state_name = match.group(1).strip()
            section_content = match.group(2)

            # Only process if it's a valid Mexican state
            if state_name.upper() not in self.MEXICAN_STATES:
                continue

            # Extract institutions from bullet lists with bold names
            self._extract_from_bullet_lists(section_content, state_name, conversation_id, conversation_name)

    def _extract_from_bullet_lists(self, text: str, state: str, conversation_id: str, conversation_name: str):
        """Extract institutions from markdown bullet lists with bold names.

        Pattern: - **Institution Name**: description
        (also handles "**Name (ACRONYM)**" and multi-line entries with nested
        bullets — the entry ends at the next bold bullet, a blank line, or EOF).
        """
        pattern = re.compile(
            r'^-\s+\*\*([^*]+?)\*\*:?\s*(.*?)(?=\n-\s+\*\*|\n\n|\Z)',
            re.MULTILINE | re.DOTALL
        )

        for match in pattern.finditer(text):
            institution_name = match.group(1).strip()
            description_block = match.group(2).strip()

            # Classify institution type
            inst_type = self._classify_institution_type(institution_name, description_block)
            if not inst_type:
                continue  # Skip if we can't determine type

            # Extract metadata from description block
            metadata = self._extract_metadata_from_block(description_block)

            # Add institution
            self._add_institution(
                name=institution_name,
                institution_type=inst_type,
                state=state,
                metadata=metadata,
                conversation_id=conversation_id,
                conversation_name=conversation_name
            )

    def _extract_national_platforms(self, text: str, conversation_id: str, conversation_name: str):
        """Extract national-level platforms and institutions.

        These entries carry no state, so they are added with an empty region.
        """
        # Look for "National" or "Federal" sections
        national_sections = re.finditer(
            r'##\s+(?:National|Federal|Core\s+National).*?\n(.*?)(?=\n##|\Z)',
            text,
            re.DOTALL | re.IGNORECASE
        )

        for section_match in national_sections:
            section_content = section_match.group(1)
            # Extract from bullet lists
            self._extract_from_bullet_lists(section_content, '', conversation_id, conversation_name)

    def _extract_inline_mentions(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions mentioned inline with URLs."""
        # Pattern 1: **Name**\n- **URL**: https://...
        url_pattern = re.compile(
            r'\*\*([^*]+?)\*\*\s*\n\s*-\s*\*\*URL\*\*:\s*(https?://[^\s\)]+)',
            re.MULTILINE
        )

        for match in url_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')

            # Get context for better metadata (200 chars before, 500 after)
            context_start = max(0, match.start() - 200)
            context_end = min(len(text), match.end() + 500)
            context = text[context_start:context_end]

            metadata = self._extract_metadata_from_block(context)
            # Put the explicitly-labelled URL first
            metadata['urls'] = [url] + metadata.get('urls', [])

            inst_type = self._classify_institution_type(institution_name, context)
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata=metadata,
                    conversation_id=conversation_id,
                    conversation_name=conversation_name
                )

        # Pattern 2: Institution name at/URL/website: URL
        inline_pattern = re.compile(
            r'\b((?:Museo|Biblioteca|Archivo|Instituto|Centro|Sistema|Fonoteca|Mapoteca)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]+?)\s+(?:at|URL|website|portal):\s*(https?://[^\s\)]+)',
            re.IGNORECASE
        )

        for match in inline_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')

            inst_type = self._classify_institution_type(institution_name, '')
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata={'urls': [url]},
                    conversation_id=conversation_id,
                    conversation_name=conversation_name
                )

    def _classify_institution_type(self, name: str, description: str) -> str:
        """Classify institution type based on name and description.

        Returns one of MUSEUM / LIBRARY / ARCHIVE / GALLERY / RESEARCH_CENTER /
        OFFICIAL_INSTITUTION, or '' when the type cannot be determined.
        Name keywords take priority over the description.
        """
        name_lower = name.lower()
        desc_lower = description.lower()
        combined = name_lower + ' ' + desc_lower

        # Classification rules (ordered: more specific keywords first)
        if any(term in name_lower for term in ['museo', 'museum']):
            return 'MUSEUM'
        elif any(term in name_lower for term in ['biblioteca', 'library', 'hemeroteca']):
            return 'LIBRARY'
        elif any(term in name_lower for term in ['archivo', 'archive', 'mapoteca']):
            return 'ARCHIVE'
        elif any(term in name_lower for term in ['galería', 'gallery']):
            return 'GALLERY'
        elif any(term in name_lower for term in ['instituto', 'institute', 'centro', 'center']):
            # Check if it's a research center or museum
            if 'museo' in combined or 'museum' in combined:
                return 'MUSEUM'
            else:
                return 'RESEARCH_CENTER'
        elif any(term in name_lower for term in ['secretaría', 'sistema', 'fonoteca', 'imcine',
                                                  'red nacional', 'mediateca', 'mexicana', 'memórica']):
            return 'OFFICIAL_INSTITUTION'

        return ''  # Unknown type

    def _extract_metadata_from_block(self, block: str) -> Dict[str, Any]:
        """Extract metadata (URLs, addresses, emails, phones) from description block."""
        metadata = {
            'urls': [],
            'emails': [],
            'phones': [],
            'address': '',
            'city': '',
            'description': '',
        }

        # Extract URLs (capped at 3, trailing punctuation stripped)
        urls = self.URL_PATTERN.findall(block)
        metadata['urls'] = [url.rstrip('.,;)') for url in urls[:3]]

        # Extract emails (capped at 2)
        metadata['emails'] = self.EMAIL_PATTERN.findall(block)[:2]

        # Extract phones (capped at 2)
        phone_matches = self.PHONE_PATTERN.findall(block)
        metadata['phones'] = [p.strip() for p in phone_matches][:2]

        # Extract address
        address_match = self.ADDRESS_PATTERN.search(block)
        if address_match:
            metadata['address'] = address_match.group(1).strip().rstrip('.,;')

        # Extract city from address or text (first match wins; set iteration
        # order is arbitrary, so ambiguous blocks may pick any matching city)
        for city in self.MEXICAN_CITIES:
            if city.lower() in block.lower():
                metadata['city'] = city
                break

        # Extract description: first line that is not a labelled field and is
        # long enough (> 20 chars after stripping markdown) to be meaningful.
        lines = block.split('\n')
        for line in lines:
            line = line.strip().lstrip('-').strip()
            # FIX: use the tuple form of str.startswith instead of any(...)
            if line and not line.startswith(('Address:', 'Phone:', 'Email:', 'URL:', 'Director:')):
                # Clean up the line
                desc = re.sub(r'\*\*', '', line)  # Remove bold markers
                desc = re.sub(r'\[.*?\]\(.*?\)', '', desc)  # Remove markdown links
                if len(desc) > 20:
                    metadata['description'] = desc[:300]
                    break

        return metadata

    def _add_institution(self, name: str, institution_type: str, state: str,
                         metadata: Dict[str, Any], conversation_id: str, conversation_name: str):
        """Add a new institution record, or merge into an existing one.

        Records are keyed by normalized name. Re-seeing an institution bumps
        its confidence (+0.15, capped at 1.0) and merges any new URLs and a
        location if none was known.
        """
        # Normalize name for deduplication
        name_normalized = self._normalize_name(name)

        # Check if already exists
        if name_normalized in self.extracted_institutions:
            # Update existing record
            existing = self.extracted_institutions[name_normalized]
            existing['provenance']['confidence_score'] = min(
                1.0, existing['provenance']['confidence_score'] + 0.15)

            # Merge metadata (FIX: loop variable renamed from `id`, which
            # shadowed the builtin)
            if metadata.get('urls'):
                for url in metadata['urls']:
                    if not any(ident.get('identifier_value') == url for ident in existing['identifiers']):
                        existing['identifiers'].append({
                            'identifier_scheme': 'Website',
                            'identifier_value': url,
                            'identifier_url': url,
                        })

            if state and not existing['locations']:
                existing['locations'].append({'city': metadata.get('city', ''),
                                              'region': state, 'country': 'MX'})
            return

        # Create new record
        institution_id = f"mx-glam-{self.institution_id_counter:04d}"
        self.institution_id_counter += 1

        record = {
            'id': institution_id,
            'name': name,
            'name_normalized': name_normalized,
            'institution_type': institution_type,
            'alternative_names': [],
            'description': metadata.get('description', ''),
            'locations': [],
            'identifiers': [],
            'digital_platforms': [],
            'provenance': {
                'data_source': 'CONVERSATION_NLP',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Markdown parsing + pattern matching from Mexican GLAM conversations',
                'confidence_score': self._calculate_confidence(name, metadata),
                'conversation_id': conversation_id,
                'source_url': None,
            }
        }

        # Add location if available
        if metadata.get('city') or state:
            location = {
                'city': metadata.get('city', ''),
                'region': state,
                'country': 'MX',
            }
            if metadata.get('address'):
                location['street_address'] = metadata['address']
            record['locations'].append(location)

        # Add identifiers
        for url in metadata.get('urls', []):
            record['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': url,
                'identifier_url': url,
            })
        for email in metadata.get('emails', []):
            record['identifiers'].append({
                'identifier_scheme': 'Email',
                'identifier_value': email,
            })

        self.extracted_institutions[name_normalized] = record

    def _normalize_name(self, name: str) -> str:
        """Normalize institution name for deduplication."""
        # Remove acronyms in parentheses
        name = re.sub(r'\s*\([A-Z]+\)\s*', ' ', name)
        # Remove extra whitespace, lowercase
        normalized = re.sub(r'\s+', ' ', name.strip().lower())
        return normalized

    def _calculate_confidence(self, name: str, metadata: Dict[str, Any]) -> float:
        """Calculate confidence score for extraction (0.7 base, capped at 1.0)."""
        confidence = 0.7  # Base confidence for structured extraction

        # Increase confidence based on available metadata
        if metadata.get('urls'):
            confidence += 0.15
        if metadata.get('address'):
            confidence += 0.1
        if metadata.get('description'):
            confidence += 0.05

        return min(1.0, confidence)

    def get_results(self) -> Dict[str, Any]:
        """Get extraction results with statistics."""
        institutions_list = list(self.extracted_institutions.values())

        # Calculate statistics
        type_counts = defaultdict(int)
        state_counts = defaultdict(int)
        for inst in institutions_list:
            type_counts[inst['institution_type']] += 1
            for loc in inst.get('locations', []):
                if loc.get('region'):
                    state_counts[loc['region']] += 1

        return {
            'total_institutions': len(institutions_list),
            'institutions': institutions_list,
            'statistics': {
                'by_type': dict(type_counts),
                'by_state': dict(state_counts),
                # FIX: comprehension variable renamed from `id` (builtin shadow)
                'with_urls': sum(1 for i in institutions_list
                                 if any(ident.get('identifier_scheme') == 'Website'
                                        for ident in i.get('identifiers', []))),
                'with_locations': sum(1 for i in institutions_list if i.get('locations')),
                'with_descriptions': sum(1 for i in institutions_list if i.get('description')),
            },
            'extraction_metadata': {
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'data_tier': 'TIER_4_INFERRED',
                'data_source': 'CONVERSATION_NLP',
            }
        }


def main():
    """Main extraction workflow."""
    print("Mexican GLAM Institution Extractor v2")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # Process both conversation files
    files = [
        'mexican_glam_1.json',
        'mexican_glam_2.json'
    ]

    for filepath in files:
        print(f"\nProcessing: {filepath}")
        # FIX: a missing or malformed file no longer aborts the whole run;
        # remaining files are still processed.
        try:
            result = extractor.extract_from_conversation_file(filepath)
        except (OSError, json.JSONDecodeError) as e:
            print(f"  ERROR: could not process {filepath}: {e}")
            continue
        print(f"  Conversation: {result['conversation_name']}")
        print(f"  UUID: {result['conversation_id']}")
        print(f"  Running total: {result['institutions_found']} institutions")

    # Get final results
    results = extractor.get_results()

    # Print statistics
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")

    print(f"\nBy institution type:")
    for inst_type, count in sorted(results['statistics']['by_type'].items(),
                                   key=lambda x: x[1], reverse=True):
        print(f"  {inst_type}: {count}")

    print(f"\nTop 15 states by institution count:")
    state_counts = results['statistics']['by_state']
    for state, count in sorted(state_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
        print(f"  {state}: {count}")

    print(f"\nMetadata completeness:")
    print(f"  With URLs: {results['statistics']['with_urls']}")
    print(f"  With locations: {results['statistics']['with_locations']}")
    print(f"  With descriptions: {results['statistics']['with_descriptions']}")

    # Save to JSON
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to: {output_file}")

    # Show sample institutions
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 15):")
    print("=" * 60)
    for i, inst in enumerate(results['institutions'][:15], 1):
        print(f"\n{i}. {inst['name']}")
        print(f"   Type: {inst['institution_type']}")
        print(f"   Confidence: {inst['provenance']['confidence_score']:.2f}")
        if inst.get('locations'):
            loc = inst['locations'][0]
            city_state = f"{loc.get('city', '')}, {loc.get('region', '')}".strip(', ')
            if city_state:
                print(f"   Location: {city_state}")
        if inst.get('description'):
            desc = inst['description'][:100] + '...' if len(inst['description']) > 100 else inst['description']
            print(f"   Description: {desc}")
        # FIX: comprehension variable renamed from `id` (builtin shadow)
        urls = [ident['identifier_value'] for ident in inst.get('identifiers', [])
                if ident['identifier_scheme'] == 'Website']
        if urls:
            print(f"   URL: {urls[0]}")


if __name__ == '__main__':
    main()