#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.
Follows GLAM Data Extraction project specifications.
"""
import json
import re
from typing import List, Dict, Any, Set
from datetime import datetime, timezone
from collections import defaultdict


class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts.

    Usage: call ``extract_from_conversation_file`` for each conversation JSON
    file, then ``get_results`` for the merged, deduplicated records.
    """

    # Comprehensive institution type patterns - capture up to natural boundaries
    # (lookaheads stop the lazy name capture at punctuation or English verbs).
    INSTITUTION_PATTERNS = {
        'MUSEUM': [
            r'Museo\s+Nacional\s+de\s+(?:Antropología|Arte|Historia|Culturas\s+Populares)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides|offers|has|was|is))',
            r'Museo\s+Regional\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|INAH|\())',
            r'Museo\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]{3,50}?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides))',
            r'Casa\s+Museo\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'LIBRARY': [
            r'Biblioteca\s+Nacional\s+de\s+(?:México|Antropología)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Biblioteca\s+(?:Central|Pública)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Hemeroteca\s+Nacional\s+(?:Digital\s+)?de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Red\s+Nacional\s+de\s+Bibliotecas\s+Públicas',
        ],
        'ARCHIVE': [
            r'Archivo\s+General\s+de\s+(?:la\s+Nación|[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?)(?=\s*[,.\n]|\s+(?:in|at|located|holds|manages))',
            r'Archivo\s+(?:Histórico|Municipal|del\s+Estado)\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'GALLERY': [
            r'Galería\s+(?:de\s+Arte\s+|Nacional\s+)?[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'RESEARCH_CENTER': [
            r'Instituto\s+Nacional\s+de\s+Antropología\s+e\s+Historia(?:\s+\(INAH\))?',
            r'Instituto\s+Nacional\s+de\s+Bellas\s+Artes\s+y\s+Literatura(?:\s+\(INBAL\))?',
            r'Instituto\s+Nacional\s+de\s+Lenguas\s+Indígenas(?:\s+\(INALI\))?',
            r'Instituto\s+Nacional\s+de\s+Estudios\s+Históricos(?:\s+\(INEHRM\))?',
        ],
        'OFFICIAL_INSTITUTION': [
            r'Secretaría\s+de\s+Cultura',
            r'Sistema\s+de\s+Información\s+Cultural(?:\s+\(SIC\))?',
            r'Fonoteca\s+Nacional',
            r'IMCINE',
        ],
    }

    # URL pattern: stop at whitespace or a closing parenthesis.
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')

    # Email pattern. FIX: the original TLD class was '[A-Z|a-z]', which also
    # matched a literal '|' character inside the top-level domain.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Mexican states (upper-cased) for location matching against section headers.
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    def __init__(self):
        # name_normalized -> institution record; shared across all input files
        # so repeated mentions deduplicate and raise confidence.
        self.extracted_institutions = {}
        self.institution_id_counter = 1

    def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
        """Extract institutions from a single conversation JSON file.

        Scans every 'artifacts' tool_use content block in the conversation's
        chat messages. Returns a small summary dict; full records accumulate
        on the instance (see ``get_results``).
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        conversation_id = data.get('uuid', '')
        conversation_name = data.get('name', '')

        # Extract from all artifact content in the conversation
        for msg in data.get('chat_messages', []):
            for content in msg.get('content', []):
                if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                    artifact_text = content.get('input', {}).get('content', '')
                    if artifact_text:
                        self._extract_from_text(artifact_text, conversation_id, conversation_name)

        return {
            'conversation_id': conversation_id,
            'conversation_name': conversation_name,
            'institutions_found': len(self.extracted_institutions)
        }

    def _extract_from_text(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions from artifact text content.

        Splits the text into header-delimited sections (for state context),
        runs every institution-type regex over each section, and records each
        non-generic match with surrounding context.
        """
        sections = self._split_into_sections(text)

        for section in sections:
            # Extract state context from section header
            state = self._extract_state_from_section(section)

            for inst_type, patterns in self.INSTITUTION_PATTERNS.items():
                for pattern in patterns:
                    for match in re.finditer(pattern, section, re.IGNORECASE):
                        institution_name = self._clean_institution_name(match.group(0).strip())

                        # Skip if it's just a generic term
                        if self._is_generic_term(institution_name):
                            continue

                        # Context around the match drives metadata extraction
                        context = self._get_context(section, match.start(), match.end())

                        self._add_institution(
                            name=institution_name,
                            institution_type=inst_type,
                            state=state,
                            context=context,
                            conversation_id=conversation_id,
                            conversation_name=conversation_name
                        )

    def _split_into_sections(self, text: str) -> List[str]:
        """Split text into logical sections based on headers.

        FIX: the original pattern *consumed* the leading header characters
        ('### X' or the first ALL-CAPS word), so the state name was stripped
        from the section and ``_extract_state_from_section`` could not find
        it. A lookahead keeps the header text inside its section.
        """
        section_pattern = re.compile(
            r'(?:^|\n)(?=###?\s+[A-ZÁÉÍÓÚÑ]|[A-ZÁÉÍÓÚÑ]{5,})')
        sections = section_pattern.split(text)
        return [s for s in sections if len(s.strip()) > 50]  # Filter out tiny sections

    def _extract_state_from_section(self, section: str) -> str:
        """Extract a Mexican state name from the section header, or ''."""
        # Look for state names in first 200 characters of section header
        header = section[:200].upper()
        for state in self.MEXICAN_STATES:
            if state in header:
                return state
        return ""

    def _is_generic_term(self, name: str) -> bool:
        """Check if extracted name is just a generic term.

        FIX: the original built ``generic_terms`` but never consulted it —
        only the length check ran. A bare generic prefix now also rejects.
        """
        generic_terms = {
            'Museo de', 'Museo del', 'Museo Nacional', 'Biblioteca de',
            'Archivo de', 'Instituto Nacional', 'INAH', 'INBAL',
            'Museo Regional', 'Biblioteca Nacional', 'Archivo General',
        }
        name_clean = name.strip()
        if len(name_clean) < 15:  # Too short to be a full institution name
            return True
        return name_clean in generic_terms

    def _clean_institution_name(self, name: str) -> str:
        """Clean institution name by removing trailing sentence fragments."""
        # Remove trailing verbs and common sentence starters
        stop_words = [
            r'\s+stands?\s+as\b.*', r'\s+operates?\s+.*', r'\s+provides?\s+.*',
            r'\s+offers?\s+.*', r'\s+manages?\s+.*', r'\s+holds?\s+.*',
            r'\s+contains?\s+.*', r'\s+includes?\s+.*', r'\s+features?\s+.*',
            r'\s+was\s+.*', r'\s+is\s+.*', r'\s+has\s+.*', r'\s+serves?\s+.*',
            r'\s+at\s+.*', r'\s+in\s+.*', r'\s+located\s+.*',
        ]
        cleaned = name
        for pattern in stop_words:
            cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
        # Remove trailing punctuation and whitespace
        cleaned = re.sub(r'[,.\s]+$', '', cleaned)
        return cleaned.strip()

    def _get_context(self, text: str, start: int, end: int, window: int = 500) -> str:
        """Extract up to ``window`` chars on each side of a match."""
        context_start = max(0, start - window)
        context_end = min(len(text), end + window)
        return text[context_start:context_end]

    def _add_institution(self, name: str, institution_type: str, state: str,
                         context: str, conversation_id: str, conversation_name: str):
        """Add a new institution record, or merge into an existing one.

        Repeat sightings bump the confidence score (capped at 1.0) and may
        back-fill a missing location.
        """
        name_normalized = self._normalize_name(name)

        if name_normalized in self.extracted_institutions:
            existing = self.extracted_institutions[name_normalized]
            existing['provenance']['confidence_score'] = min(
                1.0, existing['provenance']['confidence_score'] + 0.1)
            # FIX: original had a redundant nested `if state:` here.
            if state and not existing['locations']:
                existing['locations'].append({'city': '', 'region': state, 'country': 'MX'})
            return

        # Extract metadata from context
        urls = self._extract_urls(context)
        emails = self._extract_emails(context)
        address = self._extract_address(context)
        city = self._extract_city(context)
        description = self._extract_description(context, name)

        institution_id = f"mx-glam-{self.institution_id_counter:04d}"
        self.institution_id_counter += 1

        record = {
            'id': institution_id,
            'name': name,
            'name_normalized': name_normalized,
            'institution_type': institution_type,
            'alternative_names': [],
            'description': description,
            'locations': [],
            'identifiers': [],
            'digital_platforms': [],
            'provenance': {
                'data_source': 'CONVERSATION_NLP',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Pattern-based NER from Mexican GLAM conversations',
                'confidence_score': self._calculate_confidence(name, context, urls),
                'conversation_id': conversation_id,
                'source_url': None,
            }
        }

        # Add location if available
        if city or state:
            location = {
                'city': city,
                'region': state,
                'country': 'MX',
            }
            if address:
                location['street_address'] = address
            record['locations'].append(location)

        # Add identifiers (URLs, emails)
        for url in urls:
            record['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': url,
                'identifier_url': url,
            })
        if emails:
            record['identifiers'].append({
                'identifier_scheme': 'Email',
                'identifier_value': emails[0],  # Take first email
            })

        self.extracted_institutions[name_normalized] = record

    def _normalize_name(self, name: str) -> str:
        """Normalize institution name for deduplication (lowercase, collapse
        whitespace, strip punctuation)."""
        normalized = re.sub(r'\s+', ' ', name.strip().lower())
        normalized = re.sub(r'[^\w\s]', '', normalized)
        return normalized

    def _extract_urls(self, context: str) -> List[str]:
        """Extract up to 3 URLs from context, trimming trailing punctuation."""
        urls = self.URL_PATTERN.findall(context)
        return [url.rstrip('.,;)') for url in urls[:3]]

    def _extract_emails(self, context: str) -> List[str]:
        """Extract up to 2 email addresses from context."""
        return self.EMAIL_PATTERN.findall(context)[:2]

    def _extract_address(self, context: str) -> str:
        """Extract a street address from context, or ''."""
        # Look for "Address:" or "Dirección:" labels first
        address_match = re.search(r'(?:Address|Dirección):\s*([^\n]+)', context, re.IGNORECASE)
        if address_match:
            return address_match.group(1).strip()
        # Look for Mexican address patterns (street number, colony, postal code)
        address_match = re.search(r'([A-Z][a-záéíóúñ]+\s+\d+[^,\n]+,?\s*\d{5})', context)
        if address_match:
            return address_match.group(1).strip()
        return ""

    def _extract_city(self, context: str) -> str:
        """Extract the first known Mexican city mentioned in context, or ''."""
        cities = [
            'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca', 'Tijuana',
            'León', 'Ciudad Juárez', 'Zapopan', 'Mérida', 'Aguascalientes',
            'Querétaro', 'Morelia', 'Hermosillo', 'Saltillo', 'Mexicali',
            'Culiacán', 'Chihuahua', 'Oaxaca', 'Veracruz', 'Acapulco', 'Cancún',
            'Cuernavaca', 'Pachuca', 'Durango', 'Tepic', 'Tuxtla Gutiérrez',
            'Villahermosa', 'Campeche', 'Chetumal', 'Zacatecas', 'Colima',
            'Guanajuato',
            'Haarlem',  # NOTE(review): not a Mexican city ("Dutch crossover") — confirm it's intentional
        ]
        context_lower = context.lower()
        for city in cities:
            if city.lower() in context_lower:
                return city
        return ""

    def _extract_description(self, context: str, institution_name: str) -> str:
        """Return up to two context sentences mentioning the institution."""
        sentences = re.split(r'[.!?]\s+', context)
        relevant_sentences = []
        for sentence in sentences:
            if institution_name[:20] in sentence:  # Match on first 20 chars
                relevant_sentences.append(sentence.strip())
        if relevant_sentences:
            return '. '.join(relevant_sentences[:2])  # Take up to 2 sentences
        return ""

    def _calculate_confidence(self, name: str, context: str, urls: List[str]) -> float:
        """Calculate a heuristic confidence score in [0.6, 1.0]."""
        confidence = 0.6  # Base confidence for pattern match
        if urls:
            confidence += 0.15
        if len(context) > 300:
            confidence += 0.1
        if len(name) > 25:  # Longer, more specific name
            confidence += 0.1
        if 'museo' in name.lower() or 'biblioteca' in name.lower() or 'archivo' in name.lower():
            confidence += 0.05
        return min(1.0, confidence)

    def get_results(self) -> Dict[str, Any]:
        """Get extraction results with per-type/per-state statistics."""
        institutions_list = list(self.extracted_institutions.values())

        type_counts = defaultdict(int)
        state_counts = defaultdict(int)
        for inst in institutions_list:
            type_counts[inst['institution_type']] += 1
            for loc in inst.get('locations', []):
                if loc.get('region'):
                    state_counts[loc['region']] += 1

        return {
            'total_institutions': len(institutions_list),
            'institutions': institutions_list,
            'statistics': {
                'by_type': dict(type_counts),
                'by_state': dict(state_counts),
                'with_urls': sum(1 for i in institutions_list if i.get('identifiers')),
                'with_locations': sum(1 for i in institutions_list if i.get('locations')),
            },
            'extraction_metadata': {
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'data_tier': 'TIER_4_INFERRED',
                'data_source': 'CONVERSATION_NLP',
            }
        }


def main():
    """Main extraction workflow: process both files, print stats, save JSON."""
    print("Mexican GLAM Institution Extractor")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # Process both conversation files
    files = [
        'mexican_glam_1.json',
        'mexican_glam_2.json'
    ]

    for filepath in files:
        print(f"\nProcessing: {filepath}")
        result = extractor.extract_from_conversation_file(filepath)
        print(f"  Conversation: {result['conversation_name']}")
        print(f"  UUID: {result['conversation_id']}")
        print(f"  Running total: {result['institutions_found']} institutions")

    results = extractor.get_results()

    # Print statistics
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")
    print(f"\nBy institution type:")
    for inst_type, count in sorted(results['statistics']['by_type'].items()):
        print(f"  {inst_type}: {count}")
    print(f"\nTop 10 states by institution count:")
    state_counts = results['statistics']['by_state']
    for state, count in sorted(state_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"  {state}: {count}")
    print(f"\nMetadata completeness:")
    print(f"  With URLs: {results['statistics']['with_urls']}")
    print(f"  With locations: {results['statistics']['with_locations']}")

    # Save to JSON
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to: {output_file}")

    # Show sample institutions
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 10):")
    print("=" * 60)
    for i, inst in enumerate(results['institutions'][:10], 1):
        print(f"\n{i}. {inst['name']}")
        print(f"   Type: {inst['institution_type']}")
        print(f"   Confidence: {inst['provenance']['confidence_score']:.2f}")
        if inst.get('locations'):
            loc = inst['locations'][0]
            print(f"   Location: {loc.get('city', '')}, {loc.get('region', '')}")
        if inst.get('identifiers'):
            # FIX: original comprehension shadowed the builtin `id`
            urls = [ident['identifier_value'] for ident in inst['identifiers']
                    if ident['identifier_scheme'] == 'Website']
            if urls:
                print(f"   URL: {urls[0]}")


if __name__ == '__main__':
    main()