# glam/extract_mexican_glams.py
# Retrieved 2025-11-19 23:25:22 +01:00 — 451 lines, 19 KiB, Python
#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.
Follows GLAM Data Extraction project specifications.
"""
import json
import re
from typing import List, Dict, Any, Set
from datetime import datetime, timezone
from collections import defaultdict
class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts."""

    # Comprehensive institution type patterns - capture up to natural boundaries.
    # Each value is a list of regexes; the lookaheads stop the capture at
    # sentence punctuation or at common English connector/verb words so the
    # match does not swallow the rest of the sentence.
    INSTITUTION_PATTERNS = {
        'MUSEUM': [
            r'Museo\s+Nacional\s+de\s+(?:Antropología|Arte|Historia|Culturas\s+Populares)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides|offers|has|was|is))',
            r'Museo\s+Regional\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|INAH|\())',
            r'Museo\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]{3,50}?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides))',
            r'Casa\s+Museo\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'LIBRARY': [
            r'Biblioteca\s+Nacional\s+de\s+(?:México|Antropología)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Biblioteca\s+(?:Central|Pública)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Hemeroteca\s+Nacional\s+(?:Digital\s+)?de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Red\s+Nacional\s+de\s+Bibliotecas\s+Públicas',
        ],
        'ARCHIVE': [
            r'Archivo\s+General\s+de\s+(?:la\s+Nación|[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?)(?=\s*[,.\n]|\s+(?:in|at|located|holds|manages))',
            r'Archivo\s+(?:Histórico|Municipal|del\s+Estado)\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'GALLERY': [
            r'Galería\s+(?:de\s+Arte\s+|Nacional\s+)?[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'RESEARCH_CENTER': [
            r'Instituto\s+Nacional\s+de\s+Antropología\s+e\s+Historia(?:\s+\(INAH\))?',
            r'Instituto\s+Nacional\s+de\s+Bellas\s+Artes\s+y\s+Literatura(?:\s+\(INBAL\))?',
            r'Instituto\s+Nacional\s+de\s+Lenguas\s+Indígenas(?:\s+\(INALI\))?',
            r'Instituto\s+Nacional\s+de\s+Estudios\s+Históricos(?:\s+\(INEHRM\))?',
        ],
        'OFFICIAL_INSTITUTION': [
            r'Secretaría\s+de\s+Cultura',
            r'Sistema\s+de\s+Información\s+Cultural(?:\s+\(SIC\))?',
            r'Fonoteca\s+Nacional',
            r'IMCINE',
        ],
    }

    # URL patterns: capture until whitespace or a closing parenthesis.
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')

    # Email patterns.
    # FIX: the original TLD class was [A-Z|a-z], which makes '|' a literal
    # character inside the class and let garbage such as "x@y.z|q" match.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Mexican states (upper-case) for location matching in section headers.
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    def __init__(self):
        # Maps normalized name -> institution record (used for deduplication).
        self.extracted_institutions = {}
        # Sequential counter used to mint ids such as "mx-glam-0001".
        self.institution_id_counter = 1
def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
"""Extract institutions from a single conversation JSON file."""
with open(filepath, 'r', encoding='utf-8') as f:
data = json.load(f)
conversation_id = data.get('uuid', '')
conversation_name = data.get('name', '')
# Extract from all artifact content in the conversation
for msg in data.get('chat_messages', []):
for content in msg.get('content', []):
if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
artifact_text = content.get('input', {}).get('content', '')
if artifact_text:
self._extract_from_text(artifact_text, conversation_id, conversation_name)
return {
'conversation_id': conversation_id,
'conversation_name': conversation_name,
'institutions_found': len(self.extracted_institutions)
}
def _extract_from_text(self, text: str, conversation_id: str, conversation_name: str):
"""Extract institutions from artifact text content."""
# Split text into sections for better context tracking
sections = self._split_into_sections(text)
for section in sections:
# Extract state context from section header
state = self._extract_state_from_section(section)
# Extract institutions using pattern matching
for inst_type, patterns in self.INSTITUTION_PATTERNS.items():
for pattern in patterns:
matches = re.finditer(pattern, section, re.IGNORECASE)
for match in matches:
institution_name = self._clean_institution_name(match.group(0).strip())
# Skip if it's just a generic term
if self._is_generic_term(institution_name):
continue
# Extract context around the match
context = self._get_context(section, match.start(), match.end())
# Create or update institution record
self._add_institution(
name=institution_name,
institution_type=inst_type,
state=state,
context=context,
conversation_id=conversation_id,
conversation_name=conversation_name
)
def _split_into_sections(self, text: str) -> List[str]:
"""Split text into logical sections based on headers."""
# Split on ### headers or state names in ALL CAPS
section_pattern = re.compile(r'(?:^|\n)(?:###?\s+[A-ZÁÉÍÓÚÑ]|[A-ZÁÉÍÓÚÑ]{5,})', re.MULTILINE)
sections = section_pattern.split(text)
return [s for s in sections if len(s.strip()) > 50] # Filter out tiny sections
def _extract_state_from_section(self, section: str) -> str:
"""Extract Mexican state name from section text."""
# Look for state names in first 200 characters of section
header = section[:200].upper()
for state in self.MEXICAN_STATES:
if state in header:
return state
return ""
def _is_generic_term(self, name: str) -> bool:
"""Check if extracted name is just a generic term."""
generic_terms = [
'Museo de', 'Museo del', 'Museo Nacional', 'Biblioteca de',
'Archivo de', 'Instituto Nacional', 'INAH', 'INBAL',
'Museo Regional', 'Biblioteca Nacional', 'Archivo General'
]
# Must be longer than just the generic prefix
name_clean = name.strip()
if len(name_clean) < 15: # Too short to be a full institution name
return True
return False
def _clean_institution_name(self, name: str) -> str:
"""Clean institution name by removing sentence fragments."""
# Remove trailing verbs and common sentence starters
stop_words = [
r'\s+stands?\s+as\b.*',
r'\s+operates?\s+.*',
r'\s+provides?\s+.*',
r'\s+offers?\s+.*',
r'\s+manages?\s+.*',
r'\s+holds?\s+.*',
r'\s+contains?\s+.*',
r'\s+includes?\s+.*',
r'\s+features?\s+.*',
r'\s+was\s+.*',
r'\s+is\s+.*',
r'\s+has\s+.*',
r'\s+serves?\s+.*',
r'\s+at\s+.*',
r'\s+in\s+.*',
r'\s+located\s+.*',
]
cleaned = name
for pattern in stop_words:
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
# Remove trailing punctuation and whitespace
cleaned = re.sub(r'[,.\s]+$', '', cleaned)
return cleaned.strip()
def _get_context(self, text: str, start: int, end: int, window: int = 500) -> str:
"""Extract context around a match for additional metadata."""
context_start = max(0, start - window)
context_end = min(len(text), end + window)
return text[context_start:context_end]
def _add_institution(self, name: str, institution_type: str, state: str,
context: str, conversation_id: str, conversation_name: str):
"""Add or update an institution record."""
# Normalize name for deduplication
name_normalized = self._normalize_name(name)
# Check if already exists
if name_normalized in self.extracted_institutions:
# Update existing record with additional context
existing = self.extracted_institutions[name_normalized]
existing['provenance']['confidence_score'] = min(1.0, existing['provenance']['confidence_score'] + 0.1)
if state and not existing['locations']:
if state:
existing['locations'].append({'city': '', 'region': state, 'country': 'MX'})
return
# Extract metadata from context
urls = self._extract_urls(context)
emails = self._extract_emails(context)
address = self._extract_address(context)
city = self._extract_city(context)
description = self._extract_description(context, name)
# Create new record
institution_id = f"mx-glam-{self.institution_id_counter:04d}"
self.institution_id_counter += 1
record = {
'id': institution_id,
'name': name,
'name_normalized': name_normalized,
'institution_type': institution_type,
'alternative_names': [],
'description': description,
'locations': [],
'identifiers': [],
'digital_platforms': [],
'provenance': {
'data_source': 'CONVERSATION_NLP',
'data_tier': 'TIER_4_INFERRED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Pattern-based NER from Mexican GLAM conversations',
'confidence_score': self._calculate_confidence(name, context, urls),
'conversation_id': conversation_id,
'source_url': None,
}
}
# Add location if available
if city or state:
location = {
'city': city,
'region': state,
'country': 'MX',
}
if address:
location['street_address'] = address
record['locations'].append(location)
# Add identifiers (URLs, emails)
for url in urls:
record['identifiers'].append({
'identifier_scheme': 'Website',
'identifier_value': url,
'identifier_url': url,
})
if emails:
record['identifiers'].append({
'identifier_scheme': 'Email',
'identifier_value': emails[0], # Take first email
})
self.extracted_institutions[name_normalized] = record
def _normalize_name(self, name: str) -> str:
"""Normalize institution name for deduplication."""
# Remove extra whitespace, lowercase, remove punctuation
normalized = re.sub(r'\s+', ' ', name.strip().lower())
normalized = re.sub(r'[^\w\s]', '', normalized)
return normalized
def _extract_urls(self, context: str) -> List[str]:
"""Extract URLs from context."""
urls = self.URL_PATTERN.findall(context)
return [url.rstrip('.,;)') for url in urls[:3]] # Take up to 3 URLs
def _extract_emails(self, context: str) -> List[str]:
"""Extract email addresses from context."""
return self.EMAIL_PATTERN.findall(context)[:2] # Take up to 2 emails
def _extract_address(self, context: str) -> str:
"""Extract street address from context."""
# Look for "Address:" or address patterns
address_match = re.search(r'(?:Address|Dirección):\s*([^\n]+)', context, re.IGNORECASE)
if address_match:
return address_match.group(1).strip()
# Look for Mexican address patterns (street number, colony, postal code)
address_match = re.search(r'([A-Z][a-záéíóúñ]+\s+\d+[^,\n]+,?\s*\d{5})', context)
if address_match:
return address_match.group(1).strip()
return ""
def _extract_city(self, context: str) -> str:
"""Extract city name from context."""
# Common Mexican cities
cities = [
'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca',
'Tijuana', 'León', 'Ciudad Juárez', 'Zapopan', 'Mérida',
'Aguascalientes', 'Querétaro', 'Morelia', 'Hermosillo',
'Saltillo', 'Mexicali', 'Culiacán', 'Chihuahua', 'Oaxaca',
'Veracruz', 'Acapulco', 'Cancún', 'Cuernavaca', 'Pachuca',
'Durango', 'Tepic', 'Tuxtla Gutiérrez', 'Villahermosa',
'Campeche', 'Chetumal', 'Zacatecas', 'Colima', 'Guanajuato',
'Haarlem', # In case of Dutch crossover
]
context_lower = context.lower()
for city in cities:
if city.lower() in context_lower:
return city
return ""
def _extract_description(self, context: str, institution_name: str) -> str:
"""Extract description from context around institution mention."""
# Look for sentences containing the institution name
sentences = re.split(r'[.!?]\s+', context)
relevant_sentences = []
for sentence in sentences:
if institution_name[:20] in sentence: # Match on first 20 chars
relevant_sentences.append(sentence.strip())
if relevant_sentences:
return '. '.join(relevant_sentences[:2]) # Take up to 2 sentences
return ""
def _calculate_confidence(self, name: str, context: str, urls: List[str]) -> float:
"""Calculate confidence score for extraction."""
confidence = 0.6 # Base confidence for pattern match
# Increase confidence based on available metadata
if urls:
confidence += 0.15
if len(context) > 300:
confidence += 0.1
if len(name) > 25: # Longer, more specific name
confidence += 0.1
if 'museo' in name.lower() or 'biblioteca' in name.lower() or 'archivo' in name.lower():
confidence += 0.05
return min(1.0, confidence)
def get_results(self) -> Dict[str, Any]:
"""Get extraction results with statistics."""
institutions_list = list(self.extracted_institutions.values())
# Calculate statistics
type_counts = defaultdict(int)
state_counts = defaultdict(int)
for inst in institutions_list:
type_counts[inst['institution_type']] += 1
for loc in inst.get('locations', []):
if loc.get('region'):
state_counts[loc['region']] += 1
return {
'total_institutions': len(institutions_list),
'institutions': institutions_list,
'statistics': {
'by_type': dict(type_counts),
'by_state': dict(state_counts),
'with_urls': sum(1 for i in institutions_list if i.get('identifiers')),
'with_locations': sum(1 for i in institutions_list if i.get('locations')),
},
'extraction_metadata': {
'extraction_date': datetime.now(timezone.utc).isoformat(),
'data_tier': 'TIER_4_INFERRED',
'data_source': 'CONVERSATION_NLP',
}
}
def main():
    """Main extraction workflow: process both files, report, and save JSON."""
    print("Mexican GLAM Institution Extractor")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # The two conversation exports this script is hard-wired to process.
    source_files = ['mexican_glam_1.json', 'mexican_glam_2.json']
    for path in source_files:
        print(f"\nProcessing: {path}")
        summary = extractor.extract_from_conversation_file(path)
        print(f" Conversation: {summary['conversation_name']}")
        print(f" UUID: {summary['conversation_id']}")
        print(f" Running total: {summary['institutions_found']} institutions")

    results = extractor.get_results()

    # Report aggregate statistics.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")
    print(f"\nBy institution type:")
    for kind, total in sorted(results['statistics']['by_type'].items()):
        print(f" {kind}: {total}")
    print(f"\nTop 10 states by institution count:")
    ranked = sorted(results['statistics']['by_state'].items(),
                    key=lambda item: item[1], reverse=True)
    for region, total in ranked[:10]:
        print(f" {region}: {total}")
    print(f"\nMetadata completeness:")
    print(f" With URLs: {results['statistics']['with_urls']}")
    print(f" With locations: {results['statistics']['with_locations']}")

    # Persist the full result set.
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(results, sink, ensure_ascii=False, indent=2)
    print(f"\nResults saved to: {output_file}")

    # Spot-check: show the first few extracted records.
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 10):")
    print("=" * 60)
    for index, record in enumerate(results['institutions'][:10], 1):
        print(f"\n{index}. {record['name']}")
        print(f" Type: {record['institution_type']}")
        print(f" Confidence: {record['provenance']['confidence_score']:.2f}")
        if record.get('locations'):
            place = record['locations'][0]
            print(f" Location: {place.get('city', '')}, {place.get('region', '')}")
        if record.get('identifiers'):
            sites = [entry['identifier_value'] for entry in record['identifiers']
                     if entry['identifier_scheme'] == 'Website']
            if sites:
                print(f" URL: {sites[0]}")


if __name__ == '__main__':
    main()