glam/extract_mexican_glams_v2.py
2025-11-19 23:25:22 +01:00

486 lines
20 KiB
Python

#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.
Version 2: Improved markdown parsing for structured institution lists.
Follows GLAM Data Extraction project specifications.
"""
import json
import re
from typing import List, Dict, Any, Tuple
from datetime import datetime, timezone
from collections import defaultdict
class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts.

    Parses markdown artifact text embedded in conversation JSON exports,
    pulling out institution names, types, locations and contact metadata,
    and deduplicating them by normalized name.
    """

    # Matches http(s) URLs up to the first whitespace or closing paren.
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')
    # Matches email addresses.  NOTE: the TLD class is [A-Za-z]{2,};
    # a literal '|' must not be admitted here.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    # Matches "Phone:/Tel:/Teléfono:" labels and captures the number text.
    PHONE_PATTERN = re.compile(r'(?:Phone|Tel|Teléfono):\s*([0-9\s\-\(\)]+)')
    # Matches "Address:" labels and captures the remainder of the line.
    ADDRESS_PATTERN = re.compile(r'Address:\s*([^\n]+)')

    # All 32 Mexican federal entities, uppercased for section-header matching.
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    # Common Mexican cities used for substring-based city detection.
    MEXICAN_CITIES = {
        'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca', 'Tijuana',
        'León', 'Ciudad Juárez', 'Zapopan', 'Mérida', 'Aguascalientes',
        'Querétaro', 'Morelia', 'Hermosillo', 'Saltillo', 'Mexicali',
        'Culiacán', 'Chihuahua', 'Oaxaca', 'Veracruz', 'Acapulco',
        'Cancún', 'Cuernavaca', 'Pachuca', 'Durango', 'Tepic',
        'Tuxtla Gutiérrez', 'Villahermosa', 'Campeche', 'Chetumal',
        'Zacatecas', 'Colima', 'Guanajuato', 'San Luis Potosí'
    }

    def __init__(self):
        # name_normalized -> institution record; preserves insertion order
        self.extracted_institutions = {}
        # Monotonic counter used to mint "mx-glam-NNNN" identifiers
        self.institution_id_counter = 1

    def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
        """Extract institutions from a single conversation JSON file.

        Returns a summary dict with the conversation id/name and the
        RUNNING total of institutions accumulated so far (not just this file).
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        conversation_id = data.get('uuid', '')
        conversation_name = data.get('name', '')
        # Extract from all artifact content in the conversation
        for msg in data.get('chat_messages', []):
            for content in msg.get('content', []):
                if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                    artifact_text = content.get('input', {}).get('content', '')
                    if artifact_text:
                        self._extract_from_artifact(artifact_text, conversation_id, conversation_name)
        return {
            'conversation_id': conversation_id,
            'conversation_name': conversation_name,
            'institutions_found': len(self.extracted_institutions)
        }

    def _extract_from_artifact(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions from artifact text using three strategies."""
        # Strategy 1: Extract from state-by-state sections
        self._extract_from_state_sections(text, conversation_id, conversation_name)
        # Strategy 2: Extract from national platforms section
        self._extract_national_platforms(text, conversation_id, conversation_name)
        # Strategy 3: Extract inline mentions with URLs
        self._extract_inline_mentions(text, conversation_id, conversation_name)

    def _extract_from_state_sections(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions from state-by-state directory sections."""
        # Find all state sections (### STATE_NAME)
        state_pattern = re.compile(r'###\s+([A-ZÁÉÍÓÚÑ\s]+)\n\n(.*?)(?=\n###\s+[A-ZÁÉÍÓÚÑ]|\Z)', re.DOTALL)
        for match in state_pattern.finditer(text):
            state_name = match.group(1).strip()
            section_content = match.group(2)
            # Only process if it's a valid Mexican state
            if state_name.upper() not in self.MEXICAN_STATES:
                continue
            # Extract institutions from bullet lists with bold names
            self._extract_from_bullet_lists(section_content, state_name, conversation_id, conversation_name)

    def _extract_from_bullet_lists(self, text: str, state: str, conversation_id: str, conversation_name: str):
        """Extract institutions from markdown bullet lists with bold names."""
        # Pattern: - **Institution Name**: description or - **Institution Name (ACRONYM)**: description
        # Also handles multi-line entries with nested bullets
        pattern = re.compile(r'^-\s+\*\*([^*]+?)\*\*:?\s*(.*?)(?=\n-\s+\*\*|\n\n|\Z)', re.MULTILINE | re.DOTALL)
        for match in pattern.finditer(text):
            institution_name = match.group(1).strip()
            description_block = match.group(2).strip()
            # Classify institution type
            inst_type = self._classify_institution_type(institution_name, description_block)
            if not inst_type:
                continue  # Skip if we can't determine type
            # Extract metadata from description block
            metadata = self._extract_metadata_from_block(description_block)
            # Add institution
            self._add_institution(
                name=institution_name,
                institution_type=inst_type,
                state=state,
                metadata=metadata,
                conversation_id=conversation_id,
                conversation_name=conversation_name
            )

    def _extract_national_platforms(self, text: str, conversation_id: str, conversation_name: str):
        """Extract national-level platforms and institutions."""
        # Look for "National" or "Federal" sections
        national_sections = re.finditer(
            r'##\s+(?:National|Federal|Core\s+National).*?\n(.*?)(?=\n##|\Z)',
            text, re.DOTALL | re.IGNORECASE
        )
        for section_match in national_sections:
            section_content = section_match.group(1)
            # National entries carry no state, hence the empty region
            self._extract_from_bullet_lists(section_content, '', conversation_id, conversation_name)

    def _extract_inline_mentions(self, text: str, conversation_id: str, conversation_name: str):
        """Extract institutions mentioned inline with URLs."""
        # Pattern 1: **Name**\n- **URL**: https://...
        url_pattern = re.compile(
            r'\*\*([^*]+?)\*\*\s*\n\s*-\s*\*\*URL\*\*:\s*(https?://[^\s\)]+)',
            re.MULTILINE
        )
        for match in url_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')
            # Grab a window around the match for richer metadata extraction
            context_start = max(0, match.start() - 200)
            context_end = min(len(text), match.end() + 500)
            context = text[context_start:context_end]
            metadata = self._extract_metadata_from_block(context)
            # The explicitly-labelled URL goes first
            metadata['urls'] = [url] + metadata.get('urls', [])
            inst_type = self._classify_institution_type(institution_name, context)
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata=metadata,
                    conversation_id=conversation_id,
                    conversation_name=conversation_name
                )
        # Pattern 2: Institution name at/URL/website: URL
        inline_pattern = re.compile(
            r'\b((?:Museo|Biblioteca|Archivo|Instituto|Centro|Sistema|Fonoteca|Mapoteca)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]+?)\s+(?:at|URL|website|portal):\s*(https?://[^\s\)]+)',
            re.IGNORECASE
        )
        for match in inline_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')
            inst_type = self._classify_institution_type(institution_name, '')
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata={'urls': [url]},
                    conversation_id=conversation_id,
                    conversation_name=conversation_name
                )

    def _classify_institution_type(self, name: str, description: str) -> str:
        """Classify institution type from keywords; '' when indeterminate."""
        name_lower = name.lower()
        desc_lower = description.lower()
        combined = name_lower + ' ' + desc_lower
        # Classification rules, most specific keyword sets first
        if any(term in name_lower for term in ['museo', 'museum']):
            return 'MUSEUM'
        elif any(term in name_lower for term in ['biblioteca', 'library', 'hemeroteca']):
            return 'LIBRARY'
        elif any(term in name_lower for term in ['archivo', 'archive', 'mapoteca']):
            return 'ARCHIVE'
        elif any(term in name_lower for term in ['galería', 'gallery']):
            return 'GALLERY'
        elif any(term in name_lower for term in ['instituto', 'institute', 'centro', 'center']):
            # An "instituto"/"centro" that mentions a museum is treated as one
            if 'museo' in combined or 'museum' in combined:
                return 'MUSEUM'
            else:
                return 'RESEARCH_CENTER'
        elif any(term in name_lower for term in ['secretaría', 'sistema', 'fonoteca', 'imcine', 'red nacional', 'mediateca', 'mexicana', 'memórica']):
            return 'OFFICIAL_INSTITUTION'
        return ''  # Unknown type

    def _extract_metadata_from_block(self, block: str) -> Dict[str, Any]:
        """Extract metadata (URLs, addresses, emails, phones) from description block."""
        metadata = {
            'urls': [],
            'emails': [],
            'phones': [],
            'address': '',
            'city': '',
            'description': '',
        }
        # Extract URLs (cap at 3, strip trailing punctuation)
        urls = self.URL_PATTERN.findall(block)
        metadata['urls'] = [url.rstrip('.,;)') for url in urls[:3]]
        # Extract emails (cap at 2)
        metadata['emails'] = self.EMAIL_PATTERN.findall(block)[:2]
        # Extract phones (cap at 2)
        phone_matches = self.PHONE_PATTERN.findall(block)
        metadata['phones'] = [p.strip() for p in phone_matches][:2]
        # Extract address
        address_match = self.ADDRESS_PATTERN.search(block)
        if address_match:
            metadata['address'] = address_match.group(1).strip().rstrip('.,;')
        # Extract city by substring match anywhere in the block
        block_lower = block.lower()  # hoisted: one lowercase pass, not one per city
        for city in self.MEXICAN_CITIES:
            if city.lower() in block_lower:
                metadata['city'] = city
                break
        # Extract description: first non-label line longer than 20 chars
        lines = block.split('\n')
        for line in lines:
            line = line.strip().lstrip('-').strip()
            if line and not any(line.startswith(prefix) for prefix in ['Address:', 'Phone:', 'Email:', 'URL:', 'Director:']):
                # Clean up the line
                desc = re.sub(r'\*\*', '', line)  # Remove bold markers
                desc = re.sub(r'\[.*?\]\(.*?\)', '', desc)  # Remove markdown links
                if len(desc) > 20:
                    metadata['description'] = desc[:300]
                    break
        return metadata

    def _add_institution(self, name: str, institution_type: str, state: str,
                         metadata: Dict[str, Any], conversation_id: str, conversation_name: str):
        """Add a new institution record, or merge into an existing duplicate."""
        # Normalize name for deduplication
        name_normalized = self._normalize_name(name)
        # Check if already exists
        if name_normalized in self.extracted_institutions:
            # Re-sighting an institution raises confidence (capped at 1.0)
            existing = self.extracted_institutions[name_normalized]
            existing['provenance']['confidence_score'] = min(1.0, existing['provenance']['confidence_score'] + 0.15)
            # Merge any not-yet-seen website URLs into identifiers
            if metadata.get('urls'):
                for url in metadata['urls']:
                    if not any(ident.get('identifier_value') == url for ident in existing['identifiers']):
                        existing['identifiers'].append({
                            'identifier_scheme': 'Website',
                            'identifier_value': url,
                            'identifier_url': url,
                        })
            # Fill in a location only if none was known before
            if state and not existing['locations']:
                existing['locations'].append({'city': metadata.get('city', ''), 'region': state, 'country': 'MX'})
            return
        # Create new record
        institution_id = f"mx-glam-{self.institution_id_counter:04d}"
        self.institution_id_counter += 1
        record = {
            'id': institution_id,
            'name': name,
            'name_normalized': name_normalized,
            'institution_type': institution_type,
            'alternative_names': [],
            'description': metadata.get('description', ''),
            'locations': [],
            'identifiers': [],
            'digital_platforms': [],
            'provenance': {
                'data_source': 'CONVERSATION_NLP',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Markdown parsing + pattern matching from Mexican GLAM conversations',
                'confidence_score': self._calculate_confidence(name, metadata),
                'conversation_id': conversation_id,
                'source_url': None,
            }
        }
        # Add location if available
        if metadata.get('city') or state:
            location = {
                'city': metadata.get('city', ''),
                'region': state,
                'country': 'MX',
            }
            if metadata.get('address'):
                location['street_address'] = metadata['address']
            record['locations'].append(location)
        # Add identifiers
        for url in metadata.get('urls', []):
            record['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': url,
                'identifier_url': url,
            })
        for email in metadata.get('emails', []):
            record['identifiers'].append({
                'identifier_scheme': 'Email',
                'identifier_value': email,
            })
        self.extracted_institutions[name_normalized] = record

    def _normalize_name(self, name: str) -> str:
        """Normalize institution name for deduplication."""
        # Remove acronyms in parentheses, e.g. "(INAH)"
        name = re.sub(r'\s*\([A-Z]+\)\s*', ' ', name)
        # Collapse whitespace, lowercase
        normalized = re.sub(r'\s+', ' ', name.strip().lower())
        return normalized

    def _calculate_confidence(self, name: str, metadata: Dict[str, Any]) -> float:
        """Calculate confidence score (0.7 base + bonuses, capped at 1.0)."""
        confidence = 0.7  # Base confidence for structured extraction
        # Increase confidence based on available metadata
        if metadata.get('urls'):
            confidence += 0.15
        if metadata.get('address'):
            confidence += 0.1
        if metadata.get('description'):
            confidence += 0.05
        return min(1.0, confidence)

    def get_results(self) -> Dict[str, Any]:
        """Get extraction results with statistics."""
        institutions_list = list(self.extracted_institutions.values())
        # Calculate statistics
        type_counts = defaultdict(int)
        state_counts = defaultdict(int)
        for inst in institutions_list:
            type_counts[inst['institution_type']] += 1
            for loc in inst.get('locations', []):
                if loc.get('region'):
                    state_counts[loc['region']] += 1
        return {
            'total_institutions': len(institutions_list),
            'institutions': institutions_list,
            'statistics': {
                'by_type': dict(type_counts),
                'by_state': dict(state_counts),
                'with_urls': sum(1 for i in institutions_list if any(ident.get('identifier_scheme') == 'Website' for ident in i.get('identifiers', []))),
                'with_locations': sum(1 for i in institutions_list if i.get('locations')),
                'with_descriptions': sum(1 for i in institutions_list if i.get('description')),
            },
            'extraction_metadata': {
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'data_tier': 'TIER_4_INFERRED',
                'data_source': 'CONVERSATION_NLP',
            }
        }
def main():
    """Main extraction workflow.

    Processes the two hard-coded conversation JSON exports, prints
    extraction statistics, writes the combined results to
    'mexican_glam_extracted.json', and prints a sample of records.
    """
    print("Mexican GLAM Institution Extractor v2")
    print("=" * 60)
    extractor = MexicanGLAMExtractor()
    # Process both conversation files
    files = [
        'mexican_glam_1.json',
        'mexican_glam_2.json'
    ]
    for filepath in files:
        print(f"\nProcessing: {filepath}")
        result = extractor.extract_from_conversation_file(filepath)
        print(f"  Conversation: {result['conversation_name']}")
        print(f"  UUID: {result['conversation_id']}")
        # 'institutions_found' is the cumulative count across files
        print(f"  Running total: {result['institutions_found']} institutions")
    # Get final results
    results = extractor.get_results()
    # Print statistics
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")
    print(f"\nBy institution type:")
    for inst_type, count in sorted(results['statistics']['by_type'].items(), key=lambda x: x[1], reverse=True):
        print(f"  {inst_type}: {count}")
    print(f"\nTop 15 states by institution count:")
    state_counts = results['statistics']['by_state']
    for state, count in sorted(state_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
        print(f"  {state}: {count}")
    print(f"\nMetadata completeness:")
    print(f"  With URLs: {results['statistics']['with_urls']}")
    print(f"  With locations: {results['statistics']['with_locations']}")
    print(f"  With descriptions: {results['statistics']['with_descriptions']}")
    # Save to JSON (keep accented characters readable)
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"\nResults saved to: {output_file}")
    # Show sample institutions
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 15):")
    print("=" * 60)
    for i, inst in enumerate(results['institutions'][:15], 1):
        print(f"\n{i}. {inst['name']}")
        print(f"   Type: {inst['institution_type']}")
        print(f"   Confidence: {inst['provenance']['confidence_score']:.2f}")
        if inst.get('locations'):
            loc = inst['locations'][0]
            city_state = f"{loc.get('city', '')}, {loc.get('region', '')}".strip(', ')
            if city_state:
                print(f"   Location: {city_state}")
        if inst.get('description'):
            desc = inst['description'][:100] + '...' if len(inst['description']) > 100 else inst['description']
            print(f"   Description: {desc}")
        urls = [ident['identifier_value'] for ident in inst.get('identifiers', []) if ident['identifier_scheme'] == 'Website']
        if urls:
            print(f"   URL: {urls[0]}")


if __name__ == '__main__':
    main()