#!/usr/bin/env python3
|
|
"""
|
|
Extract Mexican GLAM institutions from conversation JSON files.
|
|
Follows GLAM Data Extraction project specifications.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from typing import List, Dict, Any, Set
|
|
from datetime import datetime, timezone
|
|
from collections import defaultdict
|
|
|
|
|
|
class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts.

    Holds the pattern tables (institution-name regexes, URL/email regexes,
    state gazetteer) used by the extraction methods.
    """

    # Comprehensive institution type patterns - capture up to natural boundaries.
    # Each pattern ends on a lookahead so the match stops at punctuation or a
    # following English connective instead of swallowing the sentence.
    INSTITUTION_PATTERNS = {
        'MUSEUM': [
            r'Museo\s+Nacional\s+de\s+(?:Antropología|Arte|Historia|Culturas\s+Populares)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides|offers|has|was|is))',
            r'Museo\s+Regional\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located|INAH|\())',
            r'Museo\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]{3,50}?(?=\s*[,.\n]|\s+(?:in|at|located|operates|provides))',
            r'Casa\s+Museo\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'LIBRARY': [
            r'Biblioteca\s+Nacional\s+de\s+(?:México|Antropología)[A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Biblioteca\s+(?:Central|Pública)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Hemeroteca\s+Nacional\s+(?:Digital\s+)?de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
            r'Red\s+Nacional\s+de\s+Bibliotecas\s+Públicas',
        ],
        'ARCHIVE': [
            r'Archivo\s+General\s+de\s+(?:la\s+Nación|[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?)(?=\s*[,.\n]|\s+(?:in|at|located|holds|manages))',
            r'Archivo\s+(?:Histórico|Municipal|del\s+Estado)\s+de\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'GALLERY': [
            r'Galería\s+(?:de\s+Arte\s+|Nacional\s+)?[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]*?(?=\s*[,.\n]|\s+(?:in|at|located))',
        ],
        'RESEARCH_CENTER': [
            r'Instituto\s+Nacional\s+de\s+Antropología\s+e\s+Historia(?:\s+\(INAH\))?',
            r'Instituto\s+Nacional\s+de\s+Bellas\s+Artes\s+y\s+Literatura(?:\s+\(INBAL\))?',
            r'Instituto\s+Nacional\s+de\s+Lenguas\s+Indígenas(?:\s+\(INALI\))?',
            r'Instituto\s+Nacional\s+de\s+Estudios\s+Históricos(?:\s+\(INEHRM\))?',
        ],
        'OFFICIAL_INSTITUTION': [
            r'Secretaría\s+de\s+Cultura',
            r'Sistema\s+de\s+Información\s+Cultural(?:\s+\(SIC\))?',
            r'Fonoteca\s+Nacional',
            r'IMCINE',
        ],
    }

    # URL patterns (stop at whitespace or a closing parenthesis)
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')

    # Email patterns.
    # FIX: the TLD character class was [A-Z|a-z], which also matched a literal
    # '|' and could extend matches past the real address; now [A-Za-z].
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Mexican states for location matching (ALL CAPS, matched against
    # upper-cased section headers)
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }
|
|
|
|
def __init__(self):
|
|
self.extracted_institutions = {}
|
|
self.institution_id_counter = 1
|
|
|
|
def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
|
|
"""Extract institutions from a single conversation JSON file."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
data = json.load(f)
|
|
|
|
conversation_id = data.get('uuid', '')
|
|
conversation_name = data.get('name', '')
|
|
|
|
# Extract from all artifact content in the conversation
|
|
for msg in data.get('chat_messages', []):
|
|
for content in msg.get('content', []):
|
|
if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
|
|
artifact_text = content.get('input', {}).get('content', '')
|
|
if artifact_text:
|
|
self._extract_from_text(artifact_text, conversation_id, conversation_name)
|
|
|
|
return {
|
|
'conversation_id': conversation_id,
|
|
'conversation_name': conversation_name,
|
|
'institutions_found': len(self.extracted_institutions)
|
|
}
|
|
|
|
def _extract_from_text(self, text: str, conversation_id: str, conversation_name: str):
|
|
"""Extract institutions from artifact text content."""
|
|
|
|
# Split text into sections for better context tracking
|
|
sections = self._split_into_sections(text)
|
|
|
|
for section in sections:
|
|
# Extract state context from section header
|
|
state = self._extract_state_from_section(section)
|
|
|
|
# Extract institutions using pattern matching
|
|
for inst_type, patterns in self.INSTITUTION_PATTERNS.items():
|
|
for pattern in patterns:
|
|
matches = re.finditer(pattern, section, re.IGNORECASE)
|
|
for match in matches:
|
|
institution_name = self._clean_institution_name(match.group(0).strip())
|
|
|
|
# Skip if it's just a generic term
|
|
if self._is_generic_term(institution_name):
|
|
continue
|
|
|
|
# Extract context around the match
|
|
context = self._get_context(section, match.start(), match.end())
|
|
|
|
# Create or update institution record
|
|
self._add_institution(
|
|
name=institution_name,
|
|
institution_type=inst_type,
|
|
state=state,
|
|
context=context,
|
|
conversation_id=conversation_id,
|
|
conversation_name=conversation_name
|
|
)
|
|
|
|
def _split_into_sections(self, text: str) -> List[str]:
|
|
"""Split text into logical sections based on headers."""
|
|
# Split on ### headers or state names in ALL CAPS
|
|
section_pattern = re.compile(r'(?:^|\n)(?:###?\s+[A-ZÁÉÍÓÚÑ]|[A-ZÁÉÍÓÚÑ]{5,})', re.MULTILINE)
|
|
sections = section_pattern.split(text)
|
|
return [s for s in sections if len(s.strip()) > 50] # Filter out tiny sections
|
|
|
|
def _extract_state_from_section(self, section: str) -> str:
|
|
"""Extract Mexican state name from section text."""
|
|
# Look for state names in first 200 characters of section
|
|
header = section[:200].upper()
|
|
for state in self.MEXICAN_STATES:
|
|
if state in header:
|
|
return state
|
|
return ""
|
|
|
|
def _is_generic_term(self, name: str) -> bool:
|
|
"""Check if extracted name is just a generic term."""
|
|
generic_terms = [
|
|
'Museo de', 'Museo del', 'Museo Nacional', 'Biblioteca de',
|
|
'Archivo de', 'Instituto Nacional', 'INAH', 'INBAL',
|
|
'Museo Regional', 'Biblioteca Nacional', 'Archivo General'
|
|
]
|
|
|
|
# Must be longer than just the generic prefix
|
|
name_clean = name.strip()
|
|
if len(name_clean) < 15: # Too short to be a full institution name
|
|
return True
|
|
|
|
return False
|
|
|
|
def _clean_institution_name(self, name: str) -> str:
|
|
"""Clean institution name by removing sentence fragments."""
|
|
# Remove trailing verbs and common sentence starters
|
|
stop_words = [
|
|
r'\s+stands?\s+as\b.*',
|
|
r'\s+operates?\s+.*',
|
|
r'\s+provides?\s+.*',
|
|
r'\s+offers?\s+.*',
|
|
r'\s+manages?\s+.*',
|
|
r'\s+holds?\s+.*',
|
|
r'\s+contains?\s+.*',
|
|
r'\s+includes?\s+.*',
|
|
r'\s+features?\s+.*',
|
|
r'\s+was\s+.*',
|
|
r'\s+is\s+.*',
|
|
r'\s+has\s+.*',
|
|
r'\s+serves?\s+.*',
|
|
r'\s+at\s+.*',
|
|
r'\s+in\s+.*',
|
|
r'\s+located\s+.*',
|
|
]
|
|
|
|
cleaned = name
|
|
for pattern in stop_words:
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE)
|
|
|
|
# Remove trailing punctuation and whitespace
|
|
cleaned = re.sub(r'[,.\s]+$', '', cleaned)
|
|
|
|
return cleaned.strip()
|
|
|
|
def _get_context(self, text: str, start: int, end: int, window: int = 500) -> str:
|
|
"""Extract context around a match for additional metadata."""
|
|
context_start = max(0, start - window)
|
|
context_end = min(len(text), end + window)
|
|
return text[context_start:context_end]
|
|
|
|
def _add_institution(self, name: str, institution_type: str, state: str,
|
|
context: str, conversation_id: str, conversation_name: str):
|
|
"""Add or update an institution record."""
|
|
|
|
# Normalize name for deduplication
|
|
name_normalized = self._normalize_name(name)
|
|
|
|
# Check if already exists
|
|
if name_normalized in self.extracted_institutions:
|
|
# Update existing record with additional context
|
|
existing = self.extracted_institutions[name_normalized]
|
|
existing['provenance']['confidence_score'] = min(1.0, existing['provenance']['confidence_score'] + 0.1)
|
|
if state and not existing['locations']:
|
|
if state:
|
|
existing['locations'].append({'city': '', 'region': state, 'country': 'MX'})
|
|
return
|
|
|
|
# Extract metadata from context
|
|
urls = self._extract_urls(context)
|
|
emails = self._extract_emails(context)
|
|
address = self._extract_address(context)
|
|
city = self._extract_city(context)
|
|
description = self._extract_description(context, name)
|
|
|
|
# Create new record
|
|
institution_id = f"mx-glam-{self.institution_id_counter:04d}"
|
|
self.institution_id_counter += 1
|
|
|
|
record = {
|
|
'id': institution_id,
|
|
'name': name,
|
|
'name_normalized': name_normalized,
|
|
'institution_type': institution_type,
|
|
'alternative_names': [],
|
|
'description': description,
|
|
'locations': [],
|
|
'identifiers': [],
|
|
'digital_platforms': [],
|
|
'provenance': {
|
|
'data_source': 'CONVERSATION_NLP',
|
|
'data_tier': 'TIER_4_INFERRED',
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'extraction_method': 'Pattern-based NER from Mexican GLAM conversations',
|
|
'confidence_score': self._calculate_confidence(name, context, urls),
|
|
'conversation_id': conversation_id,
|
|
'source_url': None,
|
|
}
|
|
}
|
|
|
|
# Add location if available
|
|
if city or state:
|
|
location = {
|
|
'city': city,
|
|
'region': state,
|
|
'country': 'MX',
|
|
}
|
|
if address:
|
|
location['street_address'] = address
|
|
record['locations'].append(location)
|
|
|
|
# Add identifiers (URLs, emails)
|
|
for url in urls:
|
|
record['identifiers'].append({
|
|
'identifier_scheme': 'Website',
|
|
'identifier_value': url,
|
|
'identifier_url': url,
|
|
})
|
|
|
|
if emails:
|
|
record['identifiers'].append({
|
|
'identifier_scheme': 'Email',
|
|
'identifier_value': emails[0], # Take first email
|
|
})
|
|
|
|
self.extracted_institutions[name_normalized] = record
|
|
|
|
def _normalize_name(self, name: str) -> str:
|
|
"""Normalize institution name for deduplication."""
|
|
# Remove extra whitespace, lowercase, remove punctuation
|
|
normalized = re.sub(r'\s+', ' ', name.strip().lower())
|
|
normalized = re.sub(r'[^\w\s]', '', normalized)
|
|
return normalized
|
|
|
|
def _extract_urls(self, context: str) -> List[str]:
|
|
"""Extract URLs from context."""
|
|
urls = self.URL_PATTERN.findall(context)
|
|
return [url.rstrip('.,;)') for url in urls[:3]] # Take up to 3 URLs
|
|
|
|
def _extract_emails(self, context: str) -> List[str]:
|
|
"""Extract email addresses from context."""
|
|
return self.EMAIL_PATTERN.findall(context)[:2] # Take up to 2 emails
|
|
|
|
def _extract_address(self, context: str) -> str:
|
|
"""Extract street address from context."""
|
|
# Look for "Address:" or address patterns
|
|
address_match = re.search(r'(?:Address|Dirección):\s*([^\n]+)', context, re.IGNORECASE)
|
|
if address_match:
|
|
return address_match.group(1).strip()
|
|
|
|
# Look for Mexican address patterns (street number, colony, postal code)
|
|
address_match = re.search(r'([A-Z][a-záéíóúñ]+\s+\d+[^,\n]+,?\s*\d{5})', context)
|
|
if address_match:
|
|
return address_match.group(1).strip()
|
|
|
|
return ""
|
|
|
|
def _extract_city(self, context: str) -> str:
|
|
"""Extract city name from context."""
|
|
# Common Mexican cities
|
|
cities = [
|
|
'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca',
|
|
'Tijuana', 'León', 'Ciudad Juárez', 'Zapopan', 'Mérida',
|
|
'Aguascalientes', 'Querétaro', 'Morelia', 'Hermosillo',
|
|
'Saltillo', 'Mexicali', 'Culiacán', 'Chihuahua', 'Oaxaca',
|
|
'Veracruz', 'Acapulco', 'Cancún', 'Cuernavaca', 'Pachuca',
|
|
'Durango', 'Tepic', 'Tuxtla Gutiérrez', 'Villahermosa',
|
|
'Campeche', 'Chetumal', 'Zacatecas', 'Colima', 'Guanajuato',
|
|
'Haarlem', # In case of Dutch crossover
|
|
]
|
|
|
|
context_lower = context.lower()
|
|
for city in cities:
|
|
if city.lower() in context_lower:
|
|
return city
|
|
|
|
return ""
|
|
|
|
def _extract_description(self, context: str, institution_name: str) -> str:
|
|
"""Extract description from context around institution mention."""
|
|
# Look for sentences containing the institution name
|
|
sentences = re.split(r'[.!?]\s+', context)
|
|
relevant_sentences = []
|
|
|
|
for sentence in sentences:
|
|
if institution_name[:20] in sentence: # Match on first 20 chars
|
|
relevant_sentences.append(sentence.strip())
|
|
|
|
if relevant_sentences:
|
|
return '. '.join(relevant_sentences[:2]) # Take up to 2 sentences
|
|
|
|
return ""
|
|
|
|
def _calculate_confidence(self, name: str, context: str, urls: List[str]) -> float:
|
|
"""Calculate confidence score for extraction."""
|
|
confidence = 0.6 # Base confidence for pattern match
|
|
|
|
# Increase confidence based on available metadata
|
|
if urls:
|
|
confidence += 0.15
|
|
if len(context) > 300:
|
|
confidence += 0.1
|
|
if len(name) > 25: # Longer, more specific name
|
|
confidence += 0.1
|
|
if 'museo' in name.lower() or 'biblioteca' in name.lower() or 'archivo' in name.lower():
|
|
confidence += 0.05
|
|
|
|
return min(1.0, confidence)
|
|
|
|
def get_results(self) -> Dict[str, Any]:
|
|
"""Get extraction results with statistics."""
|
|
institutions_list = list(self.extracted_institutions.values())
|
|
|
|
# Calculate statistics
|
|
type_counts = defaultdict(int)
|
|
state_counts = defaultdict(int)
|
|
|
|
for inst in institutions_list:
|
|
type_counts[inst['institution_type']] += 1
|
|
for loc in inst.get('locations', []):
|
|
if loc.get('region'):
|
|
state_counts[loc['region']] += 1
|
|
|
|
return {
|
|
'total_institutions': len(institutions_list),
|
|
'institutions': institutions_list,
|
|
'statistics': {
|
|
'by_type': dict(type_counts),
|
|
'by_state': dict(state_counts),
|
|
'with_urls': sum(1 for i in institutions_list if i.get('identifiers')),
|
|
'with_locations': sum(1 for i in institutions_list if i.get('locations')),
|
|
},
|
|
'extraction_metadata': {
|
|
'extraction_date': datetime.now(timezone.utc).isoformat(),
|
|
'data_tier': 'TIER_4_INFERRED',
|
|
'data_source': 'CONVERSATION_NLP',
|
|
}
|
|
}
|
|
|
|
|
|
def main():
    """Run the end-to-end extraction workflow.

    Reads the two Mexican GLAM conversation JSON exports, extracts
    institutions, prints summary statistics, and writes the structured
    results to mexican_glam_extracted.json.
    """
    print("Mexican GLAM Institution Extractor")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # Process both conversation files; extractor state accumulates.
    source_files = ['mexican_glam_1.json', 'mexican_glam_2.json']
    for filepath in source_files:
        print(f"\nProcessing: {filepath}")
        summary = extractor.extract_from_conversation_file(filepath)
        print(f"  Conversation: {summary['conversation_name']}")
        print(f"  UUID: {summary['conversation_id']}")
        print(f"  Running total: {summary['institutions_found']} institutions")

    results = extractor.get_results()

    # Summary statistics.
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")
    print(f"\nBy institution type:")
    for inst_type, count in sorted(results['statistics']['by_type'].items()):
        print(f"  {inst_type}: {count}")

    print(f"\nTop 10 states by institution count:")
    by_state = results['statistics']['by_state']
    for state, count in sorted(by_state.items(), key=lambda item: item[1], reverse=True)[:10]:
        print(f"  {state}: {count}")

    print(f"\nMetadata completeness:")
    print(f"  With URLs: {results['statistics']['with_urls']}")
    print(f"  With locations: {results['statistics']['with_locations']}")

    # Persist the full result set as UTF-8 JSON.
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(results, out, ensure_ascii=False, indent=2)

    print(f"\nResults saved to: {output_file}")

    # Show a sample of the extracted records.
    print("\n" + "=" * 60)
    print("SAMPLE INSTITUTIONS (first 10):")
    print("=" * 60)
    for idx, record in enumerate(results['institutions'][:10], 1):
        print(f"\n{idx}. {record['name']}")
        print(f"  Type: {record['institution_type']}")
        print(f"  Confidence: {record['provenance']['confidence_score']:.2f}")
        if record.get('locations'):
            place = record['locations'][0]
            print(f"  Location: {place.get('city', '')}, {place.get('region', '')}")
        if record.get('identifiers'):
            websites = [entry['identifier_value'] for entry in record['identifiers']
                        if entry['identifier_scheme'] == 'Website']
            if websites:
                print(f"  URL: {websites[0]}")
|
|
|
|
|
|
# Script entry point: run the extraction workflow when executed directly.
if __name__ == '__main__':
    main()
|