#!/usr/bin/env python3
"""
Extract Mexican GLAM institutions from conversation JSON files.

Version 2: Improved markdown parsing for structured institution lists.

Follows GLAM Data Extraction project specifications.
"""

import json
import re
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict, List, Tuple
class MexicanGLAMExtractor:
    """Extract heritage institutions from Mexican GLAM conversation artifacts.

    Reads exported conversation JSON files, scans every ``artifacts``
    tool-use payload for markdown text, and mines institution records
    (museums, libraries, archives, galleries, research centers and
    official platforms) using three strategies:

    1. state-by-state ``### STATE`` directory sections,
    2. national/federal platform sections,
    3. inline mentions paired with URLs.

    Records are deduplicated by normalized name and accumulated in
    ``self.extracted_institutions``.
    """

    # Bare URL; stops at whitespace or a closing parenthesis.
    URL_PATTERN = re.compile(r'https?://[^\s\)]+')

    # Email address.  Fixed: the TLD class previously read [A-Z|a-z],
    # which also accepted a literal '|' character inside the TLD.
    EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Labelled phone number ("Phone:", "Tel:", "Teléfono:").
    PHONE_PATTERN = re.compile(r'(?:Phone|Tel|Teléfono):\s*([0-9\s\-\(\)]+)')

    # Labelled street address, captured to end of line.
    ADDRESS_PATTERN = re.compile(r'Address:\s*([^\n]+)')

    # Mexican states (upper case) used to validate "### STATE" headings.
    MEXICAN_STATES = {
        'AGUASCALIENTES', 'BAJA CALIFORNIA', 'BAJA CALIFORNIA SUR', 'CAMPECHE',
        'CHIAPAS', 'CHIHUAHUA', 'COAHUILA', 'COLIMA', 'DURANGO', 'GUANAJUATO',
        'GUERRERO', 'HIDALGO', 'JALISCO', 'MÉXICO', 'MICHOACÁN', 'MORELOS',
        'NAYARIT', 'NUEVO LEÓN', 'OAXACA', 'PUEBLA', 'QUERÉTARO', 'QUINTANA ROO',
        'SAN LUIS POTOSÍ', 'SINALOA', 'SONORA', 'TABASCO', 'TAMAULIPAS',
        'TLAXCALA', 'VERACRUZ', 'YUCATÁN', 'ZACATECAS', 'CIUDAD DE MÉXICO'
    }

    # Common Mexican cities used for city detection in free text.
    MEXICAN_CITIES = {
        'México', 'Guadalajara', 'Monterrey', 'Puebla', 'Toluca', 'Tijuana',
        'León', 'Ciudad Juárez', 'Zapopan', 'Mérida', 'Aguascalientes',
        'Querétaro', 'Morelia', 'Hermosillo', 'Saltillo', 'Mexicali',
        'Culiacán', 'Chihuahua', 'Oaxaca', 'Veracruz', 'Acapulco',
        'Cancún', 'Cuernavaca', 'Pachuca', 'Durango', 'Tepic',
        'Tuxtla Gutiérrez', 'Villahermosa', 'Campeche', 'Chetumal',
        'Zacatecas', 'Colima', 'Guanajuato', 'San Luis Potosí'
    }

    def __init__(self):
        # Keyed by normalized name (see _normalize_name) for deduplication.
        self.extracted_institutions: Dict[str, Dict[str, Any]] = {}
        # Monotonic counter used to mint "mx-glam-NNNN" ids.
        self.institution_id_counter = 1

    def extract_from_conversation_file(self, filepath: str) -> Dict[str, Any]:
        """Extract institutions from a single conversation JSON file.

        Args:
            filepath: Path to a conversation export JSON file.

        Returns:
            Summary dict with the conversation's uuid, name, and the
            *running* total of institutions extracted so far (the
            extractor accumulates across calls).

        Raises:
            OSError / json.JSONDecodeError: if the file is missing or
            not valid JSON.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        conversation_id = data.get('uuid', '')
        conversation_name = data.get('name', '')

        # Extract from every artifacts tool-use payload in the conversation.
        for msg in data.get('chat_messages', []):
            for content in msg.get('content', []):
                if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                    artifact_text = content.get('input', {}).get('content', '')
                    if artifact_text:
                        self._extract_from_artifact(artifact_text, conversation_id, conversation_name)

        return {
            'conversation_id': conversation_id,
            'conversation_name': conversation_name,
            'institutions_found': len(self.extracted_institutions),
        }

    def _extract_from_artifact(self, text: str, conversation_id: str, conversation_name: str) -> None:
        """Run all extraction strategies over one artifact's markdown text."""
        # Strategy 1: state-by-state directory sections.
        self._extract_from_state_sections(text, conversation_id, conversation_name)
        # Strategy 2: national/federal platforms section.
        self._extract_national_platforms(text, conversation_id, conversation_name)
        # Strategy 3: inline mentions paired with URLs.
        self._extract_inline_mentions(text, conversation_id, conversation_name)

    def _extract_from_state_sections(self, text: str, conversation_id: str, conversation_name: str) -> None:
        """Extract institutions from "### STATE" directory sections."""
        # Each section: "### STATE_NAME" followed by a blank line, running
        # until the next "### ..." heading or end of text.
        state_pattern = re.compile(
            r'###\s+([A-ZÁÉÍÓÚÑ\s]+)\n\n(.*?)(?=\n###\s+[A-ZÁÉÍÓÚÑ]|\Z)',
            re.DOTALL,
        )

        for match in state_pattern.finditer(text):
            state_name = match.group(1).strip()
            section_content = match.group(2)

            # Only process headings that name a real Mexican state.
            if state_name.upper() not in self.MEXICAN_STATES:
                continue

            self._extract_from_bullet_lists(section_content, state_name, conversation_id, conversation_name)

    def _extract_from_bullet_lists(self, text: str, state: str, conversation_id: str, conversation_name: str) -> None:
        """Extract institutions from markdown bullet lists with bold names.

        Matches entries of the form ``- **Name**: description`` (or
        ``- **Name (ACRONYM)**: ...``), including multi-line entries
        with nested bullets.
        """
        pattern = re.compile(
            r'^-\s+\*\*([^*]+?)\*\*:?\s*(.*?)(?=\n-\s+\*\*|\n\n|\Z)',
            re.MULTILINE | re.DOTALL,
        )

        for match in pattern.finditer(text):
            institution_name = match.group(1).strip()
            description_block = match.group(2).strip()

            inst_type = self._classify_institution_type(institution_name, description_block)
            if not inst_type:
                continue  # Skip entries whose type cannot be determined.

            metadata = self._extract_metadata_from_block(description_block)

            self._add_institution(
                name=institution_name,
                institution_type=inst_type,
                state=state,
                metadata=metadata,
                conversation_id=conversation_id,
                conversation_name=conversation_name,
            )

    def _extract_national_platforms(self, text: str, conversation_id: str, conversation_name: str) -> None:
        """Extract national-level platforms and institutions.

        National entries carry no state, so they are added with an empty
        region.
        """
        national_sections = re.finditer(
            r'##\s+(?:National|Federal|Core\s+National).*?\n(.*?)(?=\n##|\Z)',
            text, re.DOTALL | re.IGNORECASE,
        )

        for section_match in national_sections:
            self._extract_from_bullet_lists(section_match.group(1), '', conversation_id, conversation_name)

    def _extract_inline_mentions(self, text: str, conversation_id: str, conversation_name: str) -> None:
        """Extract institutions mentioned inline together with a URL."""
        # Pattern 1: **Name**\n- **URL**: https://...
        url_pattern = re.compile(
            r'\*\*([^*]+?)\*\*\s*\n\s*-\s*\*\*URL\*\*:\s*(https?://[^\s\)]+)',
            re.MULTILINE,
        )

        for match in url_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')

            # Widen to surrounding context so metadata extraction can see
            # addresses/emails near the mention.
            context_start = max(0, match.start() - 200)
            context_end = min(len(text), match.end() + 500)
            context = text[context_start:context_end]

            metadata = self._extract_metadata_from_block(context)
            # The context usually re-contains `url`; de-duplicate while
            # preserving order so identifiers are not recorded twice.
            metadata['urls'] = list(dict.fromkeys([url] + metadata.get('urls', [])))

            inst_type = self._classify_institution_type(institution_name, context)
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata=metadata,
                    conversation_id=conversation_id,
                    conversation_name=conversation_name,
                )

        # Pattern 2: "Institution name at/URL/website/portal: URL"
        inline_pattern = re.compile(
            r'\b((?:Museo|Biblioteca|Archivo|Instituto|Centro|Sistema|Fonoteca|Mapoteca)\s+[A-ZÁÉÍÓÚÑ][A-Za-záéíóúñ\s]+?)\s+(?:at|URL|website|portal):\s*(https?://[^\s\)]+)',
            re.IGNORECASE,
        )

        for match in inline_pattern.finditer(text):
            institution_name = match.group(1).strip()
            url = match.group(2).strip().rstrip('.,;)')

            inst_type = self._classify_institution_type(institution_name, '')
            if inst_type:
                self._add_institution(
                    name=institution_name,
                    institution_type=inst_type,
                    state='',
                    metadata={'urls': [url]},
                    conversation_id=conversation_id,
                    conversation_name=conversation_name,
                )

    def _classify_institution_type(self, name: str, description: str) -> str:
        """Classify institution type from its name and description.

        Returns one of MUSEUM / LIBRARY / ARCHIVE / GALLERY /
        RESEARCH_CENTER / OFFICIAL_INSTITUTION, or '' when the type
        cannot be determined (callers skip those entries).
        """
        name_lower = name.lower()
        desc_lower = description.lower()
        combined = name_lower + ' ' + desc_lower

        # Classification rules, checked in priority order on the name.
        if any(term in name_lower for term in ['museo', 'museum']):
            return 'MUSEUM'
        elif any(term in name_lower for term in ['biblioteca', 'library', 'hemeroteca']):
            return 'LIBRARY'
        elif any(term in name_lower for term in ['archivo', 'archive', 'mapoteca']):
            return 'ARCHIVE'
        elif any(term in name_lower for term in ['galería', 'gallery']):
            return 'GALLERY'
        elif any(term in name_lower for term in ['instituto', 'institute', 'centro', 'center']):
            # An institute/center that talks about a museum is treated as one.
            if 'museo' in combined or 'museum' in combined:
                return 'MUSEUM'
            else:
                return 'RESEARCH_CENTER'
        elif any(term in name_lower for term in ['secretaría', 'sistema', 'fonoteca', 'imcine', 'red nacional', 'mediateca', 'mexicana', 'memórica']):
            return 'OFFICIAL_INSTITUTION'

        return ''  # Unknown type.

    def _extract_metadata_from_block(self, block: str) -> Dict[str, Any]:
        """Extract URLs, emails, phones, address, city and a short
        description from a free-text description block."""
        metadata: Dict[str, Any] = {
            'urls': [],
            'emails': [],
            'phones': [],
            'address': '',
            'city': '',
            'description': '',
        }

        # URLs (capped at 3, trailing punctuation stripped).
        urls = self.URL_PATTERN.findall(block)
        metadata['urls'] = [url.rstrip('.,;)') for url in urls[:3]]

        # Emails (capped at 2).
        metadata['emails'] = self.EMAIL_PATTERN.findall(block)[:2]

        # Phones (capped at 2).
        phone_matches = self.PHONE_PATTERN.findall(block)
        metadata['phones'] = [p.strip() for p in phone_matches][:2]

        # Address (first "Address:" line).
        address_match = self.ADDRESS_PATTERN.search(block)
        if address_match:
            metadata['address'] = address_match.group(1).strip().rstrip('.,;')

        # City: first known Mexican city mentioned anywhere in the block.
        # Lower-case the block once instead of per city.
        block_lower = block.lower()
        for city in self.MEXICAN_CITIES:
            if city.lower() in block_lower:
                metadata['city'] = city
                break

        # Description: first substantive line that is not a labelled field.
        lines = block.split('\n')
        for line in lines:
            line = line.strip().lstrip('-').strip()
            if line and not any(line.startswith(prefix) for prefix in ['Address:', 'Phone:', 'Email:', 'URL:', 'Director:']):
                desc = re.sub(r'\*\*', '', line)  # Remove bold markers.
                desc = re.sub(r'\[.*?\]\(.*?\)', '', desc)  # Remove markdown links.
                if len(desc) > 20:
                    metadata['description'] = desc[:300]
                    break

        return metadata

    def _add_institution(self, name: str, institution_type: str, state: str,
                         metadata: Dict[str, Any], conversation_id: str, conversation_name: str) -> None:
        """Add a new institution record, or merge into an existing one.

        Deduplication is by normalized name; a repeat sighting bumps the
        existing record's confidence and merges new URLs/locations.
        """
        name_normalized = self._normalize_name(name)

        if name_normalized in self.extracted_institutions:
            # Repeat sighting: boost confidence (capped at 1.0).
            existing = self.extracted_institutions[name_normalized]
            existing['provenance']['confidence_score'] = min(
                1.0, existing['provenance']['confidence_score'] + 0.15
            )

            # Merge any URLs not already recorded as identifiers.
            # (`ident`, not `id`, to avoid shadowing the builtin.)
            if metadata.get('urls'):
                for url in metadata['urls']:
                    if not any(ident.get('identifier_value') == url for ident in existing['identifiers']):
                        existing['identifiers'].append({
                            'identifier_scheme': 'Website',
                            'identifier_value': url,
                            'identifier_url': url,
                        })

            # Fill in a location only if none is recorded yet.
            if state and not existing['locations']:
                existing['locations'].append({'city': metadata.get('city', ''), 'region': state, 'country': 'MX'})

            return

        # New record.
        institution_id = f"mx-glam-{self.institution_id_counter:04d}"
        self.institution_id_counter += 1

        record: Dict[str, Any] = {
            'id': institution_id,
            'name': name,
            'name_normalized': name_normalized,
            'institution_type': institution_type,
            'alternative_names': [],
            'description': metadata.get('description', ''),
            'locations': [],
            'identifiers': [],
            'digital_platforms': [],
            'provenance': {
                'data_source': 'CONVERSATION_NLP',
                'data_tier': 'TIER_4_INFERRED',
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'extraction_method': 'Markdown parsing + pattern matching from Mexican GLAM conversations',
                'confidence_score': self._calculate_confidence(name, metadata),
                'conversation_id': conversation_id,
                'source_url': None,
            },
        }

        # Location, when a city or state is known.
        if metadata.get('city') or state:
            location = {
                'city': metadata.get('city', ''),
                'region': state,
                'country': 'MX',
            }
            if metadata.get('address'):
                location['street_address'] = metadata['address']
            record['locations'].append(location)

        # Website identifiers.
        for url in metadata.get('urls', []):
            record['identifiers'].append({
                'identifier_scheme': 'Website',
                'identifier_value': url,
                'identifier_url': url,
            })

        # Email identifiers.
        for email in metadata.get('emails', []):
            record['identifiers'].append({
                'identifier_scheme': 'Email',
                'identifier_value': email,
            })

        self.extracted_institutions[name_normalized] = record

    def _normalize_name(self, name: str) -> str:
        """Normalize an institution name for deduplication.

        Drops parenthesized acronyms, collapses whitespace, lowercases.
        """
        name = re.sub(r'\s*\([A-Z]+\)\s*', ' ', name)
        normalized = re.sub(r'\s+', ' ', name.strip().lower())
        return normalized

    def _calculate_confidence(self, name: str, metadata: Dict[str, Any]) -> float:
        """Calculate an extraction confidence score in [0.7, 1.0]."""
        confidence = 0.7  # Base confidence for structured extraction.

        # Boost for each kind of corroborating metadata present.
        if metadata.get('urls'):
            confidence += 0.15
        if metadata.get('address'):
            confidence += 0.1
        if metadata.get('description'):
            confidence += 0.05

        return min(1.0, confidence)

    def get_results(self) -> Dict[str, Any]:
        """Return all extracted institutions plus summary statistics."""
        institutions_list = list(self.extracted_institutions.values())

        # Tally by type and by state (a record may count in several states).
        type_counts: Dict[str, int] = defaultdict(int)
        state_counts: Dict[str, int] = defaultdict(int)

        for inst in institutions_list:
            type_counts[inst['institution_type']] += 1
            for loc in inst.get('locations', []):
                if loc.get('region'):
                    state_counts[loc['region']] += 1

        return {
            'total_institutions': len(institutions_list),
            'institutions': institutions_list,
            'statistics': {
                'by_type': dict(type_counts),
                'by_state': dict(state_counts),
                'with_urls': sum(
                    1 for i in institutions_list
                    if any(ident.get('identifier_scheme') == 'Website' for ident in i.get('identifiers', []))
                ),
                'with_locations': sum(1 for i in institutions_list if i.get('locations')),
                'with_descriptions': sum(1 for i in institutions_list if i.get('description')),
            },
            'extraction_metadata': {
                'extraction_date': datetime.now(timezone.utc).isoformat(),
                'data_tier': 'TIER_4_INFERRED',
                'data_source': 'CONVERSATION_NLP',
            },
        }
|
def _print_statistics(results: Dict[str, Any]) -> None:
    """Print summary statistics for an extraction results dict."""
    print("\n" + "=" * 60)
    print("EXTRACTION RESULTS")
    print("=" * 60)
    print(f"Total institutions extracted: {results['total_institutions']}")

    # Types, most common first.
    print("\nBy institution type:")
    for inst_type, count in sorted(results['statistics']['by_type'].items(), key=lambda x: x[1], reverse=True):
        print(f"  {inst_type}: {count}")

    # States, most common first, capped at 15.
    print("\nTop 15 states by institution count:")
    state_counts = results['statistics']['by_state']
    for state, count in sorted(state_counts.items(), key=lambda x: x[1], reverse=True)[:15]:
        print(f"  {state}: {count}")

    print("\nMetadata completeness:")
    print(f"  With URLs: {results['statistics']['with_urls']}")
    print(f"  With locations: {results['statistics']['with_locations']}")
    print(f"  With descriptions: {results['statistics']['with_descriptions']}")


def _print_samples(results: Dict[str, Any], limit: int = 15) -> None:
    """Print a short human-readable preview of extracted institutions."""
    print("\n" + "=" * 60)
    print(f"SAMPLE INSTITUTIONS (first {limit}):")
    print("=" * 60)
    for i, inst in enumerate(results['institutions'][:limit], 1):
        print(f"\n{i}. {inst['name']}")
        print(f"  Type: {inst['institution_type']}")
        print(f"  Confidence: {inst['provenance']['confidence_score']:.2f}")
        if inst.get('locations'):
            loc = inst['locations'][0]
            city_state = f"{loc.get('city', '')}, {loc.get('region', '')}".strip(', ')
            if city_state:
                print(f"  Location: {city_state}")
        if inst.get('description'):
            desc = inst['description'][:100] + '...' if len(inst['description']) > 100 else inst['description']
            print(f"  Description: {desc}")
        # `ident`, not `id`, to avoid shadowing the builtin.
        urls = [ident['identifier_value'] for ident in inst.get('identifiers', [])
                if ident['identifier_scheme'] == 'Website']
        if urls:
            print(f"  URL: {urls[0]}")


def main():
    """Main extraction workflow: process files, report, and save JSON."""
    print("Mexican GLAM Institution Extractor v2")
    print("=" * 60)

    extractor = MexicanGLAMExtractor()

    # Process both conversation files (accumulates into one extractor).
    files = [
        'mexican_glam_1.json',
        'mexican_glam_2.json',
    ]
    for filepath in files:
        print(f"\nProcessing: {filepath}")
        result = extractor.extract_from_conversation_file(filepath)
        print(f"  Conversation: {result['conversation_name']}")
        print(f"  UUID: {result['conversation_id']}")
        print(f"  Running total: {result['institutions_found']} institutions")

    # Final results + statistics.
    results = extractor.get_results()
    _print_statistics(results)

    # Persist full results as UTF-8 JSON (keep accented characters).
    output_file = 'mexican_glam_extracted.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"\nResults saved to: {output_file}")

    _print_samples(results)
# Script entry point.
if __name__ == '__main__':
    main()