# glam/extract_brazilian_institutions_v2.py
# Snapshot metadata: 2025-11-19 23:25:22 +01:00 · 371 lines · 14 KiB · Python
#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor v2.0
Improved extraction focusing on structured bullet-point lists from the state-by-state
artifact section. Extracts actual institution names (not sentence fragments).
Expected output: 150-200 quality institutions with proper names, types, and metadata.
Source file: /Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json
"""
import json
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple

import yaml
def slugify(text: str) -> str:
    """Create a URL-safe slug from an institution name.

    Accented Latin characters are transliterated to their base letter via
    Unicode NFD decomposition, which generalizes the original hard-coded
    vowel/cedilla tables to any combining diacritic (e.g. 'ñ' -> 'n',
    'ç' -> 'c') while preserving the previous behavior for all of them.

    Args:
        text: Institution name (may contain Portuguese diacritics).

    Returns:
        Lowercase ASCII slug, hyphen-separated, truncated to 50 characters.
    """
    text = text.lower()
    # Decompose each accented char into base letter + combining mark,
    # then drop the combining marks.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Collapse every run of non-alphanumeric characters into one hyphen.
    text = re.sub(r'[^a-z0-9]+', '-', text)
    text = text.strip('-')
    return text[:50]
def classify_institution(name: str, description: str = "") -> str:
    """Classify an institution type from its name and description.

    The name alone is consulted first (strongest signal); only when it
    carries no clear museum/archive/library keyword does the combined
    name + description text get matched against ordered keyword tables
    (acronyms and ambiguous names land there). Falls back to 'MIXED'.
    """
    lowered_name = name.lower()
    combined = lowered_name + " " + description.lower()

    # Strong name-based indicators, checked before anything else.
    name_rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
        ('ARCHIVE', ('arquivo', 'archiv')),
        ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
    )
    for label, keywords in name_rules:
        if any(kw in lowered_name for kw in keywords):
            return label

    # Fallback keyword tables over the combined text; order encodes priority.
    text_rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca',
                    'casa de cultura', 'mam-', 'mam ', 'marco', 'musear')),
        ('ARCHIVE', ('arquivo', 'archiv')),
        ('LIBRARY', ('biblioteca', 'bce')),
        ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                                  'fundação de cultura', 'instituto brasileiro',
                                  'fpc/', 'ipac', 'unesco')),
        ('EDUCATION_PROVIDER', ('universidade', 'university', 'ufac', 'ufal',
                                'unifap', 'ufpa', 'usp', 'ufmg', 'unicamp',
                                'ufrj', 'ufba', 'ufam', 'ufc', 'unb', 'ufg',
                                'ufma', 'ufmt', 'ufms', 'ufpe', 'ufpi', 'ufrn',
                                'ufpb', 'ufrgs', 'ufpr', 'ufsc', 'ufes',
                                'repository', 'repositories')),
        ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                             'laboratório', 'documentation', 'cepap',
                             'instituto histórico', 'instituto geográfico')),
        ('MIXED', ('teatro', 'centro cultural', 'ccbb', 'centro dragão',
                   'geopark', 'casa das', 'projects', 'projetos')),
    )
    for label, keywords in text_rules:
        if any(kw in combined for kw in keywords):
            return label
    return 'MIXED'
def parse_state_sections(content: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse state-by-state markdown sections into institution records.

    Splits ``content`` on headers of the form ``## STATE NAME (XX)``.
    Because the split pattern has two capture groups, ``re.split`` returns
    the preamble followed by repeating triples of
    (state name, state code, section body).

    The original version also built a full state-name -> code mapping that
    was never read; that dead dict has been removed.

    Args:
        content: Markdown artifact text.
        source_file_path: Absolute path recorded in each record's provenance.

    Returns:
        Flat list of institution record dicts from all state sections.
    """
    # Header pattern: "## STATE NAME (XX)" (uppercase, accents allowed).
    state_pattern = r'## ([A-ZÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜ\s]+) \(([A-Z]{2})\)'
    parts = re.split(state_pattern, content)

    institutions: List[Dict[str, Any]] = []
    # parts[0] is the preamble before the first header; triples follow.
    for idx in range(1, len(parts) - 2, 3):
        state_name = parts[idx].strip()
        state_code = parts[idx + 1].strip()
        section_body = parts[idx + 2]
        # Only parse sections whose state name/code survived stripping.
        if state_name and state_code:
            institutions.extend(
                parse_institutions_from_section(
                    section_body, state_name, state_code, source_file_path
                )
            )
    return institutions
def parse_institutions_from_section(section: str, state_name: str, state_code: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse individual institutions from one state's markdown section.

    Matches bullet points with bold institution names
    (``- **Name (optional parenthetical)**: details``), filters out
    section headers, contact lines, and implausibly short/long names,
    then builds one record per surviving bullet.

    Args:
        section: Markdown body of a single state section.
        state_name: Full state name (e.g. 'BAHIA').
        state_code: Two-letter state code (e.g. 'BA').
        source_file_path: Absolute path recorded in provenance.

    Returns:
        List of institution record dicts.
    """
    institutions: List[Dict[str, Any]] = []
    # Hoisted out of the per-line loop and pre-compiled (sections can be
    # large); the previous re.MULTILINE flag was a no-op on single lines.
    bullet_re = re.compile(r'^[\s]*[-•]\s*\*\*([^*:]+?)(?:\s*\([^)]+\))?\*\*:?\s*(.*)$')
    # Section headers / generic labels that are not institution names.
    skip_words = ('State Infrastructure', 'Digital Systems', 'Collections', 'Digital Initiatives',
                  'Contact', 'Federal', 'Technical', 'Metadata', 'Notable', 'Key Features',
                  'Preservation', 'Ongoing', 'Major', 'Systems', 'Coverage', 'Database')
    # Brazilian phone-number shape, e.g. "(61) 3212..." — marks contact lines.
    phone_re = re.compile(r'\(\d{2}\)\s*\d')

    for line in section.split('\n'):
        match = bullet_re.match(line)
        if not match:
            continue
        name = match.group(1).strip()
        details = match.group(2).strip()
        # Skip section headers and generic labels.
        if any(skip in name for skip in skip_words):
            continue
        # Skip contact entries (phone numbers, addresses).
        if phone_re.search(name):
            continue
        # Skip implausible names (too short or too long).
        if len(name) < 3 or len(name) > 150:
            continue
        # First URL found in the details becomes the institution website.
        url_match = re.search(r'https?://[^\s,]+', details)
        url = url_match.group(0) if url_match else None
        city = extract_city_from_details(details, name)
        inst = create_institution_record_v2(
            name=name,
            description=details,
            state_name=state_name,
            state_code=state_code,
            city=city,
            url=url,
            source_file_path=source_file_path
        )
        if inst:
            institutions.append(inst)
    return institutions
def extract_city_from_details(details: str, name: str) -> Optional[str]:
    """Return the first known Brazilian city mentioned in *details* or *name*.

    Cities are tested in a fixed priority order (capitals first, then
    notable heritage towns); the first listed city found as a substring
    of either argument wins. Returns None when nothing matches.
    """
    known_cities = (
        'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
        'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
        'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
        'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
        'Aracaju', 'Palmas', 'Boa Vista', 'Vitória', 'São Luís', 'Campinas',
        'Santos', 'Niterói', 'Ouro Preto', 'Petrópolis', 'Paraty', 'Olinda',
        'Tiradentes', 'Diamantina', 'Cachoeira', 'São Cristóvão',
    )
    haystacks = (details, name)
    return next(
        (city for city in known_cities
         if any(city in haystack for haystack in haystacks)),
        None,
    )
def create_institution_record_v2(
    name: str,
    description: str,
    state_name: str,
    state_code: str,
    city: Optional[str],
    url: Optional[str],
    source_file_path: str
) -> Optional[Dict[str, Any]]:
    """Build a LinkML-compliant institution record from structured fields.

    Args:
        name: Raw institution name (whitespace is normalized here).
        description: Bullet-point details text; cleaned and attached when
            it carries real content.
        state_name: Full state name used as the location region.
        state_code: Two-letter state code (currently unused, kept for the
            caller's interface).
        city: Optional city name, added to the location when present.
        url: Optional website URL, stored as a 'Website' identifier.
        source_file_path: Absolute source path recorded in provenance.

    Returns:
        The record dict (never None in practice; Optional kept for callers).
    """
    # Normalize internal whitespace in the name before deriving slug/type.
    clean_name = re.sub(r'\s+', ' ', name).strip()
    slug = slugify(clean_name)
    inst_type = classify_institution(clean_name, description)

    # Confidence: 0.7 base for structured extraction plus fixed bonuses,
    # accumulated in a fixed order and capped at 0.95.
    confidence = 0.7
    bonuses = ((0.1, bool(url)), (0.05, bool(city)), (0.1, len(description) > 50))
    for bonus, applies in bonuses:
        if applies:
            confidence += bonus
    confidence = min(confidence, 0.95)

    location: Dict[str, Any] = {'country': 'BR', 'region': state_name}
    if city:
        location['city'] = city

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slug}',
        'name': clean_name,
        'institution_type': inst_type,
        'locations': [location],
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Python structured extraction from Brazilian state-by-state artifact v2.0',
            'confidence_score': confidence,
            'conversation_id': '2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5',
            'source_url': f'file://{source_file_path}'
        }
    }

    # Attach a cleaned description only when it carries real content.
    if description and len(description) > 10:
        cleaned = re.sub(r'https?://[^\s,]+', '', description).strip()
        if len(cleaned) > 500:
            cleaned = cleaned[:497] + '...'
        if cleaned:
            record['description'] = cleaned

    # The website URL doubles as the record's sole identifier entry.
    if url:
        record['identifiers'] = [{
            'identifier_scheme': 'Website',
            'identifier_value': url,
            'identifier_url': url
        }]
    return record
def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load the conversation JSON and extract de-duplicated institution records.

    Collects the markdown artifacts embedded in tool_use message content,
    parses each one for state sections, and keeps only the first record
    seen for any given institution name.
    """
    data = json.loads(json_path.read_text(encoding='utf-8'))

    # Markdown artifacts live in the 'content' field of tool_use inputs.
    artifacts = [
        item['input']['content']
        for message in data.get('chat_messages', [])
        for item in message.get('content', [])
        if item.get('type') == 'tool_use' and 'content' in item.get('input', {})
    ]

    # Absolute path string goes into each record's provenance source_url.
    resolved_path = str(json_path.resolve())

    unique_records: List[Dict[str, Any]] = []
    seen_names: set = set()
    for artifact in artifacts:
        for record in parse_state_sections(artifact, resolved_path):
            if record['name'] in seen_names:
                continue
            seen_names.add(record['name'])
            unique_records.append(record)
    return unique_records
def main():
    """Main extraction workflow.

    Reads the hard-coded conversation export, prints summary statistics
    (type distribution, state coverage, URL coverage, mean confidence),
    and writes the extracted records to the instances YAML file.

    Fix over the original: when zero institutions are extracted, the
    URL-coverage and average-confidence computations divided by zero;
    that case now exits early with a message instead of crashing.
    """
    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml')

    print("Brazilian GLAM Institution Extractor v2.0")
    print("=" * 60)
    print(f"Input: {input_file.name}")
    print(f"Output: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")
    print()

    # Guard: the ratio and mean below would divide by zero on an empty run.
    if not institutions:
        print("No institutions extracted; nothing to write.")
        return

    # Type breakdown, most common first.
    type_counts = {}
    for inst in institutions:
        inst_type = inst['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1
    print(f"Institution Type Distribution:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f" {inst_type:25s}: {count:3d}")
    print()

    # Geographic coverage: distinct region values across first locations.
    states_covered = set()
    for inst in institutions:
        if inst.get('locations'):
            state = inst['locations'][0].get('region')
            if state:
                states_covered.add(state)
    print(f"Geographic Coverage:")
    print(f" States covered: {len(states_covered)}/27")
    print()

    # URL coverage: records carrying an identifiers list have a website.
    with_urls = sum(1 for inst in institutions if inst.get('identifiers'))
    print(f"Data Quality:")
    print(f" Institutions with URLs: {with_urls}/{len(institutions)} ({100*with_urls/len(institutions):.1f}%)")

    avg_confidence = sum(inst['provenance']['confidence_score'] for inst in institutions) / len(institutions)
    print(f" Average confidence: {avg_confidence:.3f}")
    print()

    # Write YAML output, creating the target directory if needed.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=120)
    print(f"✓ Wrote {len(institutions)} records to {output_file}")
    print()

    print(f"Sample Institutions:")
    for inst in institutions[:10]:
        city = inst['locations'][0].get('city', '(state level)')
        print(f" - {inst['name'][:50]:50s} | {inst['institution_type']:12s} | {city}")


if __name__ == '__main__':
    main()