glam/extract_brazilian_institutions.py
2025-11-19 23:25:22 +01:00

409 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Brazilian GLAM Institution Extractor
Extracts ALL heritage institutions from Brazilian conversation JSON and creates
LinkML-compliant YAML records following schema v0.2.0.
Expected output: 200+ institutions covering all 27 Brazilian federative units.
"""
import json
import re
from datetime import datetime, timezone
from typing import List, Dict, Any, Optional
from pathlib import Path
import yaml
def slugify(text: str) -> str:
    """Build a URL-safe ASCII slug (max 50 chars) from an institution name."""
    # Fold the Portuguese accented characters to their ASCII base letters.
    folded = text.lower().translate(str.maketrans(
        'àáâãäåèéêëìíîïòóôõöùúûüç',
        'aaaaaaeeeeiiiiooooouuuuc',
    ))
    # Collapse every run of non-alphanumerics into a single hyphen.
    slug = re.sub(r'[^a-z0-9]+', '-', folded).strip('-')
    return slug[:50]  # Keep slugs to a bounded length.
def classify_institution(name: str, description: str = "") -> str:
    """Map an institution name/description to a coarse type code.

    Keyword groups are tried in priority order; the first group with a
    hit wins, and 'MIXED' is the fallback when nothing matches.
    """
    haystack = (name + " " + description).lower()
    rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
        ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
        ('ARCHIVE', ('arquivo', 'archive', 'archiv')),
        ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                                  'fundação de cultura', 'instituto brasileiro')),
        ('EDUCATION_PROVIDER', ('universidade', 'university', 'usp', 'ufmg',
                                'unicamp', 'ufrj', 'ufba')),
        ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                             'laboratório', 'documentation')),
    )
    for inst_type, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return inst_type
    return 'MIXED'  # Default when unclear.
def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load a Brazilian GLAM conversation JSON and extract every institution.

    Combines institutions parsed from tool-use markdown artifacts with
    records derived from citation metadata. A single `seen_names` set is
    shared so the two sources deduplicate against each other.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversation_id = data.get('uuid', '')
    extraction_date = datetime.now(timezone.utc).isoformat()

    # Pull artifact markdown and citation metadata out of every message.
    markdown_content: List[str] = []
    citations: List[Dict[str, Any]] = []
    for message in data.get('chat_messages', []):
        for content_item in message.get('content', []):
            if content_item.get('type') != 'tool_use':
                continue
            tool_input = content_item.get('input', {})
            if 'content' in tool_input:
                markdown_content.append(tool_input['content'])
            if 'md_citations' in tool_input:
                citations.extend(tool_input['md_citations'])

    seen_names: set = set()  # Dedupe across artifacts AND citations.
    institutions: List[Dict[str, Any]] = []
    for artifact in markdown_content:
        institutions.extend(
            parse_artifact(artifact, conversation_id, extraction_date, seen_names))
    institutions.extend(
        parse_citations(citations, conversation_id, extraction_date, seen_names))
    return institutions
def parse_artifact(content: str, conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Parse markdown artifact content to extract institutions.

    Runs a prioritized sequence of bold-text regexes over the artifact
    (federal bodies, museums, libraries, archives, universities, digital
    platforms). Each unseen match that survives its pattern's minimum
    length check becomes a record via create_institution_record().
    `seen_names` is mutated in place so later patterns and later
    artifacts do not re-emit the same institution.
    """
    # (regex, minimum accepted name length) — previously six copy-pasted
    # loops; kept in the original order because earlier patterns win the
    # dedupe. The federal pattern had no explicit length check (min 0),
    # universities accepted short acronyms (min > 3), the rest require > 5.
    pattern_specs = (
        # Federal institutions (IBRAM, IPHAN, Biblioteca Nacional, etc.)
        (r'\*\*([^*]+(?:Instituto Brasileiro de Museus|IBRAM|IPHAN|Biblioteca Nacional|Arquivo Nacional|Fundação Cultural Palmares)[^*]*)\*\*', 0),
        # Museums
        (r'\*\*([^*]*(?:Museu|Museum|Memorial|Pinacoteca)[^*]*?)\*\*', 5),
        # Libraries
        (r'\*\*([^*]*(?:Biblioteca|Library)[^*]*?)\*\*', 5),
        # Archives
        (r'\*\*([^*]*(?:Arquivo|Archive)[^*]*?)\*\*', 5),
        # Universities
        (r'\*\*([^*]*(?:Universidade|University|USP|UFMG|UNICAMP|UFRJ|UFBA|UNIFAP|UFAC)[^*]*?)\*\*', 3),
        # Digital platforms
        (r'\*\*([^*]*(?:Digital|Brasiliana|Hemeroteca|Tainacan|BNDigital)[^*]*?)\*\*', 5),
    )

    institutions: List[Dict[str, Any]] = []
    for pattern, min_len in pattern_specs:
        for match in re.finditer(pattern, content, re.IGNORECASE):
            name = match.group(1).strip()
            if not name or len(name) <= min_len or name in seen_names:
                continue
            inst = create_institution_record(name, content, conversation_id, extraction_date)
            if inst:
                institutions.append(inst)
                seen_names.add(name)
    return institutions
def create_institution_record(name: str, context: str, conversation_id: str, extraction_date: str) -> Optional[Dict[str, Any]]:
    """Assemble one LinkML-compliant institution record.

    Returns None when the (whitespace-normalized) name is too short or
    looks like a generic section heading rather than an institution.
    Location and website identifiers are attached only when found.
    """
    name = ' '.join(name.split())  # Collapse internal whitespace runs.
    if len(name) < 4:
        return None

    # Generic artifact headings are not institutions — skip them.
    headings = ('Executive Summary', 'State Infrastructure', 'Digital Systems',
                'Collections', 'Federal', 'Contact')
    if any(heading in name for heading in headings):
        return None

    description = extract_description(name, context)
    urls = extract_urls_for_institution(name, context)
    location = extract_location(name, context)

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slugify(name)}',
        'name': name,
        'institution_type': classify_institution(name, context),
        'description': description or f'Brazilian heritage institution: {name}',
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': extraction_date,
            'extraction_method': 'Python NLP extraction from Brazilian GLAM conversation artifact',
            'confidence_score': calculate_confidence(name, context, urls, location),
            'conversation_id': conversation_id,
        },
    }
    if location:
        record['locations'] = [location]
    if urls:
        # Keep at most five website identifiers per institution.
        record['identifiers'] = [
            {'identifier_scheme': 'Website',
             'identifier_value': url,
             'identifier_url': url}
            for url in urls[:5]
        ]
    return record
def extract_description(name: str, context: str) -> Optional[str]:
    """Pull up to two descriptive sentences about *name* out of *context*.

    Scans every blank-line-separated paragraph that contains the first 20
    characters of the name (partial match tolerates trailing qualifiers),
    keeps mid-length sentences from the start of each such paragraph, and
    joins the first two unique ones. Returns None when nothing qualifies.
    """
    key = name[:20]
    collected: List[str] = []
    for paragraph in context.split('\n\n'):
        if key not in paragraph:
            continue
        # Only the first three sentences of a matching paragraph count.
        for sentence in re.split(r'[.!?]+\s+', paragraph)[:3]:
            if not (30 < len(sentence) < 500):
                continue  # Too short to describe / too long to be a sentence.
            cleaned = sentence.replace('**', '').strip()
            if cleaned and cleaned not in collected:
                collected.append(cleaned)
    return ' '.join(collected[:2]) if collected else None
def extract_urls_for_institution(name: str, context: str) -> List[str]:
    """Collect URLs appearing on lines of *context* that mention *name*.

    A line matches when it contains either the first 15 characters of the
    name or the name's first word. URLs are deduplicated preserving
    first-seen order, so the caller's ``urls[:5]`` cut and the emitted
    YAML are deterministic (the old ``list(set(urls))`` shuffled them
    per-process, and ``name.split()[0]`` crashed on whitespace-only names).
    """
    url_pattern = r'https?://[^\s<>"\')]+(?:\.[^\s<>"\')\]]+)+'
    words = name.split()
    first_word = words[0] if words else ''  # Guard: blank name must not crash.
    prefix = name[:15]

    urls: List[str] = []
    for line in context.split('\n'):
        if prefix in line or (first_word and first_word in line):
            urls.extend(re.findall(url_pattern, line))
    # dict.fromkeys dedupes while keeping first-seen order (unlike set()).
    return list(dict.fromkeys(urls))
def extract_location(name: str, context: str) -> Optional[Dict[str, Any]]:
    """Infer a city (and, when possible, state) for *name* from *context*.

    Scans lines containing the first 15 characters of the name for a
    known Brazilian capital city; on the first hit, also tries to resolve
    the state from the state name or its ``(XX)`` code on the same line.
    Returns ``{'city', 'country', optionally 'region'}`` or None.
    """
    # Brazilian cities commonly mentioned
    cities = ['São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
              'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
              'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
              'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
              'Aracaju', 'Palmas', 'Boa Vista', 'Vitória']
    # Brazilian states and their two-letter codes.
    states = {
        'Acre': 'AC', 'Alagoas': 'AL', 'Amapá': 'AP', 'Amazonas': 'AM',
        'Bahia': 'BA', 'Ceará': 'CE', 'Distrito Federal': 'DF', 'Espírito Santo': 'ES',
        'Goiás': 'GO', 'Maranhão': 'MA', 'Mato Grosso': 'MT', 'Mato Grosso do Sul': 'MS',
        'Minas Gerais': 'MG', 'Pará': 'PA', 'Paraíba': 'PB', 'Paraná': 'PR',
        'Pernambuco': 'PE', 'Piauí': 'PI', 'Rio de Janeiro': 'RJ', 'Rio Grande do Norte': 'RN',
        'Rio Grande do Sul': 'RS', 'Rondônia': 'RO', 'Roraima': 'RR', 'Santa Catarina': 'SC',
        'São Paulo': 'SP', 'Sergipe': 'SE', 'Tocantins': 'TO'
    }
    for line in context.split('\n'):
        if name[:15] not in line:
            continue
        for city in cities:
            if city not in line:
                continue
            # Match longest state names first so 'Mato Grosso do Sul' is
            # not shadowed by its prefix 'Mato Grosso' (dict order used
            # to pick the shorter name whenever both substrings matched).
            region = next(
                (state for state in sorted(states, key=len, reverse=True)
                 if state in line or f'({states[state]})' in line),
                None)
            location: Dict[str, Any] = {'city': city, 'country': 'BR'}
            if region:
                location['region'] = region
            return location
    return None
def calculate_confidence(name: str, context: str, urls: List[str], location: Optional[Dict]) -> float:
    """Score extraction confidence in [0.5, 0.95] from simple heuristics.

    Starts at 0.5 and adds fixed bonuses for corroborating signals:
    plausible name length, a website, a location, surrounding prose,
    and repeated mentions.
    """
    score = 0.5  # Base score for any accepted extraction.
    if 10 < len(name) < 100:  # Plausible institution-name length.
        score += 0.1
    if urls:  # Corroborated by at least one website.
        score += 0.15
    if location:  # Corroborated by a geographic match.
        score += 0.1
    if name in context and len(context) > 100:  # Embedded in real prose.
        score += 0.1
    if context.lower().count(name[:20].lower()) > 2:  # Mentioned repeatedly.
        score += 0.05
    return min(score, 0.95)  # Never claim near-certainty from NLP alone.
def parse_citations(citations: List[Dict], conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Create minimal institution records from citation source metadata.

    Each unseen citation source longer than five characters becomes a
    fixed-confidence (0.6) record pointing at the cited URL.
    `seen_names` is mutated in place for cross-source deduplication.
    """
    institutions: List[Dict[str, Any]] = []
    for citation in citations:
        title = citation.get('title', '')
        url = citation.get('url', '')
        for source_item in citation.get('sources', []):
            source = source_item.get('source', '')
            if not source or source in seen_names or len(source) <= 5:
                continue
            institutions.append({
                'id': f'https://w3id.org/heritage/custodian/br/{slugify(source)}',
                'name': source,
                'institution_type': classify_institution(source, title),
                'description': f'Institution identified from web citation: {title}',
                'identifiers': [{
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url,
                }],
                'locations': [{
                    'country': 'BR',
                }],
                'provenance': {
                    'data_source': 'CONVERSATION_NLP',
                    'data_tier': 'TIER_4_INFERRED',
                    'extraction_date': extraction_date,
                    'extraction_method': 'Python extraction from conversation citation metadata',
                    # Citation-only evidence gets a lower fixed confidence.
                    'confidence_score': 0.6,
                    'conversation_id': conversation_id,
                },
            })
            seen_names.add(source)
    return institutions
def main():
    """Main extraction workflow: read the conversation JSON, extract all
    institutions, print a per-type summary, and write the YAML output.

    NOTE(review): input/output paths are hard-coded absolute paths —
    presumably intentional for this one-off extraction run.
    """
    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_comprehensive.yaml')

    print(f"Extracting Brazilian institutions from: {input_file.name}")
    print(f"Output file: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)
    print(f"✓ Extracted {len(institutions)} institutions")

    # Tally records per institution type for the summary report.
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        type_counts[inst['institution_type']] = type_counts.get(inst['institution_type'], 0) + 1

    print(f"\nInstitution type breakdown:")
    for inst_type, count in sorted(type_counts.items(), key=lambda item: item[1], reverse=True):
        print(f" {inst_type}: {count}")

    # Write the YAML output, creating the target directory if needed.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=100)

    print(f"\n✓ Wrote {len(institutions)} records to {output_file}")
    print(f"\nSample institutions:")
    for inst in institutions[:5]:
        print(f" - {inst['name']} ({inst['institution_type']})")


if __name__ == '__main__':
    main()