409 lines
16 KiB
Python
409 lines
16 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Brazilian GLAM Institution Extractor
|
|
|
|
Extracts ALL heritage institutions from Brazilian conversation JSON and creates
|
|
LinkML-compliant YAML records following schema v0.2.0.
|
|
|
|
Expected output: 200+ institutions covering all 27 Brazilian federative units.
|
|
"""
|
|
|
|
import json
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml
|
|
|
|
|
|
def slugify(text: str) -> str:
    """Create a URL-safe slug (lowercase ASCII, hyphen-separated) from a name.

    Portuguese accented characters are folded to their ASCII equivalents,
    then every run of non-alphanumerics collapses into a single hyphen.

    Args:
        text: Institution name (may contain accents and punctuation).

    Returns:
        Slug of at most 50 characters with no leading or trailing hyphen.
    """
    # One-pass fold of the accented characters that appear in Brazilian
    # institution names (replaces six chained re.sub calls).
    accent_table = str.maketrans(
        'àáâãäåèéêëìíîïòóôõöùúûüç',
        'aaaaaaeeeeiiiiooooouuuuc',
    )
    text = text.lower().translate(accent_table)
    text = re.sub(r'[^a-z0-9]+', '-', text)
    text = text.strip('-')
    # Strip again AFTER truncating: the old order (strip, then [:50]) could
    # leave a slug ending in '-' when the 50-char cut landed on a hyphen.
    return text[:50].strip('-')
|
|
|
|
|
|
def classify_institution(name: str, description: str = "") -> str:
    """Classify an institution type from its name and optional description.

    Matching is keyword-based and order-sensitive: the first rule whose
    keyword list hits wins, so e.g. 'Biblioteca do Museu' is a MUSEUM.

    Returns:
        One of the schema type enum values, or 'MIXED' when nothing matches.
    """
    haystack = f"{name} {description}".lower()

    # (type label, keywords) pairs, checked in priority order.
    rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
        ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
        ('ARCHIVE', ('arquivo', 'archive', 'archiv')),
        ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                                  'fundação de cultura', 'instituto brasileiro')),
        ('EDUCATION_PROVIDER', ('universidade', 'university', 'usp', 'ufmg',
                                'unicamp', 'ufrj', 'ufba')),
        ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                             'laboratório', 'documentation')),
    )
    for label, keywords in rules:
        if any(keyword in haystack for keyword in keywords):
            return label
    return 'MIXED'  # Default when unclear
|
|
|
|
|
|
def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Extract all institutions from a Brazilian GLAM conversation JSON export.

    Collects markdown artifact bodies and citation metadata out of the
    conversation's tool_use messages, then delegates parsing to
    parse_artifact and parse_citations (sharing one dedup set).
    """
    with open(json_path, 'r', encoding='utf-8') as handle:
        data = json.load(handle)

    conversation_id = data.get('uuid', '')
    extraction_date = datetime.now(timezone.utc).isoformat()
    seen_names: set = set()  # Shared across parsers to deduplicate by name

    # Gather artifact markdown and citation metadata from every tool call.
    artifacts: List[str] = []
    citations: List[Dict[str, Any]] = []
    for message in data.get('chat_messages', []):
        for item in message.get('content', []):
            if item.get('type') != 'tool_use':
                continue
            tool_input = item.get('input', {})
            if 'content' in tool_input:
                artifacts.append(tool_input['content'])
            if 'md_citations' in tool_input:
                citations.extend(tool_input['md_citations'])

    results: List[Dict[str, Any]] = []
    for artifact in artifacts:
        results.extend(parse_artifact(artifact, conversation_id, extraction_date, seen_names))
    # Citation sources become minimal digital-platform records.
    results.extend(parse_citations(citations, conversation_id, extraction_date, seen_names))
    return results
|
|
|
|
|
|
def parse_artifact(content: str, conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Parse markdown artifact content to extract institutions.

    All six institution categories share one extraction loop driven by a
    (regex, minimum-name-length) table; previously the loop was copy-pasted
    six times with slightly inconsistent guards. Each pattern's original
    length threshold is preserved (federal: none, universities: 3, rest: 5).

    Args:
        content: Markdown artifact text (institution names appear in **bold**).
        conversation_id: Provenance id passed through to each record.
        extraction_date: ISO timestamp passed through to each record.
        seen_names: Names already emitted; mutated here to prevent duplicates.

    Returns:
        New institution records found in this artifact.
    """
    # (regex capturing the bolded name, minimum length the name must exceed)
    pattern_specs = [
        # Pattern 1: Federal institutions (IBRAM, IPHAN, Biblioteca Nacional, etc.)
        (r'\*\*([^*]+(?:Instituto Brasileiro de Museus|IBRAM|IPHAN|Biblioteca Nacional|Arquivo Nacional|Fundação Cultural Palmares)[^*]*)\*\*', 0),
        # Pattern 2: Museums
        (r'\*\*([^*]*(?:Museu|Museum|Memorial|Pinacoteca)[^*]*?)\*\*', 5),
        # Pattern 3: Libraries
        (r'\*\*([^*]*(?:Biblioteca|Library)[^*]*?)\*\*', 5),
        # Pattern 4: Archives
        (r'\*\*([^*]*(?:Arquivo|Archive)[^*]*?)\*\*', 5),
        # Pattern 5: Universities (shorter minimum so acronyms like 'USP' pass)
        (r'\*\*([^*]*(?:Universidade|University|USP|UFMG|UNICAMP|UFRJ|UFBA|UNIFAP|UFAC)[^*]*?)\*\*', 3),
        # Pattern 6: Digital platforms
        (r'\*\*([^*]*(?:Digital|Brasiliana|Hemeroteca|Tainacan|BNDigital)[^*]*?)\*\*', 5),
    ]

    institutions = []
    for pattern, min_len in pattern_specs:
        for match in re.finditer(pattern, content, re.IGNORECASE):
            name = match.group(1).strip()
            if len(name) > min_len and name not in seen_names:
                inst = create_institution_record(name, content, conversation_id, extraction_date)
                if inst:
                    institutions.append(inst)
                    seen_names.add(name)
    return institutions
|
|
|
|
|
|
def create_institution_record(name: str, context: str, conversation_id: str, extraction_date: str) -> Optional[Dict[str, Any]]:
    """Build a LinkML-compliant institution record, or None if the name is unusable.

    Names shorter than 4 characters (after whitespace normalisation) and
    generic section headings are rejected.
    NOTE(review): the heading filter matches substrings, so 'Federal' also
    rejects legitimate names like 'Universidade Federal ...' — confirm intended.
    """
    name = re.sub(r'\s+', ' ', name).strip()
    if len(name) < 4:
        return None

    # Reject generic artifact headings that the bold-text patterns also match.
    heading_markers = ['Executive Summary', 'State Infrastructure', 'Digital Systems',
                       'Collections', 'Federal', 'Contact']
    for marker in heading_markers:
        if marker in name:
            return None

    # Pull supporting evidence out of the surrounding markdown.
    description = extract_description(name, context)
    urls = extract_urls_for_institution(name, context)
    location = extract_location(name, context)

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slugify(name)}',
        'name': name,
        'institution_type': classify_institution(name, context),
        'description': description or f'Brazilian heritage institution: {name}',
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': extraction_date,
            'extraction_method': 'Python NLP extraction from Brazilian GLAM conversation artifact',
            'confidence_score': calculate_confidence(name, context, urls, location),
            'conversation_id': conversation_id,
        },
    }

    if location:
        record['locations'] = [location]
    if urls:
        # At most five website identifiers per institution.
        record['identifiers'] = [
            {'identifier_scheme': 'Website', 'identifier_value': url, 'identifier_url': url}
            for url in urls[:5]
        ]
    return record
|
|
|
|
|
|
def extract_description(name: str, context: str) -> Optional[str]:
    """Extract up to two descriptive sentences about the institution.

    Scans paragraphs that mention the (truncated) institution name and
    collects their first sentences of reasonable length (31-499 chars),
    stripped of bold markers.

    Returns:
        Up to two sentences joined by a space, or None when nothing matches.
    """
    collected: list = []
    probe = name[:20]  # Partial name tolerates suffix differences
    for block in context.split('\n\n'):
        if probe not in block:
            continue
        # Consider only the first three sentence-like chunks of the paragraph.
        for sentence in re.split(r'[.!?]+\s+', block)[:3]:
            if 30 < len(sentence) < 500:
                cleaned = sentence.replace('**', '').strip()
                if cleaned and cleaned not in collected:
                    collected.append(cleaned)
    return ' '.join(collected[:2]) if collected else None
|
|
|
|
|
|
def extract_urls_for_institution(name: str, context: str) -> List[str]:
    """Extract URLs mentioned on lines near the institution name.

    A line qualifies when it contains the first 15 chars of the name or the
    name's first word. Results are deduplicated in first-seen order.

    Args:
        name: Institution name (empty name matches every line).
        context: Full artifact text, scanned line by line.

    Returns:
        Unique URLs in the order first encountered (deterministic).
    """
    url_pattern = r'https?://[^\s<>"\')]+(?:\.[^\s<>"\')\]]+)+'

    # Guard: name.split()[0] raised IndexError for whitespace-only names.
    words = name.split()
    first_word = words[0] if words else ''

    found: List[str] = []
    for paragraph in context.split('\n'):
        if name[:15] in paragraph or (first_word and first_word in paragraph):
            found.extend(re.findall(url_pattern, paragraph))

    # dict.fromkeys dedupes while keeping first-seen order; the previous
    # list(set(...)) returned hash-randomised order that varied between runs.
    return list(dict.fromkeys(found))
|
|
|
|
|
|
def extract_location(name: str, context: str) -> Optional[Dict[str, Any]]:
    """Find a city (and, when possible, a state) mentioned near the institution.

    Scans line by line for the first known Brazilian capital appearing on a
    line that also mentions the institution; country is always 'BR'. The
    state is attached only when the same line names it (or its '(XX)' code).
    """
    # Brazilian capitals, checked in this fixed priority order.
    cities = ['São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
              'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
              'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
              'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
              'Aracaju', 'Palmas', 'Boa Vista', 'Vitória']

    # State name -> two-letter federative-unit code.
    states = {
        'Acre': 'AC', 'Alagoas': 'AL', 'Amapá': 'AP', 'Amazonas': 'AM',
        'Bahia': 'BA', 'Ceará': 'CE', 'Distrito Federal': 'DF', 'Espírito Santo': 'ES',
        'Goiás': 'GO', 'Maranhão': 'MA', 'Mato Grosso': 'MT', 'Mato Grosso do Sul': 'MS',
        'Minas Gerais': 'MG', 'Pará': 'PA', 'Paraíba': 'PB', 'Paraná': 'PR',
        'Pernambuco': 'PE', 'Piauí': 'PI', 'Rio de Janeiro': 'RJ', 'Rio Grande do Norte': 'RN',
        'Rio Grande do Sul': 'RS', 'Rondônia': 'RO', 'Roraima': 'RR', 'Santa Catarina': 'SC',
        'São Paulo': 'SP', 'Sergipe': 'SE', 'Tocantins': 'TO'
    }

    for line in context.split('\n'):
        if name[:15] not in line:
            continue
        for city in cities:
            if city not in line:
                continue
            result: Dict[str, Any] = {'city': city, 'country': 'BR'}
            # Attach the state when the same line names it or its code.
            for state_name, code in states.items():
                if state_name in line or f'({code})' in line:
                    result['region'] = state_name
                    break
            return result
    return None
|
|
|
|
|
|
def calculate_confidence(name: str, context: str, urls: List[str], location: Optional[Dict]) -> float:
    """Score extraction confidence in [0.5, 0.95] from available evidence.

    Starts at a 0.5 base and adds fixed bonuses for: a reasonably-sized
    name (+0.1), at least one URL (+0.15), a resolved location (+0.1), the
    name appearing in a non-trivial context (+0.1), and three or more
    mentions of the (truncated) name (+0.05). Capped at 0.95.
    """
    score = 0.5  # Base score

    if 10 < len(name) < 100:          # Name quality
        score += 0.1
    if urls:                          # Has URL
        score += 0.15
    if location:                      # Has location
        score += 0.1
    if len(context) > 100 and name in context:   # Described in context
        score += 0.1
    # Mentioned more than twice (case-insensitive, first 20 chars).
    if context.lower().count(name[:20].lower()) > 2:
        score += 0.05

    return min(score, 0.95)  # Cap at 0.95
|
|
|
|
|
|
def parse_citations(citations: List[Dict], conversation_id: str, extraction_date: str, seen_names: set) -> List[Dict[str, Any]]:
    """Create minimal institution records from citation source metadata.

    Each unseen citation source (name longer than 5 chars) becomes a
    TIER_4_INFERRED record carrying the citation URL as its only identifier
    and a fixed, lower 0.6 confidence score.

    Args:
        citations: Citation dicts with 'title', 'url' and 'sources' entries.
        conversation_id / extraction_date: Provenance passed into each record.
        seen_names: Mutated here so sources are only emitted once.
    """
    records: List[Dict[str, Any]] = []
    for citation in citations:
        title = citation.get('title', '')
        url = citation.get('url', '')

        for entry in citation.get('sources', []):
            source = entry.get('source', '')
            if not source or source in seen_names or len(source) <= 5:
                continue

            records.append({
                'id': f'https://w3id.org/heritage/custodian/br/{slugify(source)}',
                'name': source,
                'institution_type': classify_institution(source, title),
                'description': f'Institution identified from web citation: {title}',
                'identifiers': [{
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url,
                }],
                'locations': [{'country': 'BR'}],
                'provenance': {
                    'data_source': 'CONVERSATION_NLP',
                    'data_tier': 'TIER_4_INFERRED',
                    'extraction_date': extraction_date,
                    'extraction_method': 'Python extraction from conversation citation metadata',
                    # Citation-only evidence gets a lower fixed confidence.
                    'confidence_score': 0.6,
                    'conversation_id': conversation_id,
                },
            })
            seen_names.add(source)

    return records
|
|
|
|
|
|
def main() -> None:
    """Main extraction workflow.

    Usage: script.py [INPUT_JSON [OUTPUT_YAML]]
    Falls back to the historical hard-coded paths when arguments are
    omitted, so existing invocations keep working.
    """
    # Previously hard-coded to one developer's machine; now overridable
    # via command-line arguments with the old paths as defaults.
    default_input = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    default_output = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_comprehensive.yaml')
    input_file = Path(sys.argv[1]) if len(sys.argv) > 1 else default_input
    output_file = Path(sys.argv[2]) if len(sys.argv) > 2 else default_output

    print(f"Extracting Brazilian institutions from: {input_file.name}")
    print(f"Output file: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)

    print(f"✓ Extracted {len(institutions)} institutions")

    # Count by type (Counter replaces the manual dict-accumulation loop).
    type_counts = Counter(inst['institution_type'] for inst in institutions)

    print(f"\nInstitution type breakdown:")
    for inst_type, count in type_counts.most_common():
        print(f"  {inst_type}: {count}")

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False, width=100)

    print(f"\n✓ Wrote {len(institutions)} records to {output_file}")
    print(f"\nSample institutions:")
    for inst in institutions[:5]:
        print(f"  - {inst['name']} ({inst['institution_type']})")
|
|
|
|
|
|
# Script entry point: run the full extraction when invoked directly.
if __name__ == '__main__':
    main()
|