371 lines
14 KiB
Python
371 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Brazilian GLAM Institution Extractor v2.0
|
|
|
|
Improved extraction focusing on structured bullet-point lists from the state-by-state
|
|
artifact section. Extracts actual institution names (not sentence fragments).
|
|
|
|
Expected output: 150-200 quality institutions with proper names, types, and metadata.
|
|
|
|
Source file: /Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json
|
|
"""
|
|
|
|
import json
import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import yaml
|
|
|
|
|
|
def slugify(text: str) -> str:
    """Create a URL-safe slug from an institution name.

    Lowercases, strips diacritics via Unicode NFKD decomposition (covers
    every accented Latin character, not just the hand-picked subset the
    previous version mapped), collapses remaining non-alphanumeric runs
    into single hyphens, and truncates to 50 characters without leaving
    a dangling hyphen at the cut point.
    """
    text = text.lower()
    # NFKD splits an accented character into base letter + combining marks;
    # dropping the marks handles ç, ã, é, ñ, ý, … uniformly.
    text = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in text if not unicodedata.combining(ch))
    text = re.sub(r'[^a-z0-9]+', '-', text)
    # Strip, truncate, then strip again: a cut landing on a hyphen would
    # otherwise leave a trailing '-' in the slug.
    return text.strip('-')[:50].strip('-')
|
|
|
|
|
|
def classify_institution(name: str, description: str = "") -> str:
    """Classify an institution's type from its name and description.

    The name is consulted first because it is the most reliable signal;
    the combined name + description text is only used as a fallback for
    acronyms and ambiguous names, since descriptions may contain
    misleading keywords.  Falls through to 'MIXED' when nothing matches.
    """
    name_lower = name.lower()

    # Strong, name-only indicators take absolute priority.
    name_rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca')),
        ('ARCHIVE', ('arquivo', 'archiv')),
        ('LIBRARY', ('biblioteca', 'library', 'bibliotheca')),
    )
    for label, keywords in name_rules:
        if any(kw in name_lower for kw in keywords):
            return label

    # Fallback: scan name + description together, first rule that hits wins.
    text = name_lower + " " + description.lower()
    text_rules = (
        ('MUSEUM', ('museu', 'museum', 'memorial', 'pinacoteca',
                    'casa de cultura', 'mam-', 'mam ', 'marco', 'musear')),
        ('ARCHIVE', ('arquivo', 'archiv')),
        ('LIBRARY', ('biblioteca', 'bce')),
        ('OFFICIAL_INSTITUTION', ('ibram', 'iphan', 'secult', 'secretaria',
                                  'fundação de cultura', 'instituto brasileiro',
                                  'fpc/', 'ipac', 'unesco')),
        ('EDUCATION_PROVIDER', ('universidade', 'university', 'ufac', 'ufal',
                                'unifap', 'ufpa', 'usp', 'ufmg', 'unicamp',
                                'ufrj', 'ufba', 'ufam', 'ufc', 'unb', 'ufg',
                                'ufma', 'ufmt', 'ufms', 'ufpe', 'ufpi',
                                'ufrn', 'ufpb', 'ufrgs', 'ufpr', 'ufsc',
                                'ufes', 'repository', 'repositories')),
        ('RESEARCH_CENTER', ('centro de pesquisa', 'research center',
                             'laboratório', 'documentation', 'cepap',
                             'instituto histórico', 'instituto geográfico')),
        ('MIXED', ('teatro', 'centro cultural', 'ccbb', 'centro dragão',
                   'geopark', 'casa das', 'projects', 'projetos')),
    )
    for label, keywords in text_rules:
        if any(kw in text for kw in keywords):
            return label
    return 'MIXED'
|
|
|
|
|
|
def parse_state_sections(content: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse state-by-state sections to extract institution records.

    The artifact text is organised under headers of the form
    ``## STATE NAME (XX)``.  ``re.split`` with two capture groups yields
    chunks in groups of three — [preamble, name, code, body, name, code,
    body, ...] — which the modulo-3 walk below reassembles.

    (The previous version also built a full state-name→code mapping that
    was never read; that dead code has been removed.)

    Args:
        content: Full markdown artifact text.
        source_file_path: Absolute path of the source JSON, recorded in
            each record's provenance.

    Returns:
        Flat list of institution records from every recognised state section.
    """
    institutions: List[Dict[str, Any]] = []

    # Header pattern: "## STATE NAME (XX)" — group 1 = name, group 2 = code.
    state_pattern = r'## ([A-ZÀÁÂÃÄÅÈÉÊËÌÍÎÏÒÓÔÕÖÙÚÛÜ\s]+) \(([A-Z]{2})\)'
    state_sections = re.split(state_pattern, content)

    current_state = None
    current_state_code = None

    for i, section in enumerate(state_sections):
        if i % 3 == 1:    # state name capture (positions 1, 4, 7, ...)
            current_state = section.strip()
        elif i % 3 == 2:  # state code capture (positions 2, 5, 8, ...)
            current_state_code = section.strip()
        elif i > 0:       # body following a header (i % 3 == 0, not preamble)
            # Only parse once both header captures have been seen.
            if current_state and current_state_code:
                institutions.extend(parse_institutions_from_section(
                    section,
                    current_state,
                    current_state_code,
                    source_file_path,
                ))

    return institutions
|
|
|
|
|
|
def parse_institutions_from_section(section: str, state_name: str, state_code: str, source_file_path: str) -> List[Dict[str, Any]]:
    """Parse individual institutions from one state's markdown section.

    Recognises bullet lines of the form ``- **Institution Name**: details``
    (an optional parenthesised acronym after the name is discarded),
    filters out section headers, contact lines, and implausible names,
    then builds a LinkML-style record for each surviving entry.
    """
    # Bullet with a bold institution name; group 1 = name, group 2 = details.
    # Compiled once here instead of re-matched per line (the previous
    # re.MULTILINE flag was a no-op on single-line input and is dropped).
    bullet_re = re.compile(
        r'^[\s]*[-•]\s*\*\*([^*:]+?)(?:\s*\([^)]+\))?\*\*:?\s*(.*)$')

    # Generic labels that are sub-headers rather than institution names;
    # hoisted out of the loop so the tuple is built only once.
    skip_words = ('State Infrastructure', 'Digital Systems', 'Collections',
                  'Digital Initiatives', 'Contact', 'Federal', 'Technical',
                  'Metadata', 'Notable', 'Key Features', 'Preservation',
                  'Ongoing', 'Major', 'Systems', 'Coverage', 'Database')

    institutions: List[Dict[str, Any]] = []

    for line in section.split('\n'):
        match = bullet_re.match(line)
        if not match:
            continue

        name = match.group(1).strip()
        details = match.group(2).strip()

        # Skip section headers and generic labels.
        if any(skip in name for skip in skip_words):
            continue

        # Skip contact entries (Brazilian phone format "(xx) n...").
        if re.search(r'\(\d{2}\)\s*\d', name):
            continue

        # Skip implausibly short or long names.
        if not 3 <= len(name) <= 150:
            continue

        # First URL in the details.  Trailing markdown punctuation is
        # trimmed — the regex would otherwise keep a closing ")" or "."
        # from prose like "(https://example.org)".
        url_match = re.search(r'https?://[^\s,]+', details)
        url = url_match.group(0).rstrip(').,;') if url_match else None

        # Extract city if mentioned in the details or name.
        city = extract_city_from_details(details, name)

        inst = create_institution_record_v2(
            name=name,
            description=details,
            state_name=state_name,
            state_code=state_code,
            city=city,
            url=url,
            source_file_path=source_file_path,
        )
        if inst:
            institutions.append(inst)

    return institutions
|
|
|
|
|
|
def extract_city_from_details(details: str, name: str) -> Optional[str]:
    """Return the first known Brazilian city found in the details or name.

    Cities are tested in a fixed order and the first substring hit wins;
    ``None`` is returned when no known city appears in either string.
    """
    known_cities = (
        'São Paulo', 'Rio de Janeiro', 'Brasília', 'Salvador', 'Belo Horizonte',
        'Curitiba', 'Recife', 'Porto Alegre', 'Manaus', 'Fortaleza',
        'Rio Branco', 'Maceió', 'Macapá', 'Belém', 'Goiânia', 'Campo Grande',
        'Cuiabá', 'João Pessoa', 'Teresina', 'Natal', 'Florianópolis',
        'Aracaju', 'Palmas', 'Boa Vista', 'Vitória', 'São Luís', 'Campinas',
        'Santos', 'Niterói', 'Ouro Preto', 'Petrópolis', 'Paraty', 'Olinda',
        'Tiradentes', 'Diamantina', 'Cachoeira', 'São Cristóvão',
    )
    return next(
        (city for city in known_cities if city in details or city in name),
        None,
    )
|
|
|
|
|
|
def create_institution_record_v2(
    name: str,
    description: str,
    state_name: str,
    state_code: str,
    city: Optional[str],
    url: Optional[str],
    source_file_path: str
) -> Optional[Dict[str, Any]]:
    """Build a LinkML-compliant institution record from structured fields.

    The record id is a w3id URI derived from a slug of the cleaned name,
    the institution type comes from keyword classification, and a
    heuristic confidence score (0.7 base, capped at 0.95) rewards the
    presence of a URL, a city, and a substantive description.
    """
    # Collapse internal whitespace in the name before slugging/recording.
    name = re.sub(r'\s+', ' ', name).strip()

    # Heuristic confidence.  The addition order matches the original
    # implementation so float results are bit-identical.
    confidence = 0.7
    if url:
        confidence += 0.1
    if city:
        confidence += 0.05
    if len(description) > 50:
        confidence += 0.1
    confidence = min(confidence, 0.95)

    location: Dict[str, str] = {'country': 'BR', 'region': state_name}
    if city:
        location['city'] = city

    record: Dict[str, Any] = {
        'id': f'https://w3id.org/heritage/custodian/br/{slugify(name)}',
        'name': name,
        'institution_type': classify_institution(name, description),
        'locations': [location],
        'provenance': {
            'data_source': 'CONVERSATION_NLP',
            'data_tier': 'TIER_4_INFERRED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Python structured extraction from Brazilian state-by-state artifact v2.0',
            'confidence_score': confidence,
            'conversation_id': '2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5',
            'source_url': f'file://{source_file_path}',
        },
    }

    # Keep the description only if it still says something after URL
    # removal; cap it at 500 characters.
    if description and len(description) > 10:
        desc = re.sub(r'https?://[^\s,]+', '', description).strip()
        if len(desc) > 500:
            desc = desc[:497] + '...'
        if desc:
            record['description'] = desc

    # Record the website URL as an identifier when one was found.
    if url:
        record['identifiers'] = [{
            'identifier_scheme': 'Website',
            'identifier_value': url,
            'identifier_url': url,
        }]

    return record
|
|
|
|
|
|
def extract_institutions_from_conversation(json_path: Path) -> List[Dict[str, Any]]:
    """Load the conversation JSON and extract unique institution records.

    Walks every chat message, collects the markdown body of each
    ``tool_use`` artifact, parses the state-by-state sections, and
    de-duplicates records by institution name (first occurrence wins).
    """
    with open(json_path, 'r', encoding='utf-8') as fh:
        conversation = json.load(fh)

    # Gather every tool_use artifact body (the markdown inventories).
    markdown_artifacts = [
        item['input']['content']
        for message in conversation.get('chat_messages', [])
        for item in message.get('content', [])
        if item.get('type') == 'tool_use' and 'content' in item.get('input', {})
    ]

    # Absolute path string used in each record's provenance source_url.
    source_file_path = str(json_path.resolve())

    unique: List[Dict[str, Any]] = []
    seen_names = set()
    for artifact in markdown_artifacts:
        for inst in parse_state_sections(artifact, source_file_path):
            if inst['name'] in seen_names:
                continue
            seen_names.add(inst['name'])
            unique.append(inst)

    return unique
|
|
|
|
|
|
def main():
    """Main extraction workflow: extract, summarise, and write YAML.

    Fix over the previous version: every summary statistic divides by
    ``len(institutions)``, so an empty extraction crashed with
    ZeroDivisionError — it now exits early with a message instead.
    """
    input_file = Path('/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json')
    output_file = Path('/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml')

    print("Brazilian GLAM Institution Extractor v2.0")
    print("=" * 60)
    print(f"Input: {input_file.name}")
    print(f"Output: {output_file}")
    print()

    institutions = extract_institutions_from_conversation(input_file)

    print(f"✓ Extracted {len(institutions)} institutions")
    print()

    # Guard against an empty extraction before computing ratios/averages.
    if not institutions:
        print("No institutions extracted — nothing to write.")
        return

    # Type breakdown
    type_counts: Dict[str, int] = {}
    for inst in institutions:
        inst_type = inst['institution_type']
        type_counts[inst_type] = type_counts.get(inst_type, 0) + 1

    print("Institution Type Distribution:")
    for inst_type, count in sorted(type_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"  {inst_type:25s}: {count:3d}")
    print()

    # Geographic coverage: distinct regions among each record's first location.
    states_covered = {
        inst['locations'][0]['region']
        for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('region')
    }
    print("Geographic Coverage:")
    print(f"  States covered: {len(states_covered)}/27")
    print()

    # URL coverage
    with_urls = sum(1 for inst in institutions if inst.get('identifiers'))
    print("Data Quality:")
    print(f"  Institutions with URLs: {with_urls}/{len(institutions)} ({100*with_urls/len(institutions):.1f}%)")

    # Average confidence
    avg_confidence = sum(inst['provenance']['confidence_score'] for inst in institutions) / len(institutions)
    print(f"  Average confidence: {avg_confidence:.3f}")
    print()

    # Write output
    output_file.parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False,
                  default_flow_style=False, width=120)

    print(f"✓ Wrote {len(institutions)} records to {output_file}")
    print()
    print("Sample Institutions:")
    for inst in institutions[:10]:
        city = inst['locations'][0].get('city', '(state level)')
        print(f"  - {inst['name'][:50]:50s} | {inst['institution_type']:12s} | {city}")
|
|
|
|
|
|
# Run only when executed as a script, so the module can be imported
# (e.g. for testing) without triggering the extraction workflow.
if __name__ == '__main__':
    main()
|