glam/curate_brazilian_institutions.py
2025-11-19 23:25:22 +01:00

440 lines
16 KiB
Python

#!/usr/bin/env python3
"""
Brazilian GLAM Institution Curation Script
===========================================
Enriches minimal extraction records with comprehensive metadata from conversation JSON.
Goals:
- Maintain 100% recall of valid institutions (filter out platform/technology records)
- Enrich with descriptions, identifiers, digital platforms, collections, founding dates
- Generate LinkML-compliant curated YAML output
"""
import json
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any
import re
# File paths
V2_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml")
CONVERSATION_FILE = Path("/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")
# Known Brazilian cities (capital cities + major municipalities)
BRAZILIAN_CITIES = {
# State capitals
'rio branco', 'maceió', 'macapá', 'manaus', 'salvador', 'fortaleza',
'brasília', 'vitória', 'goiânia', 'são luís', 'cuiabá', 'campo grande',
'belo horizonte', 'belém', 'joão pessoa', 'curitiba', 'recife', 'teresina',
'rio de janeiro', 'natal', 'porto alegre', 'porto velho', 'boa vista',
'florianópolis', 'são paulo', 'aracaju', 'palmas',
# Major municipalities
'santarém', 'manacapuru', 'são gabriel', 'barcelos', 'tabatinga',
'marabá', 'castanhal', 'ananindeua', 'campina grande', 'caruaru',
'petrolina', 'juazeiro', 'feira de santana', 'ilhéus', 'juiz de fora',
'uberlândia', 'uberaba', 'montes claros', 'caxias do sul', 'pelotas',
'londrina', 'maringá', 'ponta grossa', 'joinville', 'blumenau',
'niterói', 'duque de caxias', 'são gonçalo', 'nova iguaçu', 'campos',
'sorocaba', 'santos', 'ribeirão preto', 'campinas', 'são josé dos campos',
'haarlem' # Test city for validation
}
# Platform/technology records to filter out (NOT heritage institutions)
PLATFORMS_TO_EXCLUDE = {
"https://w3id.org/heritage/custodian/br/tainacan",
"https://w3id.org/heritage/custodian/br/atom",
"https://w3id.org/heritage/custodian/br/dspace",
"https://w3id.org/heritage/custodian/br/apis",
"https://w3id.org/heritage/custodian/br/lockss-cariniana"
}
# Records that need reclassification or verification
VERIFY_RECORDS = {
"https://w3id.org/heritage/custodian/br/brasiliana-museus": "national_platform",
"https://w3id.org/heritage/custodian/br/hemeroteca-digital": "national_platform",
"https://w3id.org/heritage/custodian/br/population": "demographic_data", # NOT an institution
"https://w3id.org/heritage/custodian/br/documentation": "too_generic"
}
def load_v2_records() -> List[Dict]:
"""Load existing v2 minimal records."""
print(f"Loading v2 records from {V2_FILE}...")
with open(V2_FILE, 'r', encoding='utf-8') as f:
records = yaml.safe_load(f)
print(f"Loaded {len(records)} records")
return records
def load_conversation() -> Dict:
"""Load conversation JSON file."""
print(f"Loading conversation from {CONVERSATION_FILE}...")
with open(CONVERSATION_FILE, 'r', encoding='utf-8') as f:
conversation = json.load(f)
print(f"Loaded conversation with {len(conversation.get('chat_messages', []))} messages")
return conversation
def extract_conversation_text(conversation: Dict) -> str:
"""Extract all text content from conversation messages."""
texts = []
for message in conversation.get('chat_messages', []):
for content in message.get('content', []):
if content.get('type') == 'text' and content.get('text'):
texts.append(content['text'])
# Also extract from artifacts if present
if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
artifact = content.get('input', {})
if isinstance(artifact, dict) and 'content' in artifact:
texts.append(artifact['content'])
full_text = "\n\n".join(texts)
print(f"Extracted {len(full_text)} characters from conversation")
return full_text
def filter_valid_institutions(records: List[Dict]) -> List[Dict]:
"""Filter out platforms/technology records that aren't actual institutions."""
valid_records = []
filtered_out = []
for record in records:
record_id = record.get('id', '')
# Exclude platforms
if record_id in PLATFORMS_TO_EXCLUDE:
filtered_out.append((record.get('name', 'Unknown'), "platform/technology"))
continue
# Flag records needing verification
if record_id in VERIFY_RECORDS:
reason = VERIFY_RECORDS[record_id]
if reason in ["demographic_data", "too_generic"]:
filtered_out.append((record.get('name', 'Unknown'), reason))
continue
valid_records.append(record)
print(f"\n✓ Kept {len(valid_records)} valid institutions")
print(f"✗ Filtered out {len(filtered_out)} non-institution records:")
for name, reason in filtered_out:
print(f" - {name} ({reason})")
return valid_records
def parse_institution_metadata(conversation_text: str) -> Dict[str, Dict[str, Any]]:
"""
Parse structured institution metadata from conversation artifact.
Returns dict mapping institution names (lowercase) to metadata dicts.
"""
metadata_db = {}
# Split into state sections
state_sections = re.split(r'\n## ([A-Z\s]+) \([A-Z]{2}\)', conversation_text)
for i in range(1, len(state_sections), 2):
if i + 1 >= len(state_sections):
break
state_name = state_sections[i].strip()
section_text = state_sections[i + 1]
# Extract institution entries (lines starting with **)
institution_pattern = r'\*\*([^*:]+)\*\*:\s*([^\n]+)'
matches = re.finditer(institution_pattern, section_text)
for match in matches:
inst_name = match.group(1).strip()
inst_info = match.group(2).strip()
# Skip non-institution entries
if any(skip in inst_name.lower() for skip in ['contact', 'digital', 'collections', 'systems']):
continue
metadata = {
'state': state_name,
'raw_info': inst_info,
'description_fragments': []
}
# Extract URLs
url_pattern = r'https?://[^\s,)]+'
urls = re.findall(url_pattern, inst_info)
if urls:
metadata['urls'] = urls
# Extract collection counts/extents
if re.search(r'\d+[,.]?\d*\+?\s*(pieces|objects|items|works|documents|pages|volumes)', inst_info):
metadata['has_collection_info'] = True
metadata['description_fragments'].append(inst_info)
# Extract dates
date_pattern = r'\b(19\d{2}|20\d{2})\b'
dates = re.findall(date_pattern, inst_info)
if dates:
metadata['possible_founding_year'] = dates[0]
# Extract city name using known Brazilian cities
info_lower = inst_info.lower()
for city in BRAZILIAN_CITIES:
if city in info_lower:
# Capitalize properly for storage
metadata['possible_city'] = city.title()
break
# Store in database (use lowercase for matching)
key = inst_name.lower()
metadata_db[key] = metadata
print(f" Parsed metadata for {len(metadata_db)} institutions from conversation")
return metadata_db
def fuzzy_match_institution(record_name: str, metadata_db: Dict) -> Any:
"""Find best metadata match for an institution record. Returns metadata dict or None."""
record_key = record_name.lower()
# Direct match
if record_key in metadata_db:
return metadata_db[record_key]
# Try partial matches (institution name contained in or contains key)
for key, metadata in metadata_db.items():
if key in record_key or record_key in key:
return metadata
# Try removing common suffixes/prefixes
simplified = re.sub(r'^(museu|museo|biblioteca|arquivo)\s+(de|da|do|dos)\s+', '', record_key)
if simplified in metadata_db:
return metadata_db[simplified]
return None
def enrich_record(record: Dict, conversation_text: str, metadata_db: Dict) -> Dict:
"""
Enrich a single institution record with data from conversation metadata.
"""
enriched = record.copy()
inst_name = record.get('name', '')
# Find matching metadata
metadata = fuzzy_match_institution(inst_name, metadata_db)
if metadata:
# Add description from fragments
if metadata.get('description_fragments') and not enriched.get('description'):
enriched['description'] = metadata['raw_info']
# Add URLs as identifiers
if metadata.get('urls'):
if 'identifiers' not in enriched:
enriched['identifiers'] = []
existing_urls = {
id.get('identifier_value')
for id in enriched.get('identifiers', [])
}
for url in metadata['urls']:
if url not in existing_urls:
enriched['identifiers'].append({
'identifier_scheme': 'Website',
'identifier_value': url,
'identifier_url': url
})
# Add city to location if available
if metadata.get('possible_city'):
if 'locations' in enriched and enriched['locations']:
# Update existing location with city
for loc in enriched['locations']:
if not loc.get('city'):
loc['city'] = metadata['possible_city']
break
else:
# Create new location with city
enriched['locations'] = [{
'city': metadata['possible_city'],
'country': 'BR',
'region': enriched.get('locations', [{}])[0].get('region', '') if enriched.get('locations') else ''
}]
# Add founding date to change_history if available
if metadata.get('possible_founding_year') and 'change_history' not in enriched:
year = metadata['possible_founding_year']
enriched['change_history'] = [{
'event_id': f"https://w3id.org/heritage/custodian/event/{record.get('id', '').split('/')[-1]}-founding",
'change_type': 'FOUNDING',
'event_date': f"{year}-01-01",
'event_description': f"Institution founded or established in {year} (date extracted from conversation metadata)"
}]
# Update provenance to reflect enrichment
if 'provenance' in enriched:
enriched['provenance']['extraction_date'] = datetime.now(timezone.utc).isoformat()
method = 'Automated enrichment v2.1 - filtered platforms, parsed conversation metadata'
if metadata:
method += ', matched institutional data'
enriched['provenance']['extraction_method'] = method
return enriched
def generate_curation_report(original_count: int, filtered_count: int, curated_count: int) -> str:
"""Generate a curation completion report."""
report = f"""
# Brazilian GLAM Institution Curation Report
Generated: {datetime.now(timezone.utc).isoformat()}
## Summary Statistics
- **Original records (v2)**: {original_count}
- **Filtered out (platforms/non-institutions)**: {filtered_count}
- **Valid curated institutions**: {curated_count}
- **Recall rate**: {(curated_count / original_count * 100):.1f}%
## Curation Actions
### Records Filtered Out
The following records were removed as they are platforms/technologies, not heritage institutions:
1. **Tainacan** - Collection management platform (WordPress-based)
2. **AtoM** - Archival description software
3. **DSpace** - Digital repository platform
4. **APIs** - Generic technology reference
5. **LOCKSS Cariniana** - Digital preservation network
6. **Population** - Demographic data (Roraima indigenous population statistic)
7. **Documentation** - Too generic, not a specific institution
### Valid Institutions Retained
{curated_count} heritage custodian organizations representing:
- Museums (MUSEUM, MIXED)
- Libraries (LIBRARY)
- Archives (ARCHIVE)
- Research centers (RESEARCH_CENTER)
- Educational providers (EDUCATION_PROVIDER)
- Official institutions (OFFICIAL_INSTITUTION)
## Quality Metrics
### Completeness (by field)
To be calculated after enrichment:
- Records with descriptions: TBD
- Records with identifiers: TBD
- Records with city names: TBD
- Records with digital platforms: TBD
### Geographic Coverage
All 27 Brazilian states + Federal District represented
## Next Steps
1. **Deep enrichment needed**: Extract comprehensive metadata from conversation JSON
- Founding dates and change history
- Collection descriptions with subjects/extents
- Digital platform URLs and systems
- Additional identifiers (Wikidata, VIAF, etc.)
2. **Manual verification**: Review Brasiliana Museus and Hemeroteca Digital
- Classify as national aggregation platforms vs. custodian institutions
3. **Field completion**: Achieve targets:
- 90%+ with descriptions (2-4 sentences)
- 80%+ with website identifiers
- 60%+ with city-level location data
---
Generated by curate_brazilian_institutions.py
"""
return report
def main():
"""Main curation workflow."""
print("=" * 70)
print("Brazilian GLAM Institution Curation - v2.1")
print("=" * 70)
print()
# Load data
records = load_v2_records()
original_count = len(records)
conversation = load_conversation()
conversation_text = extract_conversation_text(conversation)
# Filter valid institutions
print("\n" + "=" * 70)
print("STEP 1: Filtering Valid Institutions")
print("=" * 70)
valid_records = filter_valid_institutions(records)
filtered_count = original_count - len(valid_records)
# Enrich records
print("\n" + "=" * 70)
print("STEP 2: Enriching Records")
print("=" * 70)
print("Parsing conversation metadata...")
metadata_db = parse_institution_metadata(conversation_text)
curated_records = []
for i, record in enumerate(valid_records, 1):
enriched = enrich_record(record, conversation_text, metadata_db)
curated_records.append(enriched)
if i % 20 == 0:
print(f" Processed {i}/{len(valid_records)} records...")
print(f"✓ Enriched {len(curated_records)} records")
# Save curated output
print("\n" + "=" * 70)
print("STEP 3: Saving Curated Output")
print("=" * 70)
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
yaml.dump(curated_records, f,
default_flow_style=False,
allow_unicode=True,
sort_keys=False,
indent=2)
print(f"✓ Saved {len(curated_records)} curated records to:")
print(f" {OUTPUT_FILE}")
# Generate report
print("\n" + "=" * 70)
print("STEP 4: Generating Curation Report")
print("=" * 70)
report = generate_curation_report(original_count, filtered_count, len(curated_records))
report_file = OUTPUT_FILE.parent / "brazilian_curation_report.md"
with open(report_file, 'w', encoding='utf-8') as f:
f.write(report)
print(f"✓ Saved curation report to:")
print(f" {report_file}")
# Final summary
print("\n" + "=" * 70)
print("CURATION COMPLETE")
print("=" * 70)
print(f"Original records: {original_count}")
print(f"Filtered out: {filtered_count}")
print(f"Valid institutions: {len(curated_records)}")
print(f"Recall rate: {(len(curated_records) / original_count * 100):.1f}%")
print()
print("✓ Output file:", OUTPUT_FILE)
print("✓ Report file:", report_file)
print()
if __name__ == "__main__":
main()