#!/usr/bin/env python3
|
|
"""
|
|
Brazilian GLAM Institution Curation Script
|
|
===========================================
|
|
|
|
Enriches minimal extraction records with comprehensive metadata from conversation JSON.
|
|
|
|
Goals:
|
|
- Maintain 100% recall of valid institutions (filter out platform/technology records)
|
|
- Enrich with descriptions, identifiers, digital platforms, collections, founding dates
|
|
- Generate LinkML-compliant curated YAML output
|
|
"""
|
|
|
|
import json
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any
|
|
import re
|
|
|
|
# File paths
# NOTE: absolute, machine-specific paths — adjust when running on another host.
V2_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_v2.yaml")
CONVERSATION_FILE = Path("/Users/kempersc/apps/glam/2025-09-22T14-40-15-0102c00a-4c0a-4488-bdca-5dd9fb94c9c5-Brazilian_GLAM_collection_inventories.json")
OUTPUT_FILE = Path("/Users/kempersc/apps/glam/data/instances/brazilian_institutions_curated_v2.yaml")

# Known Brazilian cities (capital cities + major municipalities).
# All entries are lowercase: parse_institution_metadata() matches them as
# case-insensitive substrings of the institution info line.
BRAZILIAN_CITIES = {
    # State capitals
    'rio branco', 'maceió', 'macapá', 'manaus', 'salvador', 'fortaleza',
    'brasília', 'vitória', 'goiânia', 'são luís', 'cuiabá', 'campo grande',
    'belo horizonte', 'belém', 'joão pessoa', 'curitiba', 'recife', 'teresina',
    'rio de janeiro', 'natal', 'porto alegre', 'porto velho', 'boa vista',
    'florianópolis', 'são paulo', 'aracaju', 'palmas',
    # Major municipalities
    'santarém', 'manacapuru', 'são gabriel', 'barcelos', 'tabatinga',
    'marabá', 'castanhal', 'ananindeua', 'campina grande', 'caruaru',
    'petrolina', 'juazeiro', 'feira de santana', 'ilhéus', 'juiz de fora',
    'uberlândia', 'uberaba', 'montes claros', 'caxias do sul', 'pelotas',
    'londrina', 'maringá', 'ponta grossa', 'joinville', 'blumenau',
    'niterói', 'duque de caxias', 'são gonçalo', 'nova iguaçu', 'campos',
    'sorocaba', 'santos', 'ribeirão preto', 'campinas', 'são josé dos campos',
    # NOTE(review): 'haarlem' is a Dutch city labelled "Test city for
    # validation" — likely leftover test data; confirm before production use.
    'haarlem'  # Test city for validation
}

# Platform/technology record IDs to filter out (NOT heritage institutions).
# Matched exactly against each record's 'id' field in filter_valid_institutions().
PLATFORMS_TO_EXCLUDE = {
    "https://w3id.org/heritage/custodian/br/tainacan",
    "https://w3id.org/heritage/custodian/br/atom",
    "https://w3id.org/heritage/custodian/br/dspace",
    "https://w3id.org/heritage/custodian/br/apis",
    "https://w3id.org/heritage/custodian/br/lockss-cariniana"
}

# Records that need reclassification or verification.
# Maps record ID -> reason tag. Records tagged "demographic_data" or
# "too_generic" are dropped by filter_valid_institutions(); other tags
# (e.g. "national_platform") are kept but flagged for manual review.
VERIFY_RECORDS = {
    "https://w3id.org/heritage/custodian/br/brasiliana-museus": "national_platform",
    "https://w3id.org/heritage/custodian/br/hemeroteca-digital": "national_platform",
    "https://w3id.org/heritage/custodian/br/population": "demographic_data",  # NOT an institution
    "https://w3id.org/heritage/custodian/br/documentation": "too_generic"
}
def load_v2_records() -> List[Dict]:
    """Load the minimal v2 institution records from the YAML file.

    Returns:
        A list of record dicts. An empty/blank YAML file yields an empty
        list rather than None (yaml.safe_load returns None for empty input,
        which would otherwise crash the len() call below).
    """
    print(f"Loading v2 records from {V2_FILE}...")
    with open(V2_FILE, 'r', encoding='utf-8') as f:
        # Coalesce None (empty document) to an empty list.
        records = yaml.safe_load(f) or []
    print(f"Loaded {len(records)} records")
    return records
def load_conversation() -> Dict:
    """Read and parse the exported conversation JSON file."""
    print(f"Loading conversation from {CONVERSATION_FILE}...")
    raw = CONVERSATION_FILE.read_text(encoding='utf-8')
    conversation = json.loads(raw)
    message_count = len(conversation.get('chat_messages', []))
    print(f"Loaded conversation with {message_count} messages")
    return conversation
def extract_conversation_text(conversation: Dict) -> str:
    """Collect all text content from the conversation's messages.

    Gathers every non-empty 'text' content block, plus the 'content' field
    of any 'artifacts' tool invocation, and joins the pieces with blank
    lines. Prints the total character count as a progress message.
    """
    pieces = []
    for message in conversation.get('chat_messages', []):
        for part in message.get('content', []):
            kind = part.get('type')
            if kind == 'text' and part.get('text'):
                pieces.append(part['text'])
            # Artifact bodies also carry institution metadata worth parsing.
            elif kind == 'tool_use' and part.get('name') == 'artifacts':
                payload = part.get('input', {})
                if isinstance(payload, dict) and 'content' in payload:
                    pieces.append(payload['content'])

    full_text = "\n\n".join(pieces)
    print(f"Extracted {len(full_text)} characters from conversation")
    return full_text
def filter_valid_institutions(records: List[Dict]) -> List[Dict]:
    """Drop records that are software platforms or non-institution data.

    Keeps every genuine heritage institution (100% recall goal) while
    removing IDs listed in PLATFORMS_TO_EXCLUDE and records whose
    VERIFY_RECORDS tag marks them as demographic data or too generic.
    Prints a summary of what was kept and removed.
    """
    kept = []
    dropped = []

    for rec in records:
        rec_id = rec.get('id', '')
        rec_name = rec.get('name', 'Unknown')
        verify_tag = VERIFY_RECORDS.get(rec_id)

        if rec_id in PLATFORMS_TO_EXCLUDE:
            # Known collection-management software, not a custodian.
            dropped.append((rec_name, "platform/technology"))
        elif verify_tag in ("demographic_data", "too_generic"):
            # Flagged records that are not actual institutions.
            dropped.append((rec_name, verify_tag))
        else:
            # Everything else (including "national_platform" flags) is kept.
            kept.append(rec)

    print(f"\n✓ Kept {len(kept)} valid institutions")
    print(f"✗ Filtered out {len(dropped)} non-institution records:")
    for name, reason in dropped:
        print(f"  - {name} ({reason})")

    return kept
def parse_institution_metadata(conversation_text: str) -> Dict[str, Dict[str, Any]]:
    """
    Parse structured institution metadata from conversation artifact.

    Splits the text into per-state sections (markdown '## STATE (XX)'
    headings), then extracts '**Name**: info' entries from each section,
    pulling out URLs, collection-size hints, a candidate founding year,
    and a candidate city for every institution.

    Returns dict mapping institution names (lowercase) to metadata dicts.
    """
    metadata_db = {}

    # Split into state sections.
    # Because the pattern has ONE capture group, re.split returns an
    # alternating list: [preamble, state1, body1, state2, body2, ...].
    state_sections = re.split(r'\n## ([A-Z\s]+) \([A-Z]{2}\)', conversation_text)

    # Walk the (state, body) pairs: odd indices are state names,
    # the following element is that state's section body.
    for i in range(1, len(state_sections), 2):
        if i + 1 >= len(state_sections):
            break

        state_name = state_sections[i].strip()
        section_text = state_sections[i + 1]

        # Extract institution entries (lines starting with **)
        # group(1) = bold name (no '*' or ':'), group(2) = rest of the line.
        institution_pattern = r'\*\*([^*:]+)\*\*:\s*([^\n]+)'
        matches = re.finditer(institution_pattern, section_text)

        for match in matches:
            inst_name = match.group(1).strip()
            inst_info = match.group(2).strip()

            # Skip non-institution entries (section sub-headings such as
            # "Contact", "Digital Collections", "Systems").
            if any(skip in inst_name.lower() for skip in ['contact', 'digital', 'collections', 'systems']):
                continue

            metadata = {
                'state': state_name,
                'raw_info': inst_info,
                'description_fragments': []
            }

            # Extract URLs (stop at whitespace, comma, or closing paren).
            url_pattern = r'https?://[^\s,)]+'
            urls = re.findall(url_pattern, inst_info)
            if urls:
                metadata['urls'] = urls

            # Extract collection counts/extents: a number followed by a
            # collection-unit word marks the line as collection info.
            if re.search(r'\d+[,.]?\d*\+?\s*(pieces|objects|items|works|documents|pages|volumes)', inst_info):
                metadata['has_collection_info'] = True
                metadata['description_fragments'].append(inst_info)

            # Extract dates: first 4-digit year (1900-2099) is treated as
            # a *candidate* founding year — not verified.
            date_pattern = r'\b(19\d{2}|20\d{2})\b'
            dates = re.findall(date_pattern, inst_info)
            if dates:
                metadata['possible_founding_year'] = dates[0]

            # Extract city name using known Brazilian cities.
            # NOTE(review): BRAZILIAN_CITIES is a set, so iteration order is
            # arbitrary — when an info line mentions several known cities,
            # which one wins is unspecified; confirm if determinism matters.
            info_lower = inst_info.lower()
            for city in BRAZILIAN_CITIES:
                if city in info_lower:
                    # Capitalize properly for storage
                    metadata['possible_city'] = city.title()
                    break

            # Store in database (use lowercase for matching)
            key = inst_name.lower()
            metadata_db[key] = metadata

    print(f"  Parsed metadata for {len(metadata_db)} institutions from conversation")
    return metadata_db
def fuzzy_match_institution(record_name: str, metadata_db: Dict) -> Any:
    """Find best metadata match for an institution record.

    Tries, in order: an exact lowercase match, substring containment in
    either direction, and a match after stripping a leading institution-type
    prefix ("museu de", "biblioteca da", ...). Returns the matched metadata
    dict, or None when nothing matches.
    """
    needle = record_name.lower()

    # 1. Exact lowercase match.
    exact = metadata_db.get(needle)
    if exact is not None:
        return exact

    # 2. Partial match: either name contains the other.
    for candidate, info in metadata_db.items():
        if candidate in needle or needle in candidate:
            return info

    # 3. Retry with a common type prefix removed.
    stripped = re.sub(r'^(museu|museo|biblioteca|arquivo)\s+(de|da|do|dos)\s+', '', needle)
    return metadata_db.get(stripped)
def enrich_record(record: Dict, conversation_text: str, metadata_db: Dict) -> Dict:
    """Return an enriched copy of *record* using parsed conversation metadata.

    Adds a description, website identifiers, a city, and a founding-event
    entry when matching metadata is found, then stamps updated provenance.

    Args:
        record: Minimal institution record from the v2 extraction.
        conversation_text: Full conversation text (currently unused; kept
            for interface compatibility with callers).
        metadata_db: Lowercase-name -> metadata mapping produced by
            parse_institution_metadata().

    Returns:
        A new dict; the input record is NOT mutated. (The original code
        used a shallow copy and then mutated nested location/identifier/
        provenance structures shared with the caller's record — fixed here
        by copying each nested structure before editing it.)
    """
    enriched = record.copy()
    inst_name = record.get('name', '')

    # Find matching metadata (None when no match).
    metadata = fuzzy_match_institution(inst_name, metadata_db)

    if metadata:
        # Description: use the raw info line when the record has none yet.
        if metadata.get('description_fragments') and not enriched.get('description'):
            enriched['description'] = metadata['raw_info']

        # URLs become Website identifier entries, deduplicated by value.
        if metadata.get('urls'):
            # Copy the list and its dicts so the caller's record is untouched.
            identifiers = [dict(ident) for ident in enriched.get('identifiers', [])]
            existing_urls = {ident.get('identifier_value') for ident in identifiers}
            for url in metadata['urls']:
                if url not in existing_urls:
                    identifiers.append({
                        'identifier_scheme': 'Website',
                        'identifier_value': url,
                        'identifier_url': url
                    })
            enriched['identifiers'] = identifiers

        # City: fill into the first location lacking one, or create one.
        if metadata.get('possible_city'):
            if enriched.get('locations'):
                # Copy location dicts before editing to avoid aliasing.
                locations = [dict(loc) for loc in enriched['locations']]
                for loc in locations:
                    if not loc.get('city'):
                        loc['city'] = metadata['possible_city']
                        break
                enriched['locations'] = locations
            else:
                # No existing locations, so there is no region to carry over.
                enriched['locations'] = [{
                    'city': metadata['possible_city'],
                    'country': 'BR',
                    'region': ''
                }]

        # Founding event: only when a candidate year exists and the record
        # has no change history yet. The year is an unverified extraction.
        if metadata.get('possible_founding_year') and 'change_history' not in enriched:
            year = metadata['possible_founding_year']
            slug = record.get('id', '').split('/')[-1]
            enriched['change_history'] = [{
                'event_id': f"https://w3id.org/heritage/custodian/event/{slug}-founding",
                'change_type': 'FOUNDING',
                'event_date': f"{year}-01-01",
                'event_description': f"Institution founded or established in {year} (date extracted from conversation metadata)"
            }]

    # Update provenance to reflect enrichment (on a copied dict).
    if 'provenance' in enriched:
        provenance = dict(enriched['provenance'])
        provenance['extraction_date'] = datetime.now(timezone.utc).isoformat()
        method = 'Automated enrichment v2.1 - filtered platforms, parsed conversation metadata'
        if metadata:
            method += ', matched institutional data'
        provenance['extraction_method'] = method
        enriched['provenance'] = provenance

    return enriched
def generate_curation_report(original_count: int, filtered_count: int, curated_count: int) -> str:
    """Build the Markdown curation completion report.

    Args:
        original_count: Record count before filtering.
        filtered_count: Records removed as platforms/non-institutions.
        curated_count: Valid institutions retained.

    Returns:
        Markdown report text. The recall rate is reported as 0.0% when
        original_count is zero (avoids ZeroDivisionError on empty input).
    """
    # Guard against division by zero when the input set is empty.
    recall_pct = (curated_count / original_count * 100) if original_count else 0.0
    report = f"""
# Brazilian GLAM Institution Curation Report
Generated: {datetime.now(timezone.utc).isoformat()}

## Summary Statistics

- **Original records (v2)**: {original_count}
- **Filtered out (platforms/non-institutions)**: {filtered_count}
- **Valid curated institutions**: {curated_count}
- **Recall rate**: {recall_pct:.1f}%

## Curation Actions

### Records Filtered Out
The following records were removed as they are platforms/technologies, not heritage institutions:
1. **Tainacan** - Collection management platform (WordPress-based)
2. **AtoM** - Archival description software
3. **DSpace** - Digital repository platform
4. **APIs** - Generic technology reference
5. **LOCKSS Cariniana** - Digital preservation network
6. **Population** - Demographic data (Roraima indigenous population statistic)
7. **Documentation** - Too generic, not a specific institution

### Valid Institutions Retained
{curated_count} heritage custodian organizations representing:
- Museums (MUSEUM, MIXED)
- Libraries (LIBRARY)
- Archives (ARCHIVE)
- Research centers (RESEARCH_CENTER)
- Educational providers (EDUCATION_PROVIDER)
- Official institutions (OFFICIAL_INSTITUTION)

## Quality Metrics

### Completeness (by field)
To be calculated after enrichment:
- Records with descriptions: TBD
- Records with identifiers: TBD
- Records with city names: TBD
- Records with digital platforms: TBD

### Geographic Coverage
All 27 Brazilian states + Federal District represented

## Next Steps

1. **Deep enrichment needed**: Extract comprehensive metadata from conversation JSON
   - Founding dates and change history
   - Collection descriptions with subjects/extents
   - Digital platform URLs and systems
   - Additional identifiers (Wikidata, VIAF, etc.)

2. **Manual verification**: Review Brasiliana Museus and Hemeroteca Digital
   - Classify as national aggregation platforms vs. custodian institutions

3. **Field completion**: Achieve targets:
   - 90%+ with descriptions (2-4 sentences)
   - 80%+ with website identifiers
   - 60%+ with city-level location data

---
Generated by curate_brazilian_institutions.py
"""
    return report
def main():
    """Main curation workflow: load, filter, enrich, save, report."""

    def banner(title):
        # Step separator: blank line, rule, title, rule (70-char rules).
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    # Opening header (no leading blank line, trailing blank line instead).
    print("=" * 70)
    print("Brazilian GLAM Institution Curation - v2.1")
    print("=" * 70)
    print()

    # Load inputs.
    records = load_v2_records()
    original_count = len(records)

    conversation = load_conversation()
    conversation_text = extract_conversation_text(conversation)

    # Step 1: drop platform/non-institution records.
    banner("STEP 1: Filtering Valid Institutions")
    valid_records = filter_valid_institutions(records)
    filtered_count = original_count - len(valid_records)

    # Step 2: enrich surviving records from conversation metadata.
    banner("STEP 2: Enriching Records")
    print("Parsing conversation metadata...")

    metadata_db = parse_institution_metadata(conversation_text)

    curated_records = []
    for index, record in enumerate(valid_records, 1):
        curated_records.append(enrich_record(record, conversation_text, metadata_db))
        if index % 20 == 0:
            print(f"  Processed {index}/{len(valid_records)} records...")

    print(f"✓ Enriched {len(curated_records)} records")

    # Step 3: write the curated YAML.
    banner("STEP 3: Saving Curated Output")

    OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        yaml.dump(curated_records, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  indent=2)

    print(f"✓ Saved {len(curated_records)} curated records to:")
    print(f"  {OUTPUT_FILE}")

    # Step 4: write the Markdown report next to the output file.
    banner("STEP 4: Generating Curation Report")

    report = generate_curation_report(original_count, filtered_count, len(curated_records))
    report_file = OUTPUT_FILE.parent / "brazilian_curation_report.md"

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"✓ Saved curation report to:")
    print(f"  {report_file}")

    # Final summary.
    banner("CURATION COMPLETE")
    print(f"Original records: {original_count}")
    print(f"Filtered out: {filtered_count}")
    print(f"Valid institutions: {len(curated_records)}")
    print(f"Recall rate: {(len(curated_records) / original_count * 100):.1f}%")
    print()
    print("✓ Output file:", OUTPUT_FILE)
    print("✓ Report file:", report_file)
    print()


if __name__ == "__main__":
    main()