glam/process_chilean_institutions.py
2025-11-19 23:25:22 +01:00

357 lines
14 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Process Chilean GLAM institutions from conversation file.
Extracts institutions from comprehensive provincial directory format.
"""
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List
from collections import defaultdict
# Reuse non-institution filter from Mexican script.
# Lowercase phrases that appear as bold text in the source reports but name
# metadata standards, file formats, infrastructure, or generic vocabulary —
# never actual institutions.  (The original literal listed "digital objects"
# twice; the duplicate entry is removed here.)
NON_INSTITUTION_TERMS = {
    "dublin core", "mods", "mets", "vra core", "object id", "marc21", "marc 21",
    "handle system", "orcid", "doi", "master formats", "documents", "audio/video",
    "storage", "climate control", "digital objects", "photography", "tiff", "jpeg",
    "pdf/a", "api", "oai-pmh", "rest", "iiif", "current status", "museums", "archives",
    "libraries", "archaeological sites", "access", "platform",
    "network", "system", "standard", "format", "preservation", "digitization", "metadata"
}
def is_real_institution(name: str) -> bool:
    """Return True when *name* plausibly denotes a real institution.

    A candidate is rejected when it is shorter than five characters, is a
    known non-institution term, or consists entirely of such terms.
    """
    cleaned = name.lower().strip()
    if len(cleaned) < 5:
        return False
    if cleaned in NON_INSTITUTION_TERMS:
        return False
    # Reject names whose every word is filtered vocabulary (subset test).
    return not set(cleaned.split()) <= NON_INSTITUTION_TERMS
def extract_institutions_from_report(report_text: str) -> List[Dict]:
    """Extract Chilean institutions from the provincial directory report.

    The Chilean report format uses inline bold text within paragraphs:
    Example: "**Copiapó Province** anticipates... the new **Museo Regional de Atacama**..."

    Bold "** ... Province**" headers update the location context attached to
    subsequently extracted institutions.

    Returns:
        A list of dicts with keys: name, urls, emails, province,
        institution_type.
    """
    # Chilean-specific filter terms (in addition to NON_INSTITUTION_TERMS)
    chilean_skip_terms = {
        'province', 'region', 'unesco', 'serpat', 'dibam', 'national monuments',
        'cultural heritage', 'digital preservation', 'atacama province',
        'copiapó province', 'huasco province', 'chañaral province',
        'antofagasta region', 'metropolitan region', 'maule region',
        'valparaíso region', 'biobío region', 'los lagos region',
        'araucanía region', 'tarapacá region', 'arica and parinacota region',
        'aysén region', 'magallanes region', 'coquimbo region',
        'atacama region', "o'higgins region", 'los ríos region', 'ñuble region'
    }
    # Institution keywords in Spanish/English; a real institution name must
    # contain at least one of these.
    institution_keywords = [
        'museo', 'museum', 'biblioteca', 'library', 'archivo', 'archiv',
        'universidad', 'university', 'instituto', 'institute', 'fundación',
        'foundation', 'centro', 'center', 'galería', 'gallery', 'servicio',
        'consejo', 'dirección', 'departamento', 'academia', 'sociedad'
    ]
    # Find all bold text instances: **something**
    bold_pattern = re.compile(r'\*\*([^*]+?)\*\*')
    institutions: List[Dict] = []
    current_province = None  # most recent "**X Province**" header seen
    for match in bold_pattern.findall(report_text):
        candidate = match.strip()
        candidate_lower = candidate.lower()
        # Skip empty or very short
        if len(candidate) < 5:
            continue
        # Skip section headers (Province, Region); province headers also
        # update the location context for subsequent institutions.
        if 'province' in candidate_lower or 'region' in candidate_lower:
            if 'province' in candidate_lower:
                current_province = candidate.replace(' Province', '').strip()
            continue
        # Skip generic Chilean terms
        if candidate_lower in chilean_skip_terms:
            continue
        # Skip metadata standards and technical terms
        if not is_real_institution(candidate):
            continue
        # Must contain at least one institution keyword
        if not any(keyword in candidate_lower for keyword in institution_keywords):
            continue
        # Skip if it's just a keyword by itself
        if candidate_lower in institution_keywords:
            continue
        urls, emails = _extract_contacts(report_text, candidate)
        institutions.append({
            'name': candidate,
            'urls': urls,
            'emails': emails,
            'province': current_province,
            'institution_type': _classify_institution_type(candidate_lower),
        })
    return institutions


def _classify_institution_type(candidate_lower: str) -> str:
    """Map keywords in a lowercased name to an institution type (default MIXED)."""
    if 'museo' in candidate_lower or 'museum' in candidate_lower:
        return 'MUSEUM'
    if 'archivo' in candidate_lower or 'archiv' in candidate_lower:
        return 'ARCHIVE'
    if 'biblioteca' in candidate_lower or 'library' in candidate_lower:
        return 'LIBRARY'
    if 'universidad' in candidate_lower or 'university' in candidate_lower:
        return 'EDUCATION_PROVIDER'
    if 'servicio nacional' in candidate_lower or 'consejo' in candidate_lower or 'dirección' in candidate_lower:
        return 'OFFICIAL_INSTITUTION'
    if 'fundación' in candidate_lower or 'foundation' in candidate_lower:
        return 'RESEARCH_CENTER'
    return 'MIXED'


def _extract_contacts(report_text: str, candidate: str) -> tuple:
    """Collect (urls, emails) from the 500 characters after the bold mention.

    Returns empty lists when the bold form of *candidate* is not found.
    """
    urls: List[str] = []
    emails: List[str] = []
    context_start = report_text.find(f'**{candidate}**')
    if context_start == -1:
        return urls, emails
    # Get 500 characters after the mention
    context = report_text[context_start:context_start + 500]
    # Extract URLs, trimming trailing punctuation the regex may capture.
    for url in re.findall(r'https?://[^\s\)\],]+', context):
        url = url.rstrip(',.;:')
        if url not in urls:
            urls.append(url)
    # Extract emails.  NOTE: the original TLD character class was
    # [A-Z|a-z], which wrongly admits a literal '|'; fixed to [A-Za-z].
    for email in re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', context):
        if email not in emails:
            emails.append(email)
    return urls, emails
def normalize_name(name: str) -> str:
    """Normalize an institution name for deduplication.

    Drops parenthesized segments, lowercases, trims, and collapses internal
    whitespace runs to single spaces.
    """
    without_parens = re.sub(r'\([^)]*\)', '', name)
    return re.sub(r'\s+', ' ', without_parens.lower().strip())
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """Deduplicate by normalized name, merging metadata.

    URLs and emails of duplicate records are merged in first-seen order.
    The original used ``list(set(...))``, whose ordering varies between
    runs due to string-hash randomization; ``dict.fromkeys`` keeps the
    merge deterministic.  A MIXED institution_type is upgraded to the
    first more specific type encountered.
    """
    name_map: Dict[str, Dict] = {}
    for inst in institutions:
        norm_name = normalize_name(inst['name'])
        existing = name_map.get(norm_name)
        if existing is None:
            name_map[norm_name] = inst
            continue
        # Merge URLs and emails, dropping duplicates, preserving order.
        existing['urls'] = list(dict.fromkeys(existing.get('urls', []) + inst.get('urls', [])))
        existing['emails'] = list(dict.fromkeys(existing.get('emails', []) + inst.get('emails', [])))
        # Prefer more specific type over the MIXED placeholder.
        if existing.get('institution_type') == 'MIXED' and inst.get('institution_type') != 'MIXED':
            existing['institution_type'] = inst['institution_type']
    return list(name_map.values())
def generate_ghcid(country_code: str, inst_type: str, name: str, index: int) -> str:
    """Build a GHCID identifier URI for one institution.

    Combines the lowercased country code, a one-letter type code (U for
    unknown types), a name slug capped at 30 characters, and a zero-padded
    four-digit index.
    """
    code_by_type = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'OFFICIAL_INSTITUTION': 'O', 'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E', 'MIXED': 'M', 'UNKNOWN': 'U',
    }
    letter = code_by_type.get(inst_type, 'U').lower()
    # Slug: non-alphanumeric runs become single hyphens, outer hyphens trimmed.
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')[:30]
    return (
        "https://w3id.org/heritage/custodian/"
        f"{country_code.lower()}/{letter}-{slug}-{index:04d}"
    )
def convert_to_linkml_yaml(institutions: List[Dict], conversation_id: str, source_file_path: str) -> str:
    """Convert institutions to a hand-built LinkML YAML document.

    Args:
        institutions: deduplicated institution dicts.
        conversation_id: UUID of the source conversation (recorded in provenance).
        source_file_path: path of the conversation file (recorded in provenance).

    Returns:
        The YAML document as a single string.
    """
    # One timestamp per run: the extraction happened once, and hoisting the
    # datetime.now() call out of the loop keeps it loop-invariant.
    extraction_date = datetime.now(timezone.utc).isoformat()
    yaml_lines = [
        "---",
        "# Chilean GLAM Institutions",
        f"# Extracted from: {source_file_path}",
        f"# Conversation ID: {conversation_id}",
        ""
    ]
    for i, inst in enumerate(institutions, 1):
        ghcid = generate_ghcid('CL', inst.get('institution_type', 'MIXED'), inst['name'], i)
        name = inst['name']
        # YAML quoting: names containing double quotes are emitted
        # single-quoted with embedded single quotes doubled (YAML rule);
        # all other names are double-quoted.  NOTE(review): a backslash in
        # a double-quoted name would not be escaped here — confirm none
        # occur if the source data changes.
        if '"' in name:
            name_escaped = name.replace("'", "''")
            name_field = f" name: '{name_escaped}'"
        else:
            name_field = f" name: \"{name}\""
        yaml_lines.append(f"- id: {ghcid}")
        yaml_lines.append(name_field)
        yaml_lines.append(f" institution_type: {inst.get('institution_type', 'MIXED')}")
        # Locations (only when a province context was captured).
        if inst.get('province'):
            yaml_lines.append(" locations:")
            yaml_lines.append(f" - region: {inst['province']}")
            yaml_lines.append(" country: CL")
        # Identifiers: one entry per URL (Website) and per email (Email).
        if inst.get('urls') or inst.get('emails'):
            yaml_lines.append(" identifiers:")
            for url in inst.get('urls', []):
                yaml_lines.append(" - identifier_scheme: Website")
                yaml_lines.append(f" identifier_value: {url}")
                yaml_lines.append(f" identifier_url: {url}")
            for email in inst.get('emails', []):
                yaml_lines.append(" - identifier_scheme: Email")
                yaml_lines.append(f" identifier_value: {email}")
        # Provenance with source reference
        yaml_lines.append(" provenance:")
        yaml_lines.append(" data_source: CONVERSATION_NLP")
        yaml_lines.append(" data_tier: TIER_4_INFERRED")
        yaml_lines.append(f" extraction_date: \"{extraction_date}\"")
        yaml_lines.append(" extraction_method: \"Inline bold text extraction from provincial directory\"")
        yaml_lines.append(" confidence_score: 0.85")
        yaml_lines.append(f" conversation_id: \"{conversation_id}\"")
        yaml_lines.append(f" source_url: \"file://{source_file_path}\"")
        yaml_lines.append("")
    return '\n'.join(yaml_lines)
def main():
    """Run the Chilean GLAM extraction pipeline end to end.

    Loads the exported conversation JSON, locates the longest artifact
    (assumed to be the comprehensive provincial directory), extracts and
    deduplicates institutions, prints summary statistics, and writes the
    LinkML YAML output plus a JSON sidecar with statistics.
    """
    # Load conversation file (hard-coded path to the exported conversation).
    file_path = "/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-43-14-edc75d66-ee42-4199-8e22-65b0d2347922-Chilean_GLAM_inventories_research.json"
    print("Loading Chilean GLAM conversation...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    conversation_id = data['uuid']
    # Find the comprehensive provincial directory (usually the longest artifact)
    report_text = None
    max_length = 0
    for msg in data.get('chat_messages', []):
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                artifact = content.get('input', {}).get('content', '')
                if len(artifact) > max_length:
                    max_length = len(artifact)
                    report_text = artifact
    if not report_text:
        print("ERROR: No artifact found in conversation!")
        return
    print(f"Found report with {len(report_text)} characters")
    # Extract institutions
    print("\nExtracting institutions from provincial directory...")
    institutions = extract_institutions_from_report(report_text)
    print(f"Extracted: {len(institutions)} institutions")
    # Deduplicate
    print("\nDeduplicating by normalized name...")
    final_institutions = deduplicate_institutions(institutions)
    print(f"Final: {len(final_institutions)} unique institutions")
    # Guard: the percentage statistics below divide by the institution
    # count, which would raise ZeroDivisionError on an empty result.
    if not final_institutions:
        print("ERROR: No institutions extracted from report!")
        return
    # Statistics
    print("\n" + "="*60)
    print("FINAL STATISTICS")
    print("="*60)
    type_counts = defaultdict(int)
    province_counts = defaultdict(int)
    with_urls = 0
    with_emails = 0
    for inst in final_institutions:
        type_counts[inst.get('institution_type', 'MIXED')] += 1
        if inst.get('province'):
            province_counts[inst['province']] += 1
        if inst.get('urls'):
            with_urls += 1
        if inst.get('emails'):
            with_emails += 1
    total = len(final_institutions)
    print(f"\nTotal institutions: {total}")
    print("\nBy type:")
    for itype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {itype}: {count}")
    print("\nTop 15 provinces by institution count:")
    for province, count in sorted(province_counts.items(), key=lambda x: -x[1])[:15]:
        print(f" {province}: {count}")
    print("\nIdentifiers:")
    print(f" Institutions with URLs: {with_urls} ({with_urls/total*100:.1f}%)")
    print(f" Institutions with emails: {with_emails} ({with_emails/total*100:.1f}%)")
    # Convert to LinkML YAML
    print("\nConverting to LinkML YAML format...")
    yaml_output = convert_to_linkml_yaml(final_institutions, conversation_id, file_path)
    # Write YAML output, creating the instances directory if needed.
    output_path = Path('/Users/kempersc/apps/glam/data/instances/chilean_institutions.yaml')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(yaml_output)
    print(f"\nOutput written to: {output_path}")
    # Save a JSON sidecar with the raw records and run statistics.
    json_output = {
        'institutions': final_institutions,
        'statistics': {
            'total': total,
            'by_type': dict(type_counts),
            'by_province': dict(province_counts),
            'with_urls': with_urls,
            'with_emails': with_emails
        }
    }
    json_path = '/tmp/chilean_institutions_final.json'
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False)
    print(f"JSON version saved to: {json_path}")
if __name__ == '__main__':
main()