357 lines
14 KiB
Python
Executable file
357 lines
14 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Process Chilean GLAM institutions from conversation file.
|
|
Extracts institutions from comprehensive provincial directory format.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List
|
|
from collections import defaultdict
|
|
|
|
# Reuse non-institution filter from Mexican script.
# Generic metadata standards, file formats, and technical vocabulary that the
# bold-text extractor can pick up but that never name a real institution.
# (The original literal listed "digital objects" twice; the duplicate is removed.)
NON_INSTITUTION_TERMS = {
    "dublin core", "mods", "mets", "vra core", "object id", "marc21", "marc 21",
    "handle system", "orcid", "doi", "master formats", "documents", "audio/video",
    "storage", "climate control", "digital objects", "photography", "tiff", "jpeg",
    "pdf/a", "api", "oai-pmh", "rest", "iiif", "current status", "museums", "archives",
    "libraries", "archaeological sites", "access", "platform",
    "network", "system", "standard", "format", "preservation", "digitization", "metadata"
}
|
|
|
|
def is_real_institution(name: str) -> bool:
    """Heuristically decide whether *name* refers to an actual institution.

    Rejects names that exactly match a known non-institution term, names
    composed entirely of such terms, and names too short to be meaningful.

    Args:
        name: Candidate institution name extracted from the report.

    Returns:
        True when the name plausibly denotes a real institution.
    """
    normalized = name.lower().strip()

    # Exact match against the filtered vocabulary.
    if normalized in NON_INSTITUTION_TERMS:
        return False

    # Every individual word is a filtered term -> not an institution name.
    if set(normalized.split()) <= NON_INSTITUTION_TERMS:
        return False

    # Anything shorter than 5 characters is too short to be plausible.
    return len(normalized) >= 5
|
|
|
|
def _classify_institution(candidate_lower: str) -> str:
    """Map a lowercased candidate name to its GLAM institution type code.

    Checks are ordered so that, e.g., "Museo" wins over other keywords,
    matching the original classification precedence.
    """
    if 'museo' in candidate_lower or 'museum' in candidate_lower:
        return 'MUSEUM'
    if 'archivo' in candidate_lower or 'archiv' in candidate_lower:
        return 'ARCHIVE'
    if 'biblioteca' in candidate_lower or 'library' in candidate_lower:
        return 'LIBRARY'
    if 'universidad' in candidate_lower or 'university' in candidate_lower:
        return 'EDUCATION_PROVIDER'
    if 'servicio nacional' in candidate_lower or 'consejo' in candidate_lower or 'dirección' in candidate_lower:
        return 'OFFICIAL_INSTITUTION'
    if 'fundación' in candidate_lower or 'foundation' in candidate_lower:
        return 'RESEARCH_CENTER'
    return 'MIXED'


def extract_institutions_from_report(report_text: str) -> List[Dict]:
    """Extract Chilean institutions from the provincial directory report.

    Chilean format uses inline bold text within paragraphs:
    Example: "**Copiapó Province** anticipates... the new **Museo Regional de Atacama**..."

    Args:
        report_text: Full markdown text of the provincial directory artifact.

    Returns:
        A list of institution dicts with keys: ``name``, ``urls``, ``emails``,
        ``province`` (may be None) and ``institution_type``.
    """
    institutions = []

    # Chilean-specific filter terms (in addition to NON_INSTITUTION_TERMS)
    chilean_skip_terms = {
        'province', 'region', 'unesco', 'serpat', 'dibam', 'national monuments',
        'cultural heritage', 'digital preservation', 'atacama province',
        'copiapó province', 'huasco province', 'chañaral province',
        'antofagasta region', 'metropolitan region', 'maule region',
        'valparaíso region', 'biobío region', 'los lagos region',
        'araucanía region', 'tarapacá region', 'arica and parinacota region',
        'aysén region', 'magallanes region', 'coquimbo region',
        'atacama region', "o'higgins region", 'los ríos region', 'ñuble region'
    }

    # Institution keywords in Spanish (plus English equivalents)
    institution_keywords = [
        'museo', 'museum', 'biblioteca', 'library', 'archivo', 'archiv',
        'universidad', 'university', 'instituto', 'institute', 'fundación',
        'foundation', 'centro', 'center', 'galería', 'gallery', 'servicio',
        'consejo', 'dirección', 'departamento', 'academia', 'sociedad'
    ]

    # Patterns compiled once, outside the per-candidate loop.
    bold_pattern = re.compile(r'\*\*([^*]+?)\*\*')
    url_pattern = re.compile(r'https?://[^\s\)\],]+')
    # NOTE: the original email class was [A-Z|a-z], which wrongly admitted a
    # literal '|' character in the TLD; fixed to [A-Za-z].
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Track the province context. findall() preserves document order, so a
    # province header applies to the institution mentions that follow it.
    current_province = None

    for match in bold_pattern.findall(report_text):
        candidate = match.strip()
        candidate_lower = candidate.lower()

        # Skip empty or very short candidates.
        if len(candidate) < 5:
            continue

        # Section headers (Province, Region) only update location context.
        if 'province' in candidate_lower or 'region' in candidate_lower:
            if 'province' in candidate_lower:
                current_province = candidate.replace(' Province', '').strip()
            continue

        # Skip generic Chilean terms.
        if candidate_lower in chilean_skip_terms:
            continue

        # Skip metadata standards and technical terms.
        if not is_real_institution(candidate):
            continue

        # Must contain at least one institution keyword...
        if not any(keyword in candidate_lower for keyword in institution_keywords):
            continue
        # ...but not be a bare keyword by itself.
        if candidate_lower in institution_keywords:
            continue

        inst_record = {
            'name': candidate,
            'urls': [],
            'emails': [],
            'province': current_province,
            'institution_type': _classify_institution(candidate_lower),
        }

        # Look for URLs/emails in the 500 characters following the first
        # bold mention of this institution in the full text.
        context_start = report_text.find(f'**{candidate}**')
        if context_start != -1:
            context = report_text[context_start:context_start + 500]

            for url in url_pattern.findall(context):
                url = url.rstrip(',.;:')
                if url not in inst_record['urls']:
                    inst_record['urls'].append(url)

            for email in email_pattern.findall(context):
                if email not in inst_record['emails']:
                    inst_record['emails'].append(email)

        institutions.append(inst_record)

    return institutions
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for deduplication.

    Removes parenthesized qualifiers, lowercases, trims, and collapses
    internal whitespace runs to single spaces.
    """
    without_parens = re.sub(r'\([^)]*\)', '', name)
    lowered = without_parens.lower().strip()
    return re.sub(r'\s+', ' ', lowered)
|
|
|
|
def deduplicate_institutions(institutions: List[Dict]) -> List[Dict]:
    """Deduplicate by normalized name, merging metadata.

    URLs and emails are merged while preserving first-seen order: the
    original ``list(set(...))`` merge produced nondeterministic ordering,
    which made the generated YAML unstable across runs.

    Args:
        institutions: Raw extracted institution dicts.

    Returns:
        Unique institution dicts, one per normalized name.
    """
    name_map: Dict[str, Dict] = {}

    for inst in institutions:
        norm_name = normalize_name(inst['name'])

        if norm_name not in name_map:
            name_map[norm_name] = inst
            continue

        existing = name_map[norm_name]
        # Order-preserving, duplicate-free merge of contact metadata.
        existing['urls'] = list(dict.fromkeys(existing.get('urls', []) + inst.get('urls', [])))
        existing['emails'] = list(dict.fromkeys(existing.get('emails', []) + inst.get('emails', [])))

        # Prefer a specific type over the MIXED placeholder.
        if existing.get('institution_type') == 'MIXED' and inst.get('institution_type') != 'MIXED':
            existing['institution_type'] = inst['institution_type']

    return list(name_map.values())
|
|
|
|
def generate_ghcid(country_code: str, inst_type: str, name: str, index: int) -> str:
    """Generate a GHCID identifier URI for an institution.

    The identifier combines a one-letter type code, a slugified name
    (truncated to 30 characters) and a zero-padded sequence number under
    the w3id heritage-custodian namespace.
    """
    # Single-letter codes per institution type; unknown types map to 'U'.
    type_codes = {
        'MUSEUM': 'M', 'ARCHIVE': 'A', 'LIBRARY': 'L', 'GALLERY': 'G',
        'OFFICIAL_INSTITUTION': 'O', 'RESEARCH_CENTER': 'R',
        'EDUCATION_PROVIDER': 'E', 'MIXED': 'M', 'UNKNOWN': 'U'
    }

    code = type_codes.get(inst_type, 'U').lower()
    country = country_code.lower()
    # Slug: runs of non-alphanumerics become single dashes, capped at 30 chars.
    slug = re.sub(r'[^a-z0-9]+', '-', name.lower()).strip('-')[:30]

    return f"https://w3id.org/heritage/custodian/{country}/{code}-{slug}-{index:04d}"
|
|
|
|
def convert_to_linkml_yaml(institutions: List[Dict], conversation_id: str, source_file_path: str) -> str:
    """Convert institution records to a LinkML YAML document.

    Args:
        institutions: Deduplicated institution dicts.
        conversation_id: UUID of the source conversation, recorded in provenance.
        source_file_path: Path of the conversation file, recorded in provenance.

    Returns:
        The complete YAML document as a single string.
    """
    yaml_lines = [
        "---",
        "# Chilean GLAM Institutions",
        f"# Extracted from: {source_file_path}",
        f"# Conversation ID: {conversation_id}",
        ""
    ]

    # One timestamp for the whole export so every record shares identical
    # provenance (previously recomputed per record inside the loop, giving
    # records in the same run slightly different timestamps).
    extraction_date = datetime.now(timezone.utc).isoformat()

    for i, inst in enumerate(institutions, 1):
        inst_type = inst.get('institution_type', 'MIXED')
        ghcid = generate_ghcid('CL', inst_type, inst['name'], i)

        yaml_lines.append(f"- id: {ghcid}")
        # json.dumps emits a valid YAML double-quoted scalar, safely escaping
        # quotes, backslashes and control characters in names (the previous
        # hand-rolled quoting broke on backslashes).
        yaml_lines.append(f"  name: {json.dumps(inst['name'], ensure_ascii=False)}")
        yaml_lines.append(f"  institution_type: {inst_type}")

        # Locations
        if inst.get('province'):
            yaml_lines.append("  locations:")
            yaml_lines.append(f"    - region: {inst['province']}")
            yaml_lines.append("      country: CL")

        # Identifiers (website URLs and contact emails)
        if inst.get('urls') or inst.get('emails'):
            yaml_lines.append("  identifiers:")

            for url in inst.get('urls', []):
                yaml_lines.append("    - identifier_scheme: Website")
                yaml_lines.append(f"      identifier_value: {url}")
                yaml_lines.append(f"      identifier_url: {url}")

            for email in inst.get('emails', []):
                yaml_lines.append("    - identifier_scheme: Email")
                yaml_lines.append(f"      identifier_value: {email}")

        # Provenance with source reference
        yaml_lines.append("  provenance:")
        yaml_lines.append("    data_source: CONVERSATION_NLP")
        yaml_lines.append("    data_tier: TIER_4_INFERRED")
        yaml_lines.append(f"    extraction_date: \"{extraction_date}\"")
        yaml_lines.append("    extraction_method: \"Inline bold text extraction from provincial directory\"")
        yaml_lines.append("    confidence_score: 0.85")
        yaml_lines.append(f"    conversation_id: \"{conversation_id}\"")
        yaml_lines.append(f"    source_url: \"file://{source_file_path}\"")

        yaml_lines.append("")

    return '\n'.join(yaml_lines)
|
|
|
|
def main():
    """Run the full pipeline: load conversation, extract, dedupe, report, write."""
    # Hard-coded conversation export path for this one-off processing script.
    file_path = "/Users/kempersc/Documents/claude/data-2025-11-02-18-13-26-batch-0000/conversations/2025-09-22T14-43-14-edc75d66-ee42-4199-8e22-65b0d2347922-Chilean_GLAM_inventories_research.json"

    print("Loading Chilean GLAM conversation...")
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    conversation_id = data['uuid']

    # Find the comprehensive provincial directory (usually the longest artifact).
    report_text = None
    max_length = 0

    for msg in data.get('chat_messages', []):
        for content in msg.get('content', []):
            if content.get('type') == 'tool_use' and content.get('name') == 'artifacts':
                artifact = content.get('input', {}).get('content', '')
                if len(artifact) > max_length:
                    max_length = len(artifact)
                    report_text = artifact

    if not report_text:
        print("ERROR: No artifact found in conversation!")
        return

    print(f"Found report with {len(report_text)} characters")

    # Extract institutions
    print("\nExtracting institutions from provincial directory...")
    institutions = extract_institutions_from_report(report_text)
    print(f"Extracted: {len(institutions)} institutions")

    # Deduplicate
    print("\nDeduplicating by normalized name...")
    final_institutions = deduplicate_institutions(institutions)
    print(f"Final: {len(final_institutions)} unique institutions")

    # Guard: the percentage statistics below divide by the list length;
    # without this, an empty extraction raised ZeroDivisionError.
    if not final_institutions:
        print("No institutions extracted; nothing to write.")
        return

    # Statistics
    print("\n" + "=" * 60)
    print("FINAL STATISTICS")
    print("=" * 60)

    type_counts = defaultdict(int)
    province_counts = defaultdict(int)
    with_urls = 0
    with_emails = 0

    for inst in final_institutions:
        type_counts[inst.get('institution_type', 'MIXED')] += 1
        if inst.get('province'):
            province_counts[inst['province']] += 1
        if inst.get('urls'):
            with_urls += 1
        if inst.get('emails'):
            with_emails += 1

    total = len(final_institutions)
    print(f"\nTotal institutions: {total}")
    print("\nBy type:")
    for itype, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"  {itype}: {count}")

    print("\nTop 15 provinces by institution count:")
    for province, count in sorted(province_counts.items(), key=lambda x: -x[1])[:15]:
        print(f"  {province}: {count}")

    print("\nIdentifiers:")
    print(f"  Institutions with URLs: {with_urls} ({with_urls / total * 100:.1f}%)")
    print(f"  Institutions with emails: {with_emails} ({with_emails / total * 100:.1f}%)")

    # Convert to LinkML YAML
    print("\nConverting to LinkML YAML format...")
    yaml_output = convert_to_linkml_yaml(final_institutions, conversation_id, file_path)

    # Write YAML output
    output_path = Path('/Users/kempersc/apps/glam/data/instances/chilean_institutions.yaml')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(yaml_output, encoding='utf-8')
    print(f"\nOutput written to: {output_path}")

    # Save a JSON version alongside for inspection/debugging.
    json_output = {
        'institutions': final_institutions,
        'statistics': {
            'total': total,
            'by_type': dict(type_counts),
            'by_province': dict(province_counts),
            'with_urls': with_urls,
            'with_emails': with_emails
        }
    }

    json_path = Path('/tmp/chilean_institutions_final.json')
    json_path.write_text(json.dumps(json_output, indent=2, ensure_ascii=False), encoding='utf-8')
    print(f"JSON version saved to: {json_path}")


if __name__ == '__main__':
    main()
|