- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the extraction patterns (ISIL, Wikidata, VIAF, city names) and verifying correct classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
610 lines
26 KiB
Python
610 lines
26 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Mexican GLAM Institutions Manual Curation Script
|
|
|
|
This script enriches 117 Mexican heritage institutions by extracting comprehensive
|
|
metadata from conversation artifacts to create fully enriched LinkML-compliant records.
|
|
|
|
Input:
|
|
- /Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml
|
|
- /Users/kempersc/apps/glam/data/temp_conv1_artifact2.md (state-by-state directory)
|
|
- /Users/kempersc/apps/glam/data/temp_conv2_artifact1.md (national institutions)
|
|
|
|
Output:
|
|
- /Users/kempersc/apps/glam/data/instances/mexican_institutions_curated.yaml
|
|
"""
|
|
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from typing import Dict, List, Any, Optional
|
|
from pathlib import Path
|
|
|
|
# Paths
|
|
BASE_DIR = Path("/Users/kempersc/apps/glam")
|
|
INPUT_YAML = BASE_DIR / "data/instances/mexican_institutions.yaml"
|
|
ARTIFACT1 = BASE_DIR / "data/temp_conv1_artifact2.md" # State-by-state
|
|
ARTIFACT2 = BASE_DIR / "data/temp_conv2_artifact1.md" # National institutions
|
|
OUTPUT_YAML = BASE_DIR / "data/instances/mexican_institutions_curated.yaml"
|
|
|
|
class InstitutionEnricher:
|
|
"""Enrich Mexican GLAM institutions with comprehensive metadata."""
|
|
|
|
def __init__(self):
|
|
self.institutions = []
|
|
self.enrichment_data = {}
|
|
|
|
def load_existing_institutions(self):
|
|
"""Load the 117 existing institutions from YAML."""
|
|
with open(INPUT_YAML, 'r', encoding='utf-8') as f:
|
|
content = f.read()
|
|
# Skip header comments
|
|
yaml_content = re.sub(r'^#.*\n', '', content, flags=re.MULTILINE)
|
|
self.institutions = yaml.safe_load(yaml_content)
|
|
print(f"✓ Loaded {len(self.institutions)} institutions from {INPUT_YAML.name}")
|
|
|
|
def parse_markdown_artifacts(self):
|
|
"""Parse both markdown artifacts to extract enrichment data."""
|
|
print("\n=== Parsing Markdown Artifacts ===")
|
|
|
|
# Parse state-by-state directory (artifact 1)
|
|
with open(ARTIFACT1, 'r', encoding='utf-8') as f:
|
|
artifact1_content = f.read()
|
|
self._parse_state_directory(artifact1_content)
|
|
|
|
# Parse national institutions (artifact 2)
|
|
with open(ARTIFACT2, 'r', encoding='utf-8') as f:
|
|
artifact2_content = f.read()
|
|
self._parse_national_institutions(artifact2_content)
|
|
|
|
print(f"✓ Extracted enrichment data for {len(self.enrichment_data)} institutions")
|
|
|
|
def _parse_state_directory(self, content: str):
|
|
"""Parse state-by-state institutional directory."""
|
|
# Extract state sections
|
|
state_sections = re.split(r'\n### ([A-Z\s]+)\n', content)
|
|
|
|
for i in range(1, len(state_sections), 2):
|
|
state_name = state_sections[i].strip()
|
|
section_content = state_sections[i+1]
|
|
|
|
# Extract institutions from this state
|
|
self._extract_institutions_from_section(section_content, state_name)
|
|
|
|
def _extract_institutions_from_section(self, content: str, state: str):
|
|
"""Extract institution details from a state section."""
|
|
# Match institution blocks - two patterns:
|
|
# Pattern 1: List items with bold names (- **Name**: details or - **Name** newline details)
|
|
# Pattern 2: Standalone bold headers followed by details
|
|
|
|
# Pattern 1: List item format (- **Name**: details OR - **Name** followed by details)
|
|
list_pattern = r'- \*\*([^*]+)\*\*(?::?\s*([^\n]+))?(?:\n((?: .*\n?)+))?'
|
|
|
|
# Pattern 2: Standalone bold (for section headers - we'll still try to extract)
|
|
standalone_pattern = r'(?:^|\n)\*\*([^*]+)\*\*\s*$'
|
|
|
|
# Try list pattern first (most institutions are in lists)
|
|
for match in re.finditer(list_pattern, content, re.MULTILINE):
|
|
name = match.group(1).strip()
|
|
inline_desc = match.group(2) if match.group(2) else ''
|
|
indented_details = match.group(3) if match.group(3) else ''
|
|
|
|
# Combine all details
|
|
details = (inline_desc + '\n' + indented_details).strip()
|
|
|
|
# Skip section headers
|
|
if name.endswith(':') or name in ['Institutional Custodians', 'Museums', 'Archives',
|
|
'Digital Resources', 'Digital Infrastructure',
|
|
'University Collections', 'INAH Regional Center',
|
|
'Archaeological Documentation', 'State Infrastructure',
|
|
'Key Museums', 'Universities', 'Major Art Museums',
|
|
'Anthropology and History Museums', 'Major Public Universities',
|
|
'Research Centers']:
|
|
continue
|
|
|
|
# Extract structured data
|
|
institution_data = {
|
|
'name': name,
|
|
'state': state,
|
|
'details': details,
|
|
'metadata': self._extract_metadata_from_details(details)
|
|
}
|
|
|
|
# Store by normalized name for matching
|
|
norm_name = self._normalize_name(name)
|
|
if norm_name not in self.enrichment_data:
|
|
self.enrichment_data[norm_name] = institution_data
|
|
|
|
def _parse_national_institutions(self, content: str):
|
|
"""Parse national-level institutions and platforms."""
|
|
# Extract major sections
|
|
sections = re.split(r'\n## (.+)\n', content)
|
|
|
|
for i in range(1, len(sections), 2):
|
|
section_title = sections[i].strip()
|
|
section_content = sections[i+1]
|
|
|
|
# Extract institutions from this section
|
|
self._extract_institutions_from_section(section_content, "NATIONAL")
|
|
|
|
def _extract_metadata_from_details(self, details: str) -> Dict[str, Any]:
|
|
"""Extract structured metadata from institution details block."""
|
|
metadata = {
|
|
'urls': [],
|
|
'emails': [],
|
|
'phones': [],
|
|
'addresses': [],
|
|
'cities': [],
|
|
'descriptions': [],
|
|
'collections': [],
|
|
'platforms': [],
|
|
'metadata_standards': [],
|
|
'identifiers': [],
|
|
'directors': [],
|
|
'hours': []
|
|
}
|
|
|
|
# Extract URLs (multiple patterns)
|
|
url_patterns = [
|
|
r'(?:URL|Website|Portal|Digital Library|Catalogue|Repository|OPAC|GitHub|Main Website|Alternative Access|Digital Repository):\s*(https?://[^\s\)]+)',
|
|
r'\*\*(?:URL|Website)\*\*:\s*(https?://[^\s\)]+)',
|
|
]
|
|
for pattern in url_patterns:
|
|
metadata['urls'].extend(re.findall(pattern, details, re.IGNORECASE))
|
|
|
|
# Extract generic URLs (not already captured)
|
|
generic_urls = re.findall(r'https?://[^\s\)\]<>,]+', details)
|
|
for url in generic_urls:
|
|
if url not in metadata['urls'] and not url.endswith('...'):
|
|
metadata['urls'].append(url)
|
|
|
|
# Extract emails
|
|
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
|
|
metadata['emails'].extend(re.findall(email_pattern, details))
|
|
|
|
# Extract phone numbers (multiple patterns)
|
|
phone_patterns = [
|
|
r'(?:Phone|Tel):\s*([+\d\s\(\)-]+)',
|
|
r'\+52\s*\d+\s*\d+\s*\d+', # Mexican phone format
|
|
r'\d{3}\s*\d{7}', # Local Mexican format
|
|
]
|
|
for pattern in phone_patterns:
|
|
metadata['phones'].extend(re.findall(pattern, details, re.IGNORECASE))
|
|
|
|
# Extract addresses (multiple patterns)
|
|
address_patterns = [
|
|
r'Address:\s*([^\n-]+?)(?:\n|$)',
|
|
r'(?:Calzada|Avenida|Av\.|Calle|C\.|Boulevard)[\s\w\d,.#°-]+(?:\d{5})?',
|
|
]
|
|
for pattern in address_patterns:
|
|
found = re.findall(pattern, details, re.IGNORECASE)
|
|
metadata['addresses'].extend([a.strip() for a in found if len(a.strip()) > 10])
|
|
|
|
# Extract cities (from addresses or standalone)
|
|
city_pattern = r'(?:,\s*)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,|\s+\d{5}|\s*$)'
|
|
metadata['cities'].extend(re.findall(city_pattern, details))
|
|
|
|
# Extract collection information (multiple patterns)
|
|
collection_patterns = [
|
|
r'(?:Collection|Holdings|Content|Scope):\s*([^\n-]+)',
|
|
r'(\d+[,+]\s*(?:objects|works|items|volumes|documents|pages|resources|photographs))',
|
|
r'(Over \d+[,\d]*\s+(?:objects|works|items|volumes|documents))',
|
|
]
|
|
for pattern in collection_patterns:
|
|
metadata['collections'].extend(re.findall(pattern, details, re.IGNORECASE))
|
|
|
|
# Extract directors/contacts
|
|
director_pattern = r'Director:\s*([^\n]+)'
|
|
metadata['directors'].extend(re.findall(director_pattern, details))
|
|
|
|
# Extract hours
|
|
hours_pattern = r'Hours:\s*([^\n]+)'
|
|
metadata['hours'].extend(re.findall(hours_pattern, details))
|
|
|
|
# Extract metadata standards
|
|
standards = ['Dublin Core', 'MARC21', 'OAI-PMH', 'IIIF', 'Schema.org',
|
|
'EAD', 'BIBFRAME', 'Tainacan', 'LIDO', 'RDF', 'JSON-LD']
|
|
for standard in standards:
|
|
if standard.lower() in details.lower():
|
|
metadata['metadata_standards'].append(standard)
|
|
|
|
# Extract platform types
|
|
platforms = ['RESTful API', 'API', 'SPARQL', 'WorldCat', 'OCLC',
|
|
'Google Arts & Culture', 'Virtual Tours', 'Digital catalogue',
|
|
'Open access', 'OAI-PMH protocol']
|
|
for platform in platforms:
|
|
if platform.lower() in details.lower():
|
|
metadata['platforms'].append(platform)
|
|
|
|
# Extract descriptions (meaningful sentences/fragments)
|
|
# Split by newlines and common delimiters
|
|
fragments = re.split(r'[\n-]', details)
|
|
for fragment in fragments[:5]: # Take first 5 fragments
|
|
clean = fragment.strip()
|
|
# Skip if it's a field label or too short
|
|
if clean and len(clean) > 25 and ':' not in clean[:30]:
|
|
metadata['descriptions'].append(clean)
|
|
|
|
# Also extract inline descriptions after colons
|
|
if ': ' in details:
|
|
parts = details.split(': ', 1)
|
|
if len(parts) > 1 and len(parts[1]) > 30:
|
|
first_sentence = parts[1].split('\n')[0].strip()
|
|
if first_sentence and len(first_sentence) > 25:
|
|
metadata['descriptions'].append(first_sentence)
|
|
|
|
# Remove duplicates and clean up
|
|
for key in metadata:
|
|
if isinstance(metadata[key], list):
|
|
# Remove duplicates while preserving order
|
|
seen = set()
|
|
unique = []
|
|
for item in metadata[key]:
|
|
item_clean = item.strip()
|
|
if item_clean and item_clean not in seen:
|
|
seen.add(item_clean)
|
|
unique.append(item_clean)
|
|
metadata[key] = unique
|
|
|
|
return metadata
|
|
|
|
def _normalize_name(self, name: str) -> str:
|
|
"""Normalize institution name for matching."""
|
|
# Remove parenthetical suffixes
|
|
name = re.sub(r'\([^)]*\)', '', name)
|
|
# Remove common prefixes/suffixes
|
|
name = re.sub(r'^(Museo|Biblioteca|Archivo|Instituto|Universidad)\s+', '', name, flags=re.IGNORECASE)
|
|
# Lowercase and remove extra whitespace
|
|
name = ' '.join(name.lower().split())
|
|
# Remove accents (basic)
|
|
name = name.replace('á', 'a').replace('é', 'e').replace('í', 'i')
|
|
name = name.replace('ó', 'o').replace('ú', 'u').replace('ñ', 'n')
|
|
return name
|
|
|
|
def enrich_institutions(self):
|
|
"""Enrich each institution with data from artifacts."""
|
|
print("\n=== Enriching Institutions ===")
|
|
enriched_count = 0
|
|
|
|
for institution in self.institutions:
|
|
original_name = institution.get('name', '')
|
|
norm_name = self._normalize_name(original_name)
|
|
|
|
# Try exact match first
|
|
enrichment = self.enrichment_data.get(norm_name)
|
|
|
|
# Try fuzzy matching if no exact match
|
|
if not enrichment:
|
|
enrichment = self._fuzzy_match(norm_name)
|
|
|
|
if enrichment:
|
|
self._apply_enrichment(institution, enrichment)
|
|
enriched_count += 1
|
|
else:
|
|
# Mark as not enriched
|
|
if 'description' not in institution or not institution['description']:
|
|
institution['description'] = f"Mexican heritage institution. Further enrichment data not available in source conversations."
|
|
|
|
print(f"✓ Enriched {enriched_count}/{len(self.institutions)} institutions")
|
|
|
|
def _fuzzy_match(self, norm_name: str) -> Optional[Dict]:
|
|
"""Attempt fuzzy matching for institution name."""
|
|
# Check if norm_name is contained in any enrichment key
|
|
for key, data in self.enrichment_data.items():
|
|
if norm_name in key or key in norm_name:
|
|
return data
|
|
return None
|
|
|
|
def _apply_enrichment(self, institution: Dict, enrichment: Dict):
|
|
"""Apply enrichment data to an institution record."""
|
|
metadata = enrichment['metadata']
|
|
|
|
# Enhance description
|
|
desc_parts = []
|
|
|
|
# Start with existing description if good
|
|
existing_desc = institution.get('description', '').strip()
|
|
if (existing_desc and
|
|
existing_desc != "Mexican heritage institution. Further enrichment data not available in source conversations." and
|
|
not existing_desc.startswith('http')): # Avoid URLs in description
|
|
desc_parts.append(existing_desc)
|
|
|
|
# Add new descriptions (skip URLs)
|
|
for desc in metadata['descriptions'][:3]:
|
|
if desc not in desc_parts and not desc.startswith('http'):
|
|
desc_parts.append(desc)
|
|
|
|
# Add collection information to description if not already present
|
|
if metadata['collections'] and not any('collection' in d.lower() or 'holdings' in d.lower() for d in desc_parts):
|
|
coll_summary = metadata['collections'][0]
|
|
if not coll_summary.startswith('http'):
|
|
desc_parts.append(f"Collections: {coll_summary}")
|
|
|
|
# Create final description
|
|
if desc_parts:
|
|
institution['description'] = '. '.join(desc_parts)
|
|
# Clean up double periods
|
|
institution['description'] = institution['description'].replace('..', '.')
|
|
# Ensure it ends with period
|
|
if not institution['description'].endswith('.'):
|
|
institution['description'] += '.'
|
|
elif not institution.get('description'):
|
|
# Fallback: use institution name and type
|
|
type_map = {
|
|
'MUSEUM': 'museum',
|
|
'ARCHIVE': 'archive',
|
|
'LIBRARY': 'library',
|
|
'OFFICIAL_INSTITUTION': 'government cultural institution',
|
|
'MIXED': 'cultural heritage institution'
|
|
}
|
|
inst_type = type_map.get(institution.get('institution_type', 'MIXED'), 'heritage institution')
|
|
institution['description'] = f"Mexican {inst_type} in {enrichment['state'].title()}."
|
|
|
|
# Add/update locations (addresses, cities)
|
|
if metadata['addresses'] or metadata['cities']:
|
|
if 'locations' not in institution:
|
|
institution['locations'] = []
|
|
|
|
# Get or create first location
|
|
if not institution['locations']:
|
|
institution['locations'].append({'country': 'MX'})
|
|
|
|
location = institution['locations'][0]
|
|
|
|
# Add street address (only if it looks like a real address, not a URL)
|
|
if metadata['addresses']:
|
|
for addr in metadata['addresses']:
|
|
if not addr.startswith('http') and len(addr) > 10:
|
|
location['street_address'] = addr.strip()
|
|
break
|
|
|
|
# Extract or set city
|
|
if 'city' not in location or not location.get('city'):
|
|
if metadata['cities']:
|
|
location['city'] = metadata['cities'][0]
|
|
elif 'street_address' in location:
|
|
# Try to extract city from address
|
|
addr = location['street_address']
|
|
city_match = re.search(r',\s*([A-Z][a-zA-Z\s]+?)(?:,|\s+\d{5}|$)', addr)
|
|
if city_match:
|
|
location['city'] = city_match.group(1).strip()
|
|
|
|
# Keep region if already present
|
|
if 'region' not in location and enrichment['state'] != 'NATIONAL':
|
|
location['region'] = enrichment['state']
|
|
|
|
# Add identifiers (URLs, emails, OCLC, etc.)
|
|
if 'identifiers' not in institution:
|
|
institution['identifiers'] = []
|
|
|
|
# Track existing URLs to avoid duplicates
|
|
existing_urls = {id.get('identifier_value') for id in institution['identifiers']}
|
|
|
|
# Add URLs (limit to top 3 most relevant)
|
|
url_priority = []
|
|
for url in metadata['urls']:
|
|
# Prioritize institutional websites over social media
|
|
if 'facebook.com' in url or 'twitter.com' in url or 'instagram.com' in url:
|
|
priority = 2
|
|
elif 'inah.gob.mx' in url or 'cultura.gob.mx' in url or '.edu' in url or '.gob.mx' in url:
|
|
priority = 0
|
|
else:
|
|
priority = 1
|
|
url_priority.append((priority, url))
|
|
|
|
url_priority.sort()
|
|
|
|
for _, url in url_priority[:3]:
|
|
if url not in existing_urls:
|
|
institution['identifiers'].append({
|
|
'identifier_scheme': 'Website',
|
|
'identifier_value': url,
|
|
'identifier_url': url
|
|
})
|
|
existing_urls.add(url)
|
|
|
|
# Add emails (limit to 2)
|
|
for email in metadata['emails'][:2]:
|
|
if not any(id.get('identifier_value') == email for id in institution['identifiers']):
|
|
institution['identifiers'].append({
|
|
'identifier_scheme': 'Email',
|
|
'identifier_value': email
|
|
})
|
|
|
|
# Add phone numbers
|
|
for phone in metadata['phones'][:1]:
|
|
if not any(id.get('identifier_value') == phone for id in institution['identifiers']):
|
|
institution['identifiers'].append({
|
|
'identifier_scheme': 'Phone',
|
|
'identifier_value': phone.strip()
|
|
})
|
|
|
|
# Add OCLC identifier if mentioned
|
|
if any('OCLC' in p or 'WorldCat' in p for p in metadata['platforms']):
|
|
if not any(id.get('identifier_scheme') == 'OCLC' for id in institution['identifiers']):
|
|
institution['identifiers'].append({
|
|
'identifier_scheme': 'OCLC',
|
|
'identifier_value': 'Catalogued in WorldCat'
|
|
})
|
|
|
|
# Add digital platforms
|
|
if metadata['platforms'] or metadata['metadata_standards'] or metadata['urls']:
|
|
if 'digital_platforms' not in institution:
|
|
institution['digital_platforms'] = []
|
|
|
|
# Create comprehensive platform entry
|
|
platform_entry = {}
|
|
|
|
# Use primary URL
|
|
if metadata['urls']:
|
|
platform_entry['platform_url'] = metadata['urls'][0]
|
|
# Extract platform name from URL or use institution name
|
|
url_match = re.search(r'https?://(?:www\.)?([^/]+)', metadata['urls'][0])
|
|
if url_match:
|
|
platform_entry['platform_name'] = url_match.group(1)
|
|
|
|
# Add metadata standards
|
|
if metadata['metadata_standards']:
|
|
platform_entry['metadata_standards'] = metadata['metadata_standards']
|
|
|
|
# Infer platform type
|
|
if 'API' in ' '.join(metadata['platforms']):
|
|
platform_entry['platform_type'] = 'API'
|
|
elif any(x in ' '.join(metadata['platforms']) for x in ['Virtual Tours', 'Google Arts']):
|
|
platform_entry['platform_type'] = 'DISCOVERY_PORTAL'
|
|
elif any(x in ' '.join(metadata['platforms']) for x in ['WorldCat', 'OPAC']):
|
|
platform_entry['platform_type'] = 'CATALOG'
|
|
elif metadata['urls']:
|
|
# Default based on URL
|
|
url = metadata['urls'][0].lower()
|
|
if 'catalog' in url or 'opac' in url:
|
|
platform_entry['platform_type'] = 'CATALOG'
|
|
elif 'repository' in url or 'repositorio' in url:
|
|
platform_entry['platform_type'] = 'REPOSITORY'
|
|
else:
|
|
platform_entry['platform_type'] = 'DISCOVERY_PORTAL'
|
|
|
|
if platform_entry:
|
|
# Avoid duplicate platforms
|
|
if not any(p.get('platform_url') == platform_entry.get('platform_url')
|
|
for p in institution['digital_platforms']):
|
|
institution['digital_platforms'].append(platform_entry)
|
|
|
|
# Add collections metadata
|
|
if metadata['collections']:
|
|
if 'collections' not in institution:
|
|
institution['collections'] = []
|
|
|
|
for coll_desc in metadata['collections'][:2]:
|
|
# Skip URLs
|
|
if coll_desc.startswith('http'):
|
|
continue
|
|
|
|
# Create detailed collection entry
|
|
collection = {
|
|
'collection_name': f"{enrichment['name']} Collection"
|
|
}
|
|
|
|
# Parse extent/count from description
|
|
extent_match = re.search(r'(\d+[,\d+]*)\s*(objects|works|items|volumes|documents|pages|resources|photographs)',
|
|
coll_desc, re.IGNORECASE)
|
|
if extent_match:
|
|
collection['extent'] = f"{extent_match.group(1)} {extent_match.group(2)}"
|
|
else:
|
|
collection['extent'] = coll_desc.strip()
|
|
|
|
# Avoid duplicates
|
|
if not any(c.get('extent') == collection.get('extent') for c in institution['collections']):
|
|
institution['collections'].append(collection)
|
|
|
|
# Update provenance with enrichment metadata
|
|
if 'provenance' in institution:
|
|
institution['provenance']['confidence_score'] = 0.90
|
|
institution['provenance']['extraction_method'] = "Multi-file NLP extraction with manual curation and artifact enrichment"
|
|
|
|
def save_curated_yaml(self):
|
|
"""Save enriched institutions to output YAML file."""
|
|
# Add header
|
|
header = f"""---
|
|
# Mexican GLAM Institutions - CURATED VERSION
|
|
# Manually enriched from conversation artifacts
|
|
#
|
|
# Source conversations:
|
|
# 1. Mexican GLAM inventories and catalogues (2025-09-22)
|
|
# 2. Mexican GLAM resources inventory (2025-09-23)
|
|
#
|
|
# Enrichment artifacts:
|
|
# - Comprehensive Directory of Mexican Heritage Institutions (759 lines)
|
|
# - Mexican GLAM Online Resources Inventory (383 lines)
|
|
#
|
|
# Total institutions: {len(self.institutions)}
|
|
# Curation date: {datetime.now(timezone.utc).isoformat()}
|
|
# Schema: LinkML v0.2.0 (modular)
|
|
# Data tier: TIER_4_INFERRED (with artifact enrichment)
|
|
|
|
"""
|
|
|
|
# Convert to YAML
|
|
yaml_content = yaml.dump(
|
|
self.institutions,
|
|
allow_unicode=True,
|
|
default_flow_style=False,
|
|
sort_keys=False,
|
|
width=120
|
|
)
|
|
|
|
# Write to file
|
|
with open(OUTPUT_YAML, 'w', encoding='utf-8') as f:
|
|
f.write(header)
|
|
f.write(yaml_content)
|
|
|
|
print(f"\n✓ Saved curated data to {OUTPUT_YAML}")
|
|
print(f" File size: {OUTPUT_YAML.stat().st_size / 1024:.1f} KB")
|
|
|
|
def generate_statistics(self):
|
|
"""Generate enrichment statistics report."""
|
|
print("\n" + "="*80)
|
|
print("ENRICHMENT STATISTICS")
|
|
print("="*80)
|
|
|
|
total = len(self.institutions)
|
|
|
|
# Count field completeness
|
|
with_description = sum(1 for i in self.institutions if i.get('description'))
|
|
with_addresses = sum(1 for i in self.institutions
|
|
if i.get('locations') and any(l.get('street_address') for l in i['locations']))
|
|
with_urls = sum(1 for i in self.institutions
|
|
if i.get('identifiers') and len(i['identifiers']) > 1)
|
|
with_platforms = sum(1 for i in self.institutions if i.get('digital_platforms'))
|
|
with_collections = sum(1 for i in self.institutions if i.get('collections'))
|
|
|
|
print(f"\nTotal institutions: {total}")
|
|
print(f"\nField Completeness:")
|
|
print(f" Descriptions: {with_description:3d} ({with_description/total*100:5.1f}%)")
|
|
print(f" Street addresses: {with_addresses:3d} ({with_addresses/total*100:5.1f}%)")
|
|
print(f" Multiple IDs/URLs: {with_urls:3d} ({with_urls/total*100:5.1f}%)")
|
|
print(f" Digital platforms: {with_platforms:3d} ({with_platforms/total*100:5.1f}%)")
|
|
print(f" Collections: {with_collections:3d} ({with_collections/total*100:5.1f}%)")
|
|
|
|
# Institution type breakdown
|
|
print(f"\nInstitution Types:")
|
|
types = {}
|
|
for inst in self.institutions:
|
|
itype = inst.get('institution_type', 'UNKNOWN')
|
|
types[itype] = types.get(itype, 0) + 1
|
|
|
|
for itype in sorted(types.keys()):
|
|
print(f" {itype:20s} {types[itype]:3d} ({types[itype]/total*100:5.1f}%)")
|
|
|
|
print("\n" + "="*80)
|
|
|
|
|
|
def main():
|
|
"""Main enrichment workflow."""
|
|
print("="*80)
|
|
print("MEXICAN GLAM INSTITUTIONS - MANUAL CURATION")
|
|
print("="*80)
|
|
|
|
enricher = InstitutionEnricher()
|
|
|
|
# Step 1: Load existing data
|
|
enricher.load_existing_institutions()
|
|
|
|
# Step 2: Parse markdown artifacts
|
|
enricher.parse_markdown_artifacts()
|
|
|
|
# Step 3: Enrich institutions
|
|
enricher.enrich_institutions()
|
|
|
|
# Step 4: Save curated YAML
|
|
enricher.save_curated_yaml()
|
|
|
|
# Step 5: Generate statistics
|
|
enricher.generate_statistics()
|
|
|
|
print("\n✅ CURATION COMPLETE!")
|
|
print(f"\nOutput: {OUTPUT_YAML}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|