glam/scripts/enrich_mexican_institutions.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

610 lines
26 KiB
Python

#!/usr/bin/env python3
"""
Mexican GLAM Institutions Manual Curation Script
This script enriches 117 Mexican heritage institutions by extracting comprehensive
metadata from conversation artifacts to create fully enriched LinkML-compliant records.
Input:
- /Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml
- /Users/kempersc/apps/glam/data/temp_conv1_artifact2.md (state-by-state directory)
- /Users/kempersc/apps/glam/data/temp_conv2_artifact1.md (national institutions)
Output:
- /Users/kempersc/apps/glam/data/instances/mexican_institutions_curated.yaml
"""
import re
import yaml
from datetime import datetime, timezone
from typing import Dict, List, Any, Optional
from pathlib import Path
# Paths
# NOTE(review): absolute, machine-specific paths — adjust BASE_DIR (or make it
# configurable) before running this script on another machine.
BASE_DIR = Path("/Users/kempersc/apps/glam")
INPUT_YAML = BASE_DIR / "data/instances/mexican_institutions.yaml"
ARTIFACT1 = BASE_DIR / "data/temp_conv1_artifact2.md" # State-by-state
ARTIFACT2 = BASE_DIR / "data/temp_conv2_artifact1.md" # National institutions
OUTPUT_YAML = BASE_DIR / "data/instances/mexican_institutions_curated.yaml"
class InstitutionEnricher:
    """Enrich Mexican GLAM institutions with comprehensive metadata.

    Workflow: load the existing YAML records, parse the two markdown
    conversation artifacts into an enrichment index keyed by normalized
    institution name, merge the extracted metadata into each record, then
    write the curated YAML and a completeness report.
    """

    def __init__(self):
        # Records loaded from INPUT_YAML (list of dicts).
        self.institutions = []
        # Extracted artifact data, keyed by normalized institution name.
        self.enrichment_data = {}

    def load_existing_institutions(self):
        """Load the 117 existing institutions from YAML."""
        with open(INPUT_YAML, 'r', encoding='utf-8') as f:
            content = f.read()
        # Skip header comments
        yaml_content = re.sub(r'^#.*\n', '', content, flags=re.MULTILINE)
        # safe_load returns None for an empty/comment-only document; fall
        # back to an empty list so len() and iteration below don't crash.
        self.institutions = yaml.safe_load(yaml_content) or []
        print(f"✓ Loaded {len(self.institutions)} institutions from {INPUT_YAML.name}")

    def parse_markdown_artifacts(self):
        """Parse both markdown artifacts to extract enrichment data."""
        print("\n=== Parsing Markdown Artifacts ===")
        # Parse state-by-state directory (artifact 1)
        with open(ARTIFACT1, 'r', encoding='utf-8') as f:
            artifact1_content = f.read()
        self._parse_state_directory(artifact1_content)
        # Parse national institutions (artifact 2)
        with open(ARTIFACT2, 'r', encoding='utf-8') as f:
            artifact2_content = f.read()
        self._parse_national_institutions(artifact2_content)
        print(f"✓ Extracted enrichment data for {len(self.enrichment_data)} institutions")

    def _parse_state_directory(self, content: str):
        """Parse state-by-state institutional directory."""
        # re.split with a capture group yields [preamble, name, body, name, body, ...]
        state_sections = re.split(r'\n### ([A-Z\s]+)\n', content)
        for i in range(1, len(state_sections), 2):
            state_name = state_sections[i].strip()
            section_content = state_sections[i + 1]
            # Extract institutions from this state
            self._extract_institutions_from_section(section_content, state_name)

    def _extract_institutions_from_section(self, content: str, state: str):
        """Extract institution details from a state section.

        Only the list-item format (``- **Name**: details`` or ``- **Name**``
        followed by an indented detail block) is parsed; standalone bold
        headers are treated as section labels and skipped.
        """
        # List item format: inline description after an optional colon,
        # then optionally a block of space-indented continuation lines.
        list_pattern = r'- \*\*([^*]+)\*\*(?::?\s*([^\n]+))?(?:\n((?: .*\n?)+))?'
        for match in re.finditer(list_pattern, content, re.MULTILINE):
            name = match.group(1).strip()
            inline_desc = match.group(2) if match.group(2) else ''
            indented_details = match.group(3) if match.group(3) else ''
            # Combine all details
            details = (inline_desc + '\n' + indented_details).strip()
            # Skip section headers
            if name.endswith(':') or name in ['Institutional Custodians', 'Museums', 'Archives',
                                              'Digital Resources', 'Digital Infrastructure',
                                              'University Collections', 'INAH Regional Center',
                                              'Archaeological Documentation', 'State Infrastructure',
                                              'Key Museums', 'Universities', 'Major Art Museums',
                                              'Anthropology and History Museums', 'Major Public Universities',
                                              'Research Centers']:
                continue
            # Extract structured data
            institution_data = {
                'name': name,
                'state': state,
                'details': details,
                'metadata': self._extract_metadata_from_details(details)
            }
            # Store by normalized name for matching; first occurrence wins.
            norm_name = self._normalize_name(name)
            if norm_name not in self.enrichment_data:
                self.enrichment_data[norm_name] = institution_data

    def _parse_national_institutions(self, content: str):
        """Parse national-level institutions and platforms.

        Section titles are intentionally discarded — every national entry is
        filed under the "NATIONAL" pseudo-state.
        """
        sections = re.split(r'\n## (.+)\n', content)
        for i in range(1, len(sections), 2):
            self._extract_institutions_from_section(sections[i + 1], "NATIONAL")

    def _extract_metadata_from_details(self, details: str) -> Dict[str, Any]:
        """Extract structured metadata from an institution details block.

        Returns a dict of lists (urls, emails, phones, addresses, cities,
        descriptions, collections, platforms, metadata_standards,
        identifiers, directors, hours), each deduplicated in order.
        """
        metadata = {
            'urls': [],
            'emails': [],
            'phones': [],
            'addresses': [],
            'cities': [],
            'descriptions': [],
            'collections': [],
            'platforms': [],
            'metadata_standards': [],
            'identifiers': [],
            'directors': [],
            'hours': []
        }
        # Extract URLs (multiple patterns)
        url_patterns = [
            r'(?:URL|Website|Portal|Digital Library|Catalogue|Repository|OPAC|GitHub|Main Website|Alternative Access|Digital Repository):\s*(https?://[^\s\)]+)',
            r'\*\*(?:URL|Website)\*\*:\s*(https?://[^\s\)]+)',
        ]
        for pattern in url_patterns:
            metadata['urls'].extend(re.findall(pattern, details, re.IGNORECASE))
        # Extract generic URLs (not already captured)
        generic_urls = re.findall(r'https?://[^\s\)\]<>,]+', details)
        for url in generic_urls:
            if url not in metadata['urls'] and not url.endswith('...'):
                metadata['urls'].append(url)
        # Extract emails
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        metadata['emails'].extend(re.findall(email_pattern, details))
        # Extract phone numbers (multiple patterns)
        phone_patterns = [
            r'(?:Phone|Tel):\s*([+\d\s\(\)-]+)',
            r'\+52\s*\d+\s*\d+\s*\d+',  # Mexican phone format
            r'\d{3}\s*\d{7}',  # Local Mexican format
        ]
        for pattern in phone_patterns:
            metadata['phones'].extend(re.findall(pattern, details, re.IGNORECASE))
        # Extract addresses (multiple patterns); drop fragments <= 10 chars.
        address_patterns = [
            r'Address:\s*([^\n-]+?)(?:\n|$)',
            r'(?:Calzada|Avenida|Av\.|Calle|C\.|Boulevard)[\s\w\d,.#°-]+(?:\d{5})?',
        ]
        for pattern in address_patterns:
            found = re.findall(pattern, details, re.IGNORECASE)
            metadata['addresses'].extend([a.strip() for a in found if len(a.strip()) > 10])
        # Extract cities (from addresses or standalone)
        city_pattern = r'(?:,\s*)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,|\s+\d{5}|\s*$)'
        metadata['cities'].extend(re.findall(city_pattern, details))
        # Extract collection information (multiple patterns)
        collection_patterns = [
            r'(?:Collection|Holdings|Content|Scope):\s*([^\n-]+)',
            r'(\d+[,+]\s*(?:objects|works|items|volumes|documents|pages|resources|photographs))',
            r'(Over \d+[,\d]*\s+(?:objects|works|items|volumes|documents))',
        ]
        for pattern in collection_patterns:
            metadata['collections'].extend(re.findall(pattern, details, re.IGNORECASE))
        # Extract directors/contacts
        director_pattern = r'Director:\s*([^\n]+)'
        metadata['directors'].extend(re.findall(director_pattern, details))
        # Extract hours
        hours_pattern = r'Hours:\s*([^\n]+)'
        metadata['hours'].extend(re.findall(hours_pattern, details))
        # Extract metadata standards (case-insensitive substring match)
        standards = ['Dublin Core', 'MARC21', 'OAI-PMH', 'IIIF', 'Schema.org',
                     'EAD', 'BIBFRAME', 'Tainacan', 'LIDO', 'RDF', 'JSON-LD']
        for standard in standards:
            if standard.lower() in details.lower():
                metadata['metadata_standards'].append(standard)
        # Extract platform types (case-insensitive substring match)
        platforms = ['RESTful API', 'API', 'SPARQL', 'WorldCat', 'OCLC',
                     'Google Arts & Culture', 'Virtual Tours', 'Digital catalogue',
                     'Open access', 'OAI-PMH protocol']
        for platform in platforms:
            if platform.lower() in details.lower():
                metadata['platforms'].append(platform)
        # Extract descriptions (meaningful sentences/fragments)
        # Split by newlines and common delimiters
        fragments = re.split(r'[\n-]', details)
        for fragment in fragments[:5]:  # Take first 5 fragments
            clean = fragment.strip()
            # Skip if it's a field label or too short
            if clean and len(clean) > 25 and ':' not in clean[:30]:
                metadata['descriptions'].append(clean)
        # Also extract inline descriptions after colons
        if ': ' in details:
            parts = details.split(': ', 1)
            if len(parts) > 1 and len(parts[1]) > 30:
                first_sentence = parts[1].split('\n')[0].strip()
                if first_sentence and len(first_sentence) > 25:
                    metadata['descriptions'].append(first_sentence)
        # Remove duplicates while preserving order, and strip whitespace.
        for key in metadata:
            if isinstance(metadata[key], list):
                seen = set()
                unique = []
                for item in metadata[key]:
                    item_clean = item.strip()
                    if item_clean and item_clean not in seen:
                        seen.add(item_clean)
                        unique.append(item_clean)
                metadata[key] = unique
        return metadata

    def _normalize_name(self, name: str) -> str:
        """Normalize institution name for matching."""
        # Remove parenthetical suffixes
        name = re.sub(r'\([^)]*\)', '', name)
        # Remove common prefixes/suffixes
        name = re.sub(r'^(Museo|Biblioteca|Archivo|Instituto|Universidad)\s+', '', name, flags=re.IGNORECASE)
        # Lowercase and remove extra whitespace
        name = ' '.join(name.lower().split())
        # Strip Spanish accents in one pass (also covers 'ü', which the
        # previous chained .replace() calls missed).
        return name.translate(str.maketrans('áéíóúüñ', 'aeiouun'))

    def enrich_institutions(self):
        """Enrich each institution with data from artifacts."""
        print("\n=== Enriching Institutions ===")
        enriched_count = 0
        for institution in self.institutions:
            original_name = institution.get('name', '')
            norm_name = self._normalize_name(original_name)
            # Try exact match first
            enrichment = self.enrichment_data.get(norm_name)
            # Try fuzzy matching if no exact match
            if not enrichment:
                enrichment = self._fuzzy_match(norm_name)
            if enrichment:
                self._apply_enrichment(institution, enrichment)
                enriched_count += 1
            else:
                # Mark as not enriched (this exact sentinel string is also
                # checked in _apply_enrichment — keep them in sync).
                if 'description' not in institution or not institution['description']:
                    institution['description'] = "Mexican heritage institution. Further enrichment data not available in source conversations."
        print(f"✓ Enriched {enriched_count}/{len(self.institutions)} institutions")

    def _fuzzy_match(self, norm_name: str) -> Optional[Dict]:
        """Attempt fuzzy matching for institution name.

        NOTE(review): plain substring containment — very short normalized
        names can match spuriously; verify results for short names.
        """
        for key, data in self.enrichment_data.items():
            if norm_name in key or key in norm_name:
                return data
        return None

    def _apply_enrichment(self, institution: Dict, enrichment: Dict):
        """Apply enrichment data to an institution record (mutates in place)."""
        metadata = enrichment['metadata']
        # Enhance description
        desc_parts = []
        # Start with existing description if good
        existing_desc = institution.get('description', '').strip()
        if (existing_desc and
                existing_desc != "Mexican heritage institution. Further enrichment data not available in source conversations." and
                not existing_desc.startswith('http')):  # Avoid URLs in description
            desc_parts.append(existing_desc)
        # Add new descriptions (skip URLs)
        for desc in metadata['descriptions'][:3]:
            if desc not in desc_parts and not desc.startswith('http'):
                desc_parts.append(desc)
        # Add collection information to description if not already present
        if metadata['collections'] and not any('collection' in d.lower() or 'holdings' in d.lower() for d in desc_parts):
            coll_summary = metadata['collections'][0]
            if not coll_summary.startswith('http'):
                desc_parts.append(f"Collections: {coll_summary}")
        # Create final description
        if desc_parts:
            institution['description'] = '. '.join(desc_parts)
            # Clean up double periods
            institution['description'] = institution['description'].replace('..', '.')
            # Ensure it ends with period
            if not institution['description'].endswith('.'):
                institution['description'] += '.'
        elif not institution.get('description'):
            # Fallback: use institution name and type
            type_map = {
                'MUSEUM': 'museum',
                'ARCHIVE': 'archive',
                'LIBRARY': 'library',
                'OFFICIAL_INSTITUTION': 'government cultural institution',
                'MIXED': 'cultural heritage institution'
            }
            inst_type = type_map.get(institution.get('institution_type', 'MIXED'), 'heritage institution')
            institution['description'] = f"Mexican {inst_type} in {enrichment['state'].title()}."
        # Add/update locations (addresses, cities)
        if metadata['addresses'] or metadata['cities']:
            if 'locations' not in institution:
                institution['locations'] = []
            # Get or create first location
            if not institution['locations']:
                institution['locations'].append({'country': 'MX'})
            location = institution['locations'][0]
            # Add street address (only if it looks like a real address, not a URL)
            if metadata['addresses']:
                for addr in metadata['addresses']:
                    if not addr.startswith('http') and len(addr) > 10:
                        location['street_address'] = addr.strip()
                        break
            # Extract or set city
            if 'city' not in location or not location.get('city'):
                if metadata['cities']:
                    location['city'] = metadata['cities'][0]
                elif 'street_address' in location:
                    # Try to extract city from address
                    addr = location['street_address']
                    city_match = re.search(r',\s*([A-Z][a-zA-Z\s]+?)(?:,|\s+\d{5}|$)', addr)
                    if city_match:
                        location['city'] = city_match.group(1).strip()
            # Keep region if already present
            if 'region' not in location and enrichment['state'] != 'NATIONAL':
                location['region'] = enrichment['state']
        # Add identifiers (URLs, emails, OCLC, etc.)
        if 'identifiers' not in institution:
            institution['identifiers'] = []
        # Track existing URLs to avoid duplicates
        # ('ident', not 'id', to avoid shadowing the builtin)
        existing_urls = {ident.get('identifier_value') for ident in institution['identifiers']}
        # Add URLs (limit to top 3 most relevant)
        url_priority = []
        for url in metadata['urls']:
            # Prioritize institutional websites over social media
            if 'facebook.com' in url or 'twitter.com' in url or 'instagram.com' in url:
                priority = 2
            elif 'inah.gob.mx' in url or 'cultura.gob.mx' in url or '.edu' in url or '.gob.mx' in url:
                priority = 0
            else:
                priority = 1
            url_priority.append((priority, url))
        url_priority.sort()
        for _, url in url_priority[:3]:
            if url not in existing_urls:
                institution['identifiers'].append({
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url
                })
                existing_urls.add(url)
        # Add emails (limit to 2)
        for email in metadata['emails'][:2]:
            if not any(ident.get('identifier_value') == email for ident in institution['identifiers']):
                institution['identifiers'].append({
                    'identifier_scheme': 'Email',
                    'identifier_value': email
                })
        # Add phone numbers
        for phone in metadata['phones'][:1]:
            if not any(ident.get('identifier_value') == phone for ident in institution['identifiers']):
                institution['identifiers'].append({
                    'identifier_scheme': 'Phone',
                    'identifier_value': phone.strip()
                })
        # Add OCLC identifier if mentioned
        if any('OCLC' in p or 'WorldCat' in p for p in metadata['platforms']):
            if not any(ident.get('identifier_scheme') == 'OCLC' for ident in institution['identifiers']):
                institution['identifiers'].append({
                    'identifier_scheme': 'OCLC',
                    'identifier_value': 'Catalogued in WorldCat'
                })
        # Add digital platforms
        if metadata['platforms'] or metadata['metadata_standards'] or metadata['urls']:
            if 'digital_platforms' not in institution:
                institution['digital_platforms'] = []
            # Create comprehensive platform entry
            platform_entry = {}
            # Use primary URL
            if metadata['urls']:
                platform_entry['platform_url'] = metadata['urls'][0]
                # Extract platform name from URL or use institution name
                url_match = re.search(r'https?://(?:www\.)?([^/]+)', metadata['urls'][0])
                if url_match:
                    platform_entry['platform_name'] = url_match.group(1)
            # Add metadata standards
            if metadata['metadata_standards']:
                platform_entry['metadata_standards'] = metadata['metadata_standards']
            # Infer platform type
            if 'API' in ' '.join(metadata['platforms']):
                platform_entry['platform_type'] = 'API'
            elif any(x in ' '.join(metadata['platforms']) for x in ['Virtual Tours', 'Google Arts']):
                platform_entry['platform_type'] = 'DISCOVERY_PORTAL'
            elif any(x in ' '.join(metadata['platforms']) for x in ['WorldCat', 'OPAC']):
                platform_entry['platform_type'] = 'CATALOG'
            elif metadata['urls']:
                # Default based on URL
                url = metadata['urls'][0].lower()
                if 'catalog' in url or 'opac' in url:
                    platform_entry['platform_type'] = 'CATALOG'
                elif 'repository' in url or 'repositorio' in url:
                    platform_entry['platform_type'] = 'REPOSITORY'
                else:
                    platform_entry['platform_type'] = 'DISCOVERY_PORTAL'
            if platform_entry:
                # Avoid duplicate platforms
                if not any(p.get('platform_url') == platform_entry.get('platform_url')
                           for p in institution['digital_platforms']):
                    institution['digital_platforms'].append(platform_entry)
        # Add collections metadata
        if metadata['collections']:
            if 'collections' not in institution:
                institution['collections'] = []
            for coll_desc in metadata['collections'][:2]:
                # Skip URLs
                if coll_desc.startswith('http'):
                    continue
                # Create detailed collection entry
                collection = {
                    'collection_name': f"{enrichment['name']} Collection"
                }
                # Parse extent/count from description
                extent_match = re.search(r'(\d+[,\d+]*)\s*(objects|works|items|volumes|documents|pages|resources|photographs)',
                                         coll_desc, re.IGNORECASE)
                if extent_match:
                    collection['extent'] = f"{extent_match.group(1)} {extent_match.group(2)}"
                else:
                    collection['extent'] = coll_desc.strip()
                # Avoid duplicates
                if not any(c.get('extent') == collection.get('extent') for c in institution['collections']):
                    institution['collections'].append(collection)
        # Update provenance with enrichment metadata
        if 'provenance' in institution:
            institution['provenance']['confidence_score'] = 0.90
            institution['provenance']['extraction_method'] = "Multi-file NLP extraction with manual curation and artifact enrichment"

    def save_curated_yaml(self):
        """Save enriched institutions to output YAML file."""
        # Add header (kept flush-left: it is written verbatim to the file)
        header = f"""---
# Mexican GLAM Institutions - CURATED VERSION
# Manually enriched from conversation artifacts
#
# Source conversations:
# 1. Mexican GLAM inventories and catalogues (2025-09-22)
# 2. Mexican GLAM resources inventory (2025-09-23)
#
# Enrichment artifacts:
# - Comprehensive Directory of Mexican Heritage Institutions (759 lines)
# - Mexican GLAM Online Resources Inventory (383 lines)
#
# Total institutions: {len(self.institutions)}
# Curation date: {datetime.now(timezone.utc).isoformat()}
# Schema: LinkML v0.2.0 (modular)
# Data tier: TIER_4_INFERRED (with artifact enrichment)
"""
        # Convert to YAML
        yaml_content = yaml.dump(
            self.institutions,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120
        )
        # Write to file
        with open(OUTPUT_YAML, 'w', encoding='utf-8') as f:
            f.write(header)
            f.write(yaml_content)
        print(f"\n✓ Saved curated data to {OUTPUT_YAML}")
        print(f"  File size: {OUTPUT_YAML.stat().st_size / 1024:.1f} KB")

    def generate_statistics(self):
        """Generate enrichment statistics report."""
        print("\n" + "="*80)
        print("ENRICHMENT STATISTICS")
        print("="*80)
        total = len(self.institutions)
        # Guard: the percentage columns below divide by total.
        if total == 0:
            print("\nTotal institutions: 0")
            return
        # Count field completeness
        with_description = sum(1 for i in self.institutions if i.get('description'))
        with_addresses = sum(1 for i in self.institutions
                             if i.get('locations') and any(l.get('street_address') for l in i['locations']))
        with_urls = sum(1 for i in self.institutions
                        if i.get('identifiers') and len(i['identifiers']) > 1)
        with_platforms = sum(1 for i in self.institutions if i.get('digital_platforms'))
        with_collections = sum(1 for i in self.institutions if i.get('collections'))
        print(f"\nTotal institutions: {total}")
        print(f"\nField Completeness:")
        print(f"  Descriptions:      {with_description:3d} ({with_description/total*100:5.1f}%)")
        print(f"  Street addresses:  {with_addresses:3d} ({with_addresses/total*100:5.1f}%)")
        print(f"  Multiple IDs/URLs: {with_urls:3d} ({with_urls/total*100:5.1f}%)")
        print(f"  Digital platforms: {with_platforms:3d} ({with_platforms/total*100:5.1f}%)")
        print(f"  Collections:       {with_collections:3d} ({with_collections/total*100:5.1f}%)")
        # Institution type breakdown
        print(f"\nInstitution Types:")
        types = {}
        for inst in self.institutions:
            itype = inst.get('institution_type', 'UNKNOWN')
            types[itype] = types.get(itype, 0) + 1
        for itype in sorted(types.keys()):
            print(f"  {itype:20s} {types[itype]:3d} ({types[itype]/total*100:5.1f}%)")
        print("\n" + "="*80)
def main():
    """Main enrichment workflow.

    Runs the five curation steps in order: load, parse artifacts, enrich,
    save, report.
    """
    banner = "=" * 80
    print(banner)
    print("MEXICAN GLAM INSTITUTIONS - MANUAL CURATION")
    print(banner)
    enricher = InstitutionEnricher()
    pipeline = (
        enricher.load_existing_institutions,   # Step 1: Load existing data
        enricher.parse_markdown_artifacts,     # Step 2: Parse markdown artifacts
        enricher.enrich_institutions,          # Step 3: Enrich institutions
        enricher.save_curated_yaml,            # Step 4: Save curated YAML
        enricher.generate_statistics,          # Step 5: Generate statistics
    )
    for step in pipeline:
        step()
    print("\n✅ CURATION COMPLETE!")
    print(f"\nOutput: {OUTPUT_YAML}")


if __name__ == "__main__":
    main()