#!/usr/bin/env python3
"""
Mexican GLAM Institutions Manual Curation Script

This script enriches 117 Mexican heritage institutions by extracting
comprehensive metadata from conversation artifacts to create fully enriched
LinkML-compliant records.

Input:
- /Users/kempersc/apps/glam/data/instances/mexican_institutions.yaml
- /Users/kempersc/apps/glam/data/temp_conv1_artifact2.md (state-by-state directory)
- /Users/kempersc/apps/glam/data/temp_conv2_artifact1.md (national institutions)

Output:
- /Users/kempersc/apps/glam/data/instances/mexican_institutions_curated.yaml
"""

import re
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import yaml

# Paths
BASE_DIR = Path("/Users/kempersc/apps/glam")
INPUT_YAML = BASE_DIR / "data/instances/mexican_institutions.yaml"
ARTIFACT1 = BASE_DIR / "data/temp_conv1_artifact2.md"  # State-by-state
ARTIFACT2 = BASE_DIR / "data/temp_conv2_artifact1.md"  # National institutions
OUTPUT_YAML = BASE_DIR / "data/instances/mexican_institutions_curated.yaml"

# Placeholder written when no enrichment data is found.  Defined once because
# enrich_institutions() writes it and _apply_enrichment() compares against it;
# duplicating the literal in both places risked the two copies drifting apart.
NO_DATA_DESCRIPTION = (
    "Mexican heritage institution. Further enrichment data not available "
    "in source conversations."
)

# Markdown sub-section headings that look like institution names in the list
# pattern but are actually structural headers — skipped during extraction.
SECTION_HEADERS = frozenset([
    'Institutional Custodians', 'Museums', 'Archives', 'Digital Resources',
    'Digital Infrastructure', 'University Collections', 'INAH Regional Center',
    'Archaeological Documentation', 'State Infrastructure', 'Key Museums',
    'Universities', 'Major Art Museums', 'Anthropology and History Museums',
    'Major Public Universities', 'Research Centers',
])


class InstitutionEnricher:
    """Enrich Mexican GLAM institutions with comprehensive metadata.

    Workflow: load the existing YAML records, parse the two markdown
    conversation artifacts into per-institution metadata keyed by a
    normalized name, then merge that metadata into each record.
    """

    def __init__(self):
        # Records loaded from INPUT_YAML (list of dicts).
        self.institutions: List[Dict[str, Any]] = []
        # Normalized institution name -> extracted enrichment data.
        self.enrichment_data: Dict[str, Dict[str, Any]] = {}

    def load_existing_institutions(self):
        """Load the 117 existing institutions from YAML."""
        # yaml.safe_load handles '#' comments natively; the previous regex
        # pre-strip (r'^#.*\n' with MULTILINE) would also have deleted any
        # DATA line starting with '#', e.g. inside a block scalar.
        with open(INPUT_YAML, 'r', encoding='utf-8') as f:
            self.institutions = yaml.safe_load(f)
        print(f"✓ Loaded {len(self.institutions)} institutions from {INPUT_YAML.name}")

    def parse_markdown_artifacts(self):
        """Parse both markdown artifacts to extract enrichment data."""
        print("\n=== Parsing Markdown Artifacts ===")

        # Parse state-by-state directory (artifact 1)
        with open(ARTIFACT1, 'r', encoding='utf-8') as f:
            self._parse_state_directory(f.read())

        # Parse national institutions (artifact 2)
        with open(ARTIFACT2, 'r', encoding='utf-8') as f:
            self._parse_national_institutions(f.read())

        print(f"✓ Extracted enrichment data for {len(self.enrichment_data)} institutions")

    def _parse_state_directory(self, content: str):
        """Parse state-by-state institutional directory.

        Splits on '### STATE NAME' headings; re.split yields alternating
        [preamble, state1, body1, state2, body2, ...], hence the stride of 2.
        """
        state_sections = re.split(r'\n### ([A-Z\s]+)\n', content)
        for i in range(1, len(state_sections), 2):
            state_name = state_sections[i].strip()
            self._extract_institutions_from_section(state_sections[i + 1], state_name)

    def _parse_national_institutions(self, content: str):
        """Parse national-level institutions and platforms.

        Splits on '## Title' headings; the titles themselves are not needed —
        every section is attributed to the pseudo-state "NATIONAL".
        """
        sections = re.split(r'\n## (.+)\n', content)
        for i in range(1, len(sections), 2):
            self._extract_institutions_from_section(sections[i + 1], "NATIONAL")

    def _extract_institutions_from_section(self, content: str, state: str):
        """Extract institution details from a state section.

        Matches list items of the form '- **Name**: details' or '- **Name**'
        followed by indented detail lines.  Group 2 captures an inline
        description, group 3 any indented continuation block.
        """
        list_pattern = r'- \*\*([^*]+)\*\*(?::?\s*([^\n]+))?(?:\n((?: .*\n?)+))?'

        for match in re.finditer(list_pattern, content, re.MULTILINE):
            name = match.group(1).strip()
            inline_desc = match.group(2) or ''
            indented_details = match.group(3) or ''
            details = (inline_desc + '\n' + indented_details).strip()

            # Skip structural sub-headers masquerading as institution names.
            if name.endswith(':') or name in SECTION_HEADERS:
                continue

            institution_data = {
                'name': name,
                'state': state,
                'details': details,
                'metadata': self._extract_metadata_from_details(details),
            }

            # First occurrence wins; store by normalized name for matching.
            norm_name = self._normalize_name(name)
            self.enrichment_data.setdefault(norm_name, institution_data)

    def _extract_metadata_from_details(self, details: str) -> Dict[str, Any]:
        """Extract structured metadata from an institution details block.

        Returns a dict of lists (urls, emails, phones, addresses, cities,
        descriptions, collections, platforms, metadata_standards,
        identifiers, directors, hours), each de-duplicated in order.
        """
        metadata: Dict[str, Any] = {
            'urls': [], 'emails': [], 'phones': [], 'addresses': [],
            'cities': [], 'descriptions': [], 'collections': [],
            'platforms': [], 'metadata_standards': [], 'identifiers': [],
            'directors': [], 'hours': [],
        }

        # Extract URLs (labelled fields first, then any remaining bare URLs)
        url_patterns = [
            r'(?:URL|Website|Portal|Digital Library|Catalogue|Repository|OPAC|GitHub|Main Website|Alternative Access|Digital Repository):\s*(https?://[^\s\)]+)',
            r'\*\*(?:URL|Website)\*\*:\s*(https?://[^\s\)]+)',
        ]
        for pattern in url_patterns:
            metadata['urls'].extend(re.findall(pattern, details, re.IGNORECASE))
        for url in re.findall(r'https?://[^\s\)\]<>,]+', details):
            # Skip duplicates and truncated URLs ending in an ellipsis.
            if url not in metadata['urls'] and not url.endswith('...'):
                metadata['urls'].append(url)

        # Extract emails
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        metadata['emails'].extend(re.findall(email_pattern, details))

        # Extract phone numbers (labelled, +52 international, bare local)
        phone_patterns = [
            r'(?:Phone|Tel):\s*([+\d\s\(\)-]+)',
            r'\+52\s*\d+\s*\d+\s*\d+',  # Mexican phone format
            r'\d{3}\s*\d{7}',           # Local Mexican format
        ]
        for pattern in phone_patterns:
            metadata['phones'].extend(re.findall(pattern, details, re.IGNORECASE))

        # Extract addresses; require >10 chars to drop fragments.
        address_patterns = [
            r'Address:\s*([^\n-]+?)(?:\n|$)',
            r'(?:Calzada|Avenida|Av\.|Calle|C\.|Boulevard)[\s\w\d,.#°-]+(?:\d{5})?',
        ]
        for pattern in address_patterns:
            found = re.findall(pattern, details, re.IGNORECASE)
            metadata['addresses'].extend(a.strip() for a in found if len(a.strip()) > 10)

        # Extract cities (capitalized word runs after a comma, before a
        # comma / 5-digit postal code / end of line).
        city_pattern = r'(?:,\s*)([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)(?:,|\s+\d{5}|\s*$)'
        metadata['cities'].extend(re.findall(city_pattern, details))

        # Extract collection information (labelled fields and counted extents)
        collection_patterns = [
            r'(?:Collection|Holdings|Content|Scope):\s*([^\n-]+)',
            r'(\d+[,+]\s*(?:objects|works|items|volumes|documents|pages|resources|photographs))',
            r'(Over \d+[,\d]*\s+(?:objects|works|items|volumes|documents))',
        ]
        for pattern in collection_patterns:
            metadata['collections'].extend(re.findall(pattern, details, re.IGNORECASE))

        # Extract directors/contacts and opening hours
        metadata['directors'].extend(re.findall(r'Director:\s*([^\n]+)', details))
        metadata['hours'].extend(re.findall(r'Hours:\s*([^\n]+)', details))

        # Extract metadata standards by case-insensitive keyword scan
        standards = ['Dublin Core', 'MARC21', 'OAI-PMH', 'IIIF', 'Schema.org',
                     'EAD', 'BIBFRAME', 'Tainacan', 'LIDO', 'RDF', 'JSON-LD']
        details_lower = details.lower()
        metadata['metadata_standards'].extend(
            s for s in standards if s.lower() in details_lower)

        # Extract platform types by case-insensitive keyword scan
        platforms = ['RESTful API', 'API', 'SPARQL', 'WorldCat', 'OCLC',
                     'Google Arts & Culture', 'Virtual Tours',
                     'Digital catalogue', 'Open access', 'OAI-PMH protocol']
        metadata['platforms'].extend(
            p for p in platforms if p.lower() in details_lower)

        # Extract descriptions: first 5 fragments split on newline/hyphen
        # that are long enough and don't start with a field label.
        fragments = re.split(r'[\n-]', details)
        for fragment in fragments[:5]:
            clean = fragment.strip()
            if clean and len(clean) > 25 and ':' not in clean[:30]:
                metadata['descriptions'].append(clean)

        # Also extract the inline description following the first ': '.
        if ': ' in details:
            parts = details.split(': ', 1)
            if len(parts) > 1 and len(parts[1]) > 30:
                first_sentence = parts[1].split('\n')[0].strip()
                if first_sentence and len(first_sentence) > 25:
                    metadata['descriptions'].append(first_sentence)

        # Strip and de-duplicate every list while preserving order.
        for key, value in metadata.items():
            if isinstance(value, list):
                seen = set()
                unique = []
                for item in value:
                    item_clean = item.strip()
                    if item_clean and item_clean not in seen:
                        seen.add(item_clean)
                        unique.append(item_clean)
                metadata[key] = unique

        return metadata

    def _normalize_name(self, name: str) -> str:
        """Normalize an institution name for matching.

        Drops parentheticals and generic institutional prefixes, lowercases,
        collapses whitespace, and strips diacritics.
        """
        # Remove parenthetical suffixes
        name = re.sub(r'\([^)]*\)', '', name)
        # Remove common institutional prefixes
        name = re.sub(r'^(Museo|Biblioteca|Archivo|Instituto|Universidad)\s+',
                      '', name, flags=re.IGNORECASE)
        # Lowercase and remove extra whitespace
        name = ' '.join(name.lower().split())
        # Strip diacritics via Unicode decomposition (covers á/é/í/ó/ú/ñ and
        # any other accented character, unlike a fixed .replace() chain).
        name = ''.join(ch for ch in unicodedata.normalize('NFKD', name)
                       if not unicodedata.combining(ch))
        return name

    def enrich_institutions(self):
        """Enrich each institution with data from the parsed artifacts."""
        print("\n=== Enriching Institutions ===")

        enriched_count = 0
        for institution in self.institutions:
            norm_name = self._normalize_name(institution.get('name', ''))

            # Exact match first, then substring-based fuzzy match.
            enrichment = self.enrichment_data.get(norm_name)
            if not enrichment:
                enrichment = self._fuzzy_match(norm_name)

            if enrichment:
                self._apply_enrichment(institution, enrichment)
                enriched_count += 1
            elif not institution.get('description'):
                # Mark as not enriched (only if no description exists yet).
                institution['description'] = NO_DATA_DESCRIPTION

        print(f"✓ Enriched {enriched_count}/{len(self.institutions)} institutions")

    def _fuzzy_match(self, norm_name: str) -> Optional[Dict]:
        """Return enrichment data whose key contains (or is contained in)
        the normalized name, or None."""
        for key, data in self.enrichment_data.items():
            if norm_name in key or key in norm_name:
                return data
        return None

    def _apply_enrichment(self, institution: Dict, enrichment: Dict):
        """Apply enrichment data to an institution record (in place)."""
        metadata = enrichment['metadata']
        self._enrich_description(institution, enrichment, metadata)
        self._enrich_location(institution, enrichment, metadata)
        self._enrich_identifiers(institution, metadata)
        self._enrich_platforms(institution, metadata)
        self._enrich_collections(institution, enrichment, metadata)
        self._update_provenance(institution)

    def _enrich_description(self, institution: Dict, enrichment: Dict,
                            metadata: Dict):
        """Build an enhanced description from existing text + extracted data."""
        desc_parts = []

        # Start with the existing description if it is real prose (not the
        # no-data placeholder and not a stray URL).
        existing_desc = institution.get('description', '').strip()
        if (existing_desc
                and existing_desc != NO_DATA_DESCRIPTION
                and not existing_desc.startswith('http')):
            desc_parts.append(existing_desc)

        # Add up to 3 new extracted descriptions (skip URLs and duplicates).
        for desc in metadata['descriptions'][:3]:
            if desc not in desc_parts and not desc.startswith('http'):
                desc_parts.append(desc)

        # Mention collections unless a part already covers them.
        if metadata['collections'] and not any(
                'collection' in d.lower() or 'holdings' in d.lower()
                for d in desc_parts):
            coll_summary = metadata['collections'][0]
            if not coll_summary.startswith('http'):
                desc_parts.append(f"Collections: {coll_summary}")

        if desc_parts:
            description = '. '.join(desc_parts)
            # Collapse double periods created by joining sentence-final parts.
            description = description.replace('..', '.')
            if not description.endswith('.'):
                description += '.'
            institution['description'] = description
        elif not institution.get('description'):
            # Fallback: synthesize from institution type and state.
            type_map = {
                'MUSEUM': 'museum',
                'ARCHIVE': 'archive',
                'LIBRARY': 'library',
                'OFFICIAL_INSTITUTION': 'government cultural institution',
                'MIXED': 'cultural heritage institution',
            }
            inst_type = type_map.get(
                institution.get('institution_type', 'MIXED'),
                'heritage institution')
            institution['description'] = (
                f"Mexican {inst_type} in {enrichment['state'].title()}.")

    def _enrich_location(self, institution: Dict, enrichment: Dict,
                         metadata: Dict):
        """Add/update the first location entry (address, city, region)."""
        if not (metadata['addresses'] or metadata['cities']):
            return

        institution.setdefault('locations', [])
        if not institution['locations']:
            institution['locations'].append({'country': 'MX'})
        location = institution['locations'][0]

        # First plausible street address wins (not a URL, long enough).
        for addr in metadata['addresses']:
            if not addr.startswith('http') and len(addr) > 10:
                location['street_address'] = addr.strip()
                break

        # Set city from extracted cities, else try to parse it out of the
        # street address; never overwrite an existing city.
        if not location.get('city'):
            if metadata['cities']:
                location['city'] = metadata['cities'][0]
            elif 'street_address' in location:
                city_match = re.search(
                    r',\s*([A-Z][a-zA-Z\s]+?)(?:,|\s+\d{5}|$)',
                    location['street_address'])
                if city_match:
                    location['city'] = city_match.group(1).strip()

        # Record the source state as region (keep any existing region).
        if 'region' not in location and enrichment['state'] != 'NATIONAL':
            location['region'] = enrichment['state']

    def _enrich_identifiers(self, institution: Dict, metadata: Dict):
        """Append website/email/phone/OCLC identifiers without duplicates."""
        institution.setdefault('identifiers', [])
        identifiers = institution['identifiers']
        existing_urls = {ident.get('identifier_value') for ident in identifiers}

        # Rank URLs: official (.gob.mx/.edu/INAH/cultura) before generic
        # before social media; keep at most the top 3.
        url_priority = []
        for url in metadata['urls']:
            if ('facebook.com' in url or 'twitter.com' in url
                    or 'instagram.com' in url):
                priority = 2
            elif ('inah.gob.mx' in url or 'cultura.gob.mx' in url
                    or '.edu' in url or '.gob.mx' in url):
                priority = 0
            else:
                priority = 1
            url_priority.append((priority, url))
        url_priority.sort()

        for _, url in url_priority[:3]:
            if url not in existing_urls:
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': url,
                    'identifier_url': url,
                })
                existing_urls.add(url)

        # Add up to 2 emails
        for email in metadata['emails'][:2]:
            if not any(ident.get('identifier_value') == email
                       for ident in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Email',
                    'identifier_value': email,
                })

        # Add at most one phone number
        for phone in metadata['phones'][:1]:
            if not any(ident.get('identifier_value') == phone
                       for ident in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Phone',
                    'identifier_value': phone.strip(),
                })

        # Flag WorldCat presence with a synthetic OCLC identifier
        if any('OCLC' in p or 'WorldCat' in p for p in metadata['platforms']):
            if not any(ident.get('identifier_scheme') == 'OCLC'
                       for ident in identifiers):
                identifiers.append({
                    'identifier_scheme': 'OCLC',
                    'identifier_value': 'Catalogued in WorldCat',
                })

    def _enrich_platforms(self, institution: Dict, metadata: Dict):
        """Add a digital-platform entry inferred from URLs/standards."""
        if not (metadata['platforms'] or metadata['metadata_standards']
                or metadata['urls']):
            return

        institution.setdefault('digital_platforms', [])
        platform_entry: Dict[str, Any] = {}

        if metadata['urls']:
            platform_entry['platform_url'] = metadata['urls'][0]
            # Use the URL's host as the platform name
            url_match = re.search(r'https?://(?:www\.)?([^/]+)',
                                  metadata['urls'][0])
            if url_match:
                platform_entry['platform_name'] = url_match.group(1)

        if metadata['metadata_standards']:
            platform_entry['metadata_standards'] = metadata['metadata_standards']

        # Infer platform type from keywords, falling back to URL heuristics.
        platform_text = ' '.join(metadata['platforms'])
        if 'API' in platform_text:
            platform_entry['platform_type'] = 'API'
        elif any(x in platform_text for x in ['Virtual Tours', 'Google Arts']):
            platform_entry['platform_type'] = 'DISCOVERY_PORTAL'
        elif any(x in platform_text for x in ['WorldCat', 'OPAC']):
            platform_entry['platform_type'] = 'CATALOG'
        elif metadata['urls']:
            url = metadata['urls'][0].lower()
            if 'catalog' in url or 'opac' in url:
                platform_entry['platform_type'] = 'CATALOG'
            elif 'repository' in url or 'repositorio' in url:
                platform_entry['platform_type'] = 'REPOSITORY'
            else:
                platform_entry['platform_type'] = 'DISCOVERY_PORTAL'

        if platform_entry and not any(
                p.get('platform_url') == platform_entry.get('platform_url')
                for p in institution['digital_platforms']):
            institution['digital_platforms'].append(platform_entry)

    def _enrich_collections(self, institution: Dict, enrichment: Dict,
                            metadata: Dict):
        """Add up to two collection entries with parsed extents."""
        if not metadata['collections']:
            return

        institution.setdefault('collections', [])
        for coll_desc in metadata['collections'][:2]:
            if coll_desc.startswith('http'):
                continue

            collection = {
                'collection_name': f"{enrichment['name']} Collection",
            }
            # Prefer a parsed "<count> <unit>" extent; fall back to raw text.
            extent_match = re.search(
                r'(\d+[,\d+]*)\s*(objects|works|items|volumes|documents|pages|resources|photographs)',
                coll_desc, re.IGNORECASE)
            if extent_match:
                collection['extent'] = (
                    f"{extent_match.group(1)} {extent_match.group(2)}")
            else:
                collection['extent'] = coll_desc.strip()

            if not any(c.get('extent') == collection.get('extent')
                       for c in institution['collections']):
                institution['collections'].append(collection)

    def _update_provenance(self, institution: Dict):
        """Record the enrichment pass in the record's provenance, if any."""
        if 'provenance' in institution:
            institution['provenance']['confidence_score'] = 0.90
            institution['provenance']['extraction_method'] = (
                "Multi-file NLP extraction with manual curation and "
                "artifact enrichment")

    def save_curated_yaml(self):
        """Save enriched institutions to the output YAML file."""
        header = f"""---
# Mexican GLAM Institutions - CURATED VERSION
# Manually enriched from conversation artifacts
#
# Source conversations:
# 1. Mexican GLAM inventories and catalogues (2025-09-22)
# 2. Mexican GLAM resources inventory (2025-09-23)
#
# Enrichment artifacts:
# - Comprehensive Directory of Mexican Heritage Institutions (759 lines)
# - Mexican GLAM Online Resources Inventory (383 lines)
#
# Total institutions: {len(self.institutions)}
# Curation date: {datetime.now(timezone.utc).isoformat()}
# Schema: LinkML v0.2.0 (modular)
# Data tier: TIER_4_INFERRED (with artifact enrichment)

"""
        yaml_content = yaml.dump(
            self.institutions,
            allow_unicode=True,
            default_flow_style=False,
            sort_keys=False,
            width=120,
        )

        with open(OUTPUT_YAML, 'w', encoding='utf-8') as f:
            f.write(header)
            f.write(yaml_content)

        print(f"\n✓ Saved curated data to {OUTPUT_YAML}")
        print(f"  File size: {OUTPUT_YAML.stat().st_size / 1024:.1f} KB")

    def generate_statistics(self):
        """Print an enrichment statistics report."""
        print("\n" + "=" * 80)
        print("ENRICHMENT STATISTICS")
        print("=" * 80)

        total = len(self.institutions)

        def pct(count: int) -> float:
            # Guard against an empty institution list (avoids ZeroDivisionError).
            return count / total * 100 if total else 0.0

        # Count field completeness
        with_description = sum(
            1 for i in self.institutions if i.get('description'))
        with_addresses = sum(
            1 for i in self.institutions
            if i.get('locations')
            and any(l.get('street_address') for l in i['locations']))
        with_urls = sum(
            1 for i in self.institutions
            if i.get('identifiers') and len(i['identifiers']) > 1)
        with_platforms = sum(
            1 for i in self.institutions if i.get('digital_platforms'))
        with_collections = sum(
            1 for i in self.institutions if i.get('collections'))

        print(f"\nTotal institutions: {total}")
        print("\nField Completeness:")
        print(f"  Descriptions:       {with_description:3d} ({pct(with_description):5.1f}%)")
        print(f"  Street addresses:   {with_addresses:3d} ({pct(with_addresses):5.1f}%)")
        print(f"  Multiple IDs/URLs:  {with_urls:3d} ({pct(with_urls):5.1f}%)")
        print(f"  Digital platforms:  {with_platforms:3d} ({pct(with_platforms):5.1f}%)")
        print(f"  Collections:        {with_collections:3d} ({pct(with_collections):5.1f}%)")

        # Institution type breakdown
        print("\nInstitution Types:")
        types: Dict[str, int] = {}
        for inst in self.institutions:
            itype = inst.get('institution_type', 'UNKNOWN')
            types[itype] = types.get(itype, 0) + 1
        for itype in sorted(types.keys()):
            print(f"  {itype:20s} {types[itype]:3d} ({pct(types[itype]):5.1f}%)")

        print("\n" + "=" * 80)


def main():
    """Main enrichment workflow."""
    print("=" * 80)
    print("MEXICAN GLAM INSTITUTIONS - MANUAL CURATION")
    print("=" * 80)

    enricher = InstitutionEnricher()

    # Step 1: Load existing data
    enricher.load_existing_institutions()

    # Step 2: Parse markdown artifacts
    enricher.parse_markdown_artifacts()

    # Step 3: Enrich institutions
    enricher.enrich_institutions()

    # Step 4: Save curated YAML
    enricher.save_curated_yaml()

    # Step 5: Generate statistics
    enricher.generate_statistics()

    print("\n✅ CURATION COMPLETE!")
    print(f"\nOutput: {OUTPUT_YAML}")


if __name__ == "__main__":
    main()