#!/usr/bin/env python3
"""
Clean and enrich Archives Lab extraction with:
1. Name cleanup (remove markdown artifacts)
2. LLM-based entity extraction using GLAM-NER types
3. Additional relationship types
4. Cross-referencing with Palestinian GLAM data

Usage:
    PYTHONPATH=src python scripts/clean_and_enrich_archiveslab.py
"""

import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import Any, Dict, List, Optional, Set, Tuple

# =============================================================================
# Configuration
# =============================================================================

ARCHIVESLAB_DIR = Path('/Users/kempersc/apps/glam/data/extracted/archiveslab')
PALESTINIAN_CLAIMS = Path('/Users/kempersc/apps/glam/data/extracted/palestinian_glam_claims.json')

# Debug: print path
print(f"Looking for Palestinian claims at: {PALESTINIAN_CLAIMS}")
print(f"File exists: {PALESTINIAN_CLAIMS.exists()}")

OUTPUT_FILE = ARCHIVESLAB_DIR / 'archiveslab_claims_enriched.json'

# =============================================================================
# Data Classes
# =============================================================================


@dataclass
class EnrichedProvenance:
    """Enhanced provenance tracking.

    Records where a claim came from (URL + extraction method/date) plus,
    optionally, when and how it was enriched by this script.
    """
    source_url: str
    extraction_method: str
    extraction_date: str
    confidence: float = 0.85
    enrichment_date: Optional[str] = None
    enrichment_method: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        # Drop None-valued fields so the serialized JSON stays compact.
        return {k: v for k, v in asdict(self).items() if v is not None}


@dataclass
class EnrichedEntityClaim:
    """Enhanced entity claim with GLAM-NER types.

    Carries both the raw extracted name and its cleaned form, the mapped
    GLAM-NER hypernym, and optional context/affiliation/role enrichments.
    """
    entity_id: str
    entity_type: str  # GLAM-NER hypernym (e.g., GRP.HER, AGT.PER)
    name: str
    clean_name: str  # Cleaned version of name
    context: Optional[str] = None
    provenance: Optional[EnrichedProvenance] = None
    metadata: Optional[Dict[str, Any]] = None
    affiliations: Optional[List[str]] = None  # Organizations person is affiliated with
    roles: Optional[List[str]] = None  # Roles/titles

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, omitting empty/None optional fields."""
        result = {
            'entity_id': self.entity_id,
            'entity_type': self.entity_type,
            'name': self.name,
            'clean_name': self.clean_name,
        }
        if self.context:
            result['context'] = self.context
        if self.provenance:
            result['provenance'] = self.provenance.to_dict()
        if self.metadata:
            result['metadata'] = self.metadata
        if self.affiliations:
            result['affiliations'] = self.affiliations
        if self.roles:
            result['roles'] = self.roles
        return result


@dataclass
class EnrichedTriple:
    """Enhanced triple with more relationship types."""
    subject: str
    predicate: str
    object: str
    provenance: Optional[EnrichedProvenance] = None
    confidence: float = 0.85

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a dict, omitting provenance when absent."""
        result = {
            'subject': self.subject,
            'predicate': self.predicate,
            'object': self.object,
            'confidence': self.confidence,
        }
        if self.provenance:
            result['provenance'] = self.provenance.to_dict()
        return result


# =============================================================================
# Name Cleanup Functions
# =============================================================================


def clean_entity_name(name: str) -> str:
    """
    Clean entity name by removing markdown artifacts.

    Examples:
        "Mahmoud Balaawy\n\nTitle" -> "Mahmoud Balaawy"
        "Ana Roeschley \n\n \n\nTitle" -> "Ana Roeschley"
        "Biographies\n\nAbdallah Omar" -> "Abdallah Omar"
    """
    # Remove "Title:" and variations
    name = re.sub(r'\s*\n+\s*Title\s*:?\s*', '', name)
    # Remove "Abstract:" and variations
    name = re.sub(r'\s*\n+\s*Abstract\s*:?\s*', '', name)
    # Remove "Biographies" prefix
    name = re.sub(r'^Biographies\s*\n+\s*', '', name)
    # Remove "Speaker:" prefix
    name = re.sub(r'^Speakers?\s*:\s*', '', name)
    # Remove "Moderator:" prefix
    name = re.sub(r'^Moderators?\s*:\s*', '', name)
    # Remove "Keynote Speaker:" prefix
    name = re.sub(r'^Keynote\s+Speakers?\s*:\s*', '', name)
    # Clean up multiple newlines and whitespace
    name = re.sub(r'\s*\n+\s*', ' ', name)
    name = re.sub(r'\s+', ' ', name)
    # Remove leading/trailing whitespace
    name = name.strip()
    # Remove trailing punctuation together with any whitespace preceding it,
    # so "John Doe ," cleans to "John Doe" rather than "John Doe " (the old
    # [,;:]+$ pattern left a dangling space behind).
    name = re.sub(r'[\s,;:]+$', '', name)
    return name


def extract_title_from_context(context: str) -> Optional[str]:
    """Extract academic/professional title from context.

    Returns the title without its trailing period (e.g. "Dr"), or None
    when no title is found or context is empty.
    """
    if not context:
        return None
    # Look for Dr., Prof., Mr., Ms., etc.
    title_match = re.search(r'\b(Dr\.|Prof\.|Mr\.|Ms\.|Mrs\.|Eng\.)\s', context)
    if title_match:
        return title_match.group(1).rstrip('.')
    return None


def extract_affiliation_from_context(context: str) -> List[str]:
    """Extract organizational affiliations from context.

    Returns a deduplicated list of affiliation names in first-seen order
    (deterministic, so repeated runs produce byte-identical output JSON).
    """
    affiliations = []
    if not context:
        return affiliations

    # Common affiliation patterns
    patterns = [
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library|Center|Centre|Lab|Laboratory|Association|Council))',
        r'(?:at|from|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:of\s+[A-Z][a-zA-Z\s]+))',
        r'([A-Z][a-zA-Z\s]+Municipality)',
    ]
    for pattern in patterns:
        matches = re.findall(pattern, context)
        # Very short matches are almost always regex noise, not real orgs.
        affiliations.extend([m.strip() for m in matches if len(m.strip()) > 5])

    # dict.fromkeys preserves insertion order while deduplicating
    # (list(set(...)) gave a nondeterministic order).
    return list(dict.fromkeys(affiliations))


# =============================================================================
# GLAM-NER Type Mapping
# =============================================================================


def map_to_glam_ner_type(entity_type: str, name: str, context: str = "") -> str:
    """
    Map simple entity types to GLAM-NER hypernyms.

    GLAM-NER v1.7.0 Entity Hypernyms:
    - AGT.PER: Person
    - AGT.STF: Staff member
    - GRP.HER: Heritage institution
    - GRP.EDU: Educational institution
    - GRP.ASS: Association/society
    - GRP.GOV: Government agency
    - TOP.SET: Settlement (city)
    - TOP.CTY: Country

    Unknown entity types are returned unchanged.
    """
    name_lower = name.lower()
    context_lower = context.lower() if context else ""

    if entity_type == "PER":
        # Check if this is a staff member (has role context)
        if any(term in context_lower for term in ['director', 'professor', 'curator', 'archivist', 'librarian', 'officer']):
            return "AGT.STF"
        return "AGT.PER"

    elif entity_type == "ORG":
        # Heritage institutions
        if any(term in name_lower for term in ['archive', 'museum', 'library', 'heritage', 'memorial']):
            return "GRP.HER"
        # Educational institutions
        if any(term in name_lower for term in ['university', 'college', 'school', 'institute', 'academic']):
            return "GRP.EDU"
        # Associations and societies
        if any(term in name_lower for term in ['association', 'society', 'council', 'committee', 'group', 'network']):
            return "GRP.ASS"
        # Government
        if any(term in name_lower for term in ['municipality', 'government', 'ministry', 'national']):
            return "GRP.GOV"
        return "GRP"

    elif entity_type == "LOC":
        # Countries
        if name in ['Palestine', 'Lebanon', 'Gaza', 'Israel', 'Jordan', 'Syria', 'Egypt', 'Algeria', 'Malta', 'Ghana']:
            return "TOP.CTY"
        # Cities
        if any(term in name_lower for term in ['city', 'town', 'village']) or name in [
            'Jerusalem', 'Ramallah', 'Hebron', 'Nablus', 'Bethlehem', 'Tulkarm',
            'Beirut', 'New York', 'Amsterdam', 'London'
        ]:
            return "TOP.SET"
        return "TOP"

    elif entity_type == "URL":
        return "APP.URL"
    elif entity_type == "EMAIL":
        return "APP.EML"

    return entity_type


# =============================================================================
# Relationship Extraction
# =============================================================================


def extract_relationships(entities: List[Dict], context_map: Dict[str, str]) -> List[EnrichedTriple]:
    """
    Extract additional relationship types from entities and context.

    Relationship types:
    - REL.EVT.SPEAKS_AT: Person speaks at event
    - REL.EVT.MODERATES: Person moderates panel
    - REL.EVT.PARTICIPATES: Organization participates in event
    - REL.SOC.MEMBER_OF: Person is member of organization
    - REL.SOC.AFFILIATED_WITH: Person affiliated with institution
    - REL.SOC.WORKS_AT: Person works at institution
    - REL.SPA.LOCATED_IN: Entity located in place
    - REL.ONT.ISA: Entity is instance of type
    """
    triples = []
    event_name = "Resilient Communities Resilient Archives"
    provenance = EnrichedProvenance(
        source_url="https://www.archiveslab.org/events/resilient-communities-resilient-archives/english-program-resilient-communities-resilient-archives",
        extraction_method="Enriched extraction with relationship inference",
        extraction_date=datetime.now(timezone.utc).isoformat(),
        confidence=0.85
    )

    # Group entities by type (GLAM-NER hypernym prefixes)
    persons = [e for e in entities if e['entity_type'].startswith('AGT')]
    organizations = [e for e in entities if e['entity_type'].startswith('GRP')]
    locations = [e for e in entities if e['entity_type'].startswith('TOP')]

    # Person relationships
    for person in persons:
        name = person['clean_name']
        context = context_map.get(name, "")
        # Lowercase once per person instead of once per keyword check.
        context_lower = context.lower()

        # Speaking at event
        if 'speaker' in context_lower or 'keynote' in context_lower:
            triples.append(EnrichedTriple(
                subject=name,
                predicate="REL.EVT.SPEAKS_AT",
                object=event_name,
                provenance=provenance,
                confidence=0.9
            ))

        # Moderating panel
        if 'moderator' in context_lower:
            triples.append(EnrichedTriple(
                subject=name,
                predicate="REL.EVT.MODERATES",
                object=event_name,
                provenance=provenance,
                confidence=0.9
            ))

        # Affiliations
        affiliations = person.get('affiliations', [])
        for affiliation in affiliations:
            triples.append(EnrichedTriple(
                subject=name,
                predicate="REL.SOC.AFFILIATED_WITH",
                object=affiliation,
                provenance=provenance,
                confidence=0.75
            ))

        # Work relationships (from context)
        work_patterns = [
            (r'(?:at|with)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+(?:University|Institute|Archive|Museum|Library))', "REL.SOC.WORKS_AT"),
            (r'(?:fellow|researcher|professor|director)\s+(?:at|of)\s+(?:the\s+)?([A-Z][a-zA-Z\s]+)', "REL.SOC.WORKS_AT"),
        ]
        for pattern, predicate in work_patterns:
            matches = re.findall(pattern, context)
            for match in matches:
                if len(match.strip()) > 5:
                    triples.append(EnrichedTriple(
                        subject=name,
                        predicate=predicate,
                        object=match.strip(),
                        provenance=provenance,
                        confidence=0.7
                    ))

    # Organization relationships
    for org in organizations:
        name = org['clean_name']

        # Participating in event
        triples.append(EnrichedTriple(
            subject=name,
            predicate="REL.EVT.PARTICIPATES",
            object=event_name,
            provenance=provenance,
            confidence=0.85
        ))

        # Type classification
        org_type = org['entity_type']
        type_labels = {
            'GRP.HER': 'Heritage Institution',
            'GRP.EDU': 'Educational Institution',
            'GRP.ASS': 'Association',
            'GRP.GOV': 'Government Agency',
            'GRP': 'Organization'
        }
        if org_type in type_labels:
            triples.append(EnrichedTriple(
                subject=name,
                predicate="REL.ONT.ISA",
                object=type_labels[org_type],
                provenance=provenance,
                confidence=0.9
            ))

    # Location relationships
    for loc in locations:
        name = loc['clean_name']

        # Event location
        if name in ['Palestine', 'Lebanon', 'Gaza', 'Jerusalem', 'Ramallah']:
            triples.append(EnrichedTriple(
                subject=event_name,
                predicate="REL.SPA.FOCUS_REGION",
                object=name,
                provenance=provenance,
                confidence=0.9
            ))

    return triples


# =============================================================================
# Cross-Reference with Palestinian GLAM Data
# =============================================================================


def load_palestinian_institutions() -> Dict[str, Dict]:
    """Load Palestinian GLAM institutions for cross-referencing.

    Returns a mapping of lowercased institution name -> claim dict, or an
    empty dict when the claims file is missing.
    """
    if not PALESTINIAN_CLAIMS.exists():
        print(f" Palestinian claims file not found: {PALESTINIAN_CLAIMS}")
        return {}

    with open(PALESTINIAN_CLAIMS, 'r', encoding='utf-8') as f:
        data = json.load(f)

    institutions = {}
    for claim in data.get('entity_claims', []):
        # Palestinian claims use 'text' not 'name'
        name = claim.get('text', '') or claim.get('name', '')
        if name:
            institutions[name.lower()] = claim

    print(f" Loaded {len(institutions)} Palestinian institutions")
    return institutions


def find_cross_references(entities: List[Dict], palestinian_institutions: Dict[str, Dict]) -> List[Dict]:
    """Find entities that match Palestinian GLAM institutions.

    Tries an exact (case-insensitive) match first, then a substring
    ("partial") match in either direction; at most one reference is
    produced per entity.
    """
    cross_refs = []
    for entity in entities:
        name = entity.get('clean_name', '').lower()

        # Direct match
        if name in palestinian_institutions:
            cross_refs.append({
                'archiveslab_entity': entity['clean_name'],
                'palestinian_match': palestinian_institutions[name],
                'match_type': 'exact'
            })
            continue

        # Partial match (check if entity name contains or is contained in institution name)
        for inst_name, inst_data in palestinian_institutions.items():
            # len > 5 avoids spurious matches on very short names.
            if len(name) > 5 and (name in inst_name or inst_name in name):
                cross_refs.append({
                    'archiveslab_entity': entity['clean_name'],
                    'palestinian_match': inst_data,
                    'match_type': 'partial'
                })
                break
    return cross_refs


# =============================================================================
# Main Processing
# =============================================================================


def process_claims():
    """Main processing function.

    Pipeline: load raw claims -> clean/dedupe names -> map GLAM-NER types ->
    extract relationships -> cross-reference Palestinian GLAM data -> save.
    """
    # Load original claims
    claims_file = ARCHIVESLAB_DIR / 'archiveslab_claims.json'
    with open(claims_file, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    print("=" * 60)
    print("ARCHIVES LAB CLAIMS ENRICHMENT")
    print("=" * 60)
    print(f"\nOriginal entities: {len(original_data['entity_claims'])}")
    print(f"Original triples: {len(original_data['triples'])}")

    # Step 1: Clean entity names
    print("\n--- Step 1: Cleaning entity names ---")
    cleaned_entities = []
    context_map = {}

    for claim in original_data['entity_claims']:
        original_name = claim['name']
        clean_name = clean_entity_name(original_name)
        context = claim.get('context', '')

        # Skip if name is too short after cleaning
        if len(clean_name) < 3:
            continue
        # Skip duplicate clean names
        if clean_name in context_map:
            continue
        context_map[clean_name] = context

        # Map to GLAM-NER type
        glam_type = map_to_glam_ner_type(claim['entity_type'], clean_name, context)

        # Extract affiliations and roles
        affiliations = extract_affiliation_from_context(context)
        title = extract_title_from_context(context)
        roles = [title] if title else []

        enriched_claim = EnrichedEntityClaim(
            entity_id=claim['entity_id'],
            entity_type=glam_type,
            name=original_name,
            clean_name=clean_name,
            context=context[:200] if context else None,
            provenance=EnrichedProvenance(
                source_url=original_data['source_url'],
                extraction_method="Regex + Playwright",
                extraction_date=claim['provenance']['extraction_date'],
                confidence=claim['provenance']['confidence'],
                enrichment_date=datetime.now(timezone.utc).isoformat(),
                enrichment_method="GLAM-NER type mapping + name cleanup"
            ),
            affiliations=affiliations if affiliations else None,
            roles=roles if roles else None
        )
        cleaned_entities.append(enriched_claim.to_dict())

    print(f"Cleaned entities: {len(cleaned_entities)} (removed {len(original_data['entity_claims']) - len(cleaned_entities)} duplicates/artifacts)")

    # Show sample cleanups
    print("\nSample name cleanups:")
    samples = [e for e in cleaned_entities if e['name'] != e['clean_name']][:5]
    for s in samples:
        print(f" '{s['name'][:40]}...' -> '{s['clean_name']}'")

    # Step 2: Map to GLAM-NER types
    print("\n--- Step 2: GLAM-NER type distribution ---")
    type_counts = {}
    for e in cleaned_entities:
        t = e['entity_type']
        type_counts[t] = type_counts.get(t, 0) + 1
    for t, count in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t}: {count}")

    # Step 3: Extract relationships
    print("\n--- Step 3: Extracting enriched relationships ---")
    enriched_triples = extract_relationships(cleaned_entities, context_map)
    print(f"Generated {len(enriched_triples)} enriched triples")

    # Relationship type breakdown
    rel_counts = {}
    for t in enriched_triples:
        p = t.predicate
        rel_counts[p] = rel_counts.get(p, 0) + 1
    print("\nRelationship types:")
    for p, count in sorted(rel_counts.items(), key=lambda x: -x[1]):
        print(f" {p}: {count}")

    # Step 4: Cross-reference with Palestinian GLAM data
    print("\n--- Step 4: Cross-referencing with Palestinian GLAM data ---")
    palestinian_institutions = load_palestinian_institutions()
    if palestinian_institutions:
        cross_refs = find_cross_references(cleaned_entities, palestinian_institutions)
        print(f"Found {len(cross_refs)} cross-references with Palestinian GLAM institutions")
        if cross_refs:
            print("\nCross-references found:")
            for ref in cross_refs[:10]:
                match_name = ref['palestinian_match'].get('name', 'Unknown')
                print(f" {ref['archiveslab_entity']} <-> {match_name} ({ref['match_type']})")
    else:
        cross_refs = []
        print("Palestinian GLAM data not found, skipping cross-referencing")

    # Save enriched data
    print("\n--- Saving enriched data ---")
    enriched_data = {
        'source_url': original_data['source_url'],
        'fetch_timestamp': original_data['fetch_timestamp'],
        'extraction_timestamp': original_data['extraction_timestamp'],
        'enrichment_timestamp': datetime.now(timezone.utc).isoformat(),
        'entity_claims': cleaned_entities,
        'triples': [t.to_dict() for t in enriched_triples],
        'cross_references': cross_refs,
        'statistics': {
            'original_entities': len(original_data['entity_claims']),
            'cleaned_entities': len(cleaned_entities),
            'original_triples': len(original_data['triples']),
            'enriched_triples': len(enriched_triples),
            'cross_references': len(cross_refs),
            'entity_types': type_counts,
            'relationship_types': rel_counts
        }
    }
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(enriched_data, f, indent=2, ensure_ascii=False)
    print(f"Saved: {OUTPUT_FILE}")

    print("\n" + "=" * 60)
    print("ENRICHMENT COMPLETE")
    print("=" * 60)


if __name__ == '__main__':
    process_claims()