- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
456 lines
18 KiB
Python
Executable file
456 lines
18 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Migration Script: Old Enrichment Format → Schema v0.2.2 EnrichmentHistoryEntry
|
|
|
|
Converts three different old enrichment formats to schema v0.2.2 compliant
|
|
enrichment_history structure.
|
|
|
|
OLD FORMATS:
|
|
1. Provenance flat fields (most common - 117 instances in Chile):
|
|
provenance:
|
|
enrichment_batch: 7
|
|
enrichment_method: SPARQL_BULK_QUERY
|
|
wikidata_verified: true
|
|
notes:
|
|
- "Batch 7: SPARQL match - exact name match"
|
|
|
|
2. Old enrichment_history (10 instances in Chile):
|
|
enrichment_history:
|
|
- enrichment_date: "2025-11-09T18:10:41.851904+00:00"
|
|
enrichment_method: "Chilean Batch 5 - University + museum Wikidata verification"
|
|
enrichment_batch: batch_5
|
|
q_number: Q3551323
|
|
verification: "Universidad Arturo Prat, public university..."
|
|
|
|
3. Unstructured notes (Tunisia, Algeria, Libya):
|
|
notes: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)..."
|
|
|
|
NEW FORMAT (schema v0.2.2):
|
|
enrichment_history:
|
|
- enrichment_date: "2025-11-10T14:30:00+00:00"
|
|
enrichment_type: WIKIDATA_IDENTIFIER
|
|
enrichment_method: "Wikidata SPARQL query with fuzzy matching"
|
|
match_score: 0.84
|
|
verified: false
|
|
enrichment_source: "https://www.wikidata.org"
|
|
enrichment_notes: "Matched to '...' (Q549445)"
|
|
|
|
DATASETS TO MIGRATE:
|
|
- Chile: 90 institutions (71 with Wikidata)
|
|
- Tunisia: 68 institutions
|
|
- Algeria: 19 institutions
|
|
- Libya: 24 institutions
|
|
Total: 201 institutions
|
|
|
|
USAGE:
|
|
python scripts/migrate_to_schema_v0.2.2_enrichment.py --dry-run
|
|
python scripts/migrate_to_schema_v0.2.2_enrichment.py --apply
|
|
"""
|
|
|
|
import argparse
|
|
import re
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
from dataclasses import dataclass
|
|
|
|
|
|
@dataclass
class EnrichmentEntry:
    """Schema v0.2.2 compliant enrichment history entry"""
    # ISO-8601 timestamp of when the enrichment was performed.
    enrichment_date: str
    # Enrichment category, e.g. 'WIKIDATA_IDENTIFIER'.
    enrichment_type: str
    # Human-readable description of the enrichment technique used.
    enrichment_method: str
    # Match confidence in [0.0, 1.0]; None when unknown.
    match_score: Optional[float] = None
    # Whether the match was manually/explicitly verified.
    verified: bool = False
    # URL of the data source, e.g. 'https://www.wikidata.org'.
    enrichment_source: Optional[str] = None
    # Free-text note, e.g. "Matched to '...' (Q549445)".
    enrichment_notes: Optional[str] = None
|
|
|
|
|
|
class EnrichmentMigrator:
|
|
"""Migrates old enrichment formats to schema v0.2.2"""
|
|
|
|
# Regex patterns for parsing unstructured notes
|
|
WIKIDATA_NOTE_PATTERN = re.compile(
|
|
r'Wikidata enriched (\d{4}-\d{2}-\d{2})\s*\((Q\d+),\s*match:\s*(\d+)%\)'
|
|
)
|
|
BATCH_NOTE_PATTERN = re.compile(
|
|
r'Batch (\d+):\s*SPARQL match\s*-\s*(.+)'
|
|
)
|
|
|
|
def __init__(self, dry_run: bool = True):
|
|
self.dry_run = dry_run
|
|
self.stats = {
|
|
'processed': 0,
|
|
'migrated': 0,
|
|
'skipped': 0,
|
|
'errors': 0
|
|
}
|
|
|
|
def migrate_institution(self, inst: Dict[str, Any]) -> bool:
|
|
"""
|
|
Migrate a single institution's enrichment data.
|
|
|
|
Returns True if migration was performed, False if skipped.
|
|
"""
|
|
if 'provenance' not in inst:
|
|
return False
|
|
|
|
prov = inst['provenance']
|
|
|
|
# Check if already has new format enrichment_history
|
|
if 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
|
|
if prov['enrichment_history'] and 'enrichment_type' in prov['enrichment_history'][0]:
|
|
# Already in new format
|
|
return False
|
|
|
|
# Initialize new enrichment_history list
|
|
new_history: List[EnrichmentEntry] = []
|
|
|
|
# Migration Path 1: Flat provenance fields (enrichment_batch, wikidata_verified)
|
|
if 'enrichment_batch' in prov or 'wikidata_verified' in prov:
|
|
entries = self._migrate_flat_provenance(prov, inst)
|
|
new_history.extend(entries)
|
|
|
|
# Migration Path 2: Old enrichment_history format
|
|
elif 'enrichment_history' in prov and isinstance(prov['enrichment_history'], list):
|
|
if prov['enrichment_history'] and 'q_number' in prov['enrichment_history'][0]:
|
|
entries = self._migrate_old_enrichment_history(prov['enrichment_history'])
|
|
new_history.extend(entries)
|
|
|
|
# Migration Path 3: Standalone enrichment_method field (later batches)
|
|
# NOTE: enrichment_date is optional - we use extraction_date as fallback
|
|
elif 'enrichment_method' in prov:
|
|
entries = self._migrate_standalone_enrichment(prov, inst)
|
|
new_history.extend(entries)
|
|
|
|
# Migration Path 4: Parse unstructured notes
|
|
if 'notes' in prov and not new_history:
|
|
entries = self._parse_notes(prov['notes'])
|
|
new_history.extend(entries)
|
|
|
|
# Apply migration if entries were created
|
|
if new_history:
|
|
# Remove old fields
|
|
old_fields = ['enrichment_batch', 'enrichment_method', 'enrichment_confidence',
|
|
'wikidata_verified', 'wikidata_match_confidence', 'enrichment_date', 'notes']
|
|
for field in old_fields:
|
|
prov.pop(field, None)
|
|
|
|
# Add new enrichment_history (convert EnrichmentEntry objects to dicts)
|
|
prov['enrichment_history'] = [self._entry_to_dict(e) for e in new_history]
|
|
|
|
return True
|
|
|
|
return False
|
|
|
|
def _migrate_flat_provenance(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List[EnrichmentEntry]:
|
|
"""Migrate flat provenance fields to structured history"""
|
|
entries = []
|
|
|
|
# Extract Wikidata Q-number from identifiers
|
|
q_number = None
|
|
if 'identifiers' in inst:
|
|
for ident in inst['identifiers']:
|
|
if ident.get('identifier_scheme') == 'Wikidata':
|
|
q_number = ident.get('identifier_value')
|
|
break
|
|
|
|
# Parse enrichment date from notes or use extraction_date
|
|
enrichment_date = prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
|
|
|
|
# Parse match confidence from notes
|
|
match_score = None
|
|
enrichment_notes = None
|
|
|
|
if 'notes' in prov:
|
|
notes_text = prov['notes']
|
|
if isinstance(notes_text, list):
|
|
notes_text = ' '.join(str(n) for n in notes_text)
|
|
|
|
# Extract batch and match type
|
|
batch_match = self.BATCH_NOTE_PATTERN.search(notes_text)
|
|
if batch_match:
|
|
batch_num = batch_match.group(1)
|
|
match_type = batch_match.group(2)
|
|
enrichment_notes = f"Batch {batch_num}: {match_type}"
|
|
|
|
# Infer match score from match type
|
|
if 'exact name match' in match_type:
|
|
match_score = 1.0
|
|
elif 'partial name' in match_type:
|
|
match_score = 0.85
|
|
elif 'includes full' in match_type:
|
|
match_score = 0.9
|
|
|
|
# Create Wikidata enrichment entry
|
|
if q_number:
|
|
entry = EnrichmentEntry(
|
|
enrichment_date=enrichment_date,
|
|
enrichment_type='WIKIDATA_IDENTIFIER',
|
|
enrichment_method=prov.get('enrichment_method', 'Wikidata SPARQL bulk query'),
|
|
match_score=match_score,
|
|
verified=prov.get('wikidata_verified', False),
|
|
enrichment_source='https://www.wikidata.org',
|
|
enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
|
|
)
|
|
entries.append(entry)
|
|
|
|
return entries
|
|
|
|
def _migrate_old_enrichment_history(self, old_history: List[Dict[str, Any]]) -> List[EnrichmentEntry]:
|
|
"""Migrate old enrichment_history format to new schema"""
|
|
entries = []
|
|
|
|
for old_entry in old_history:
|
|
q_number = old_entry.get('q_number')
|
|
verification = old_entry.get('verification', '')
|
|
|
|
entry = EnrichmentEntry(
|
|
enrichment_date=old_entry.get('enrichment_date', datetime.now(timezone.utc).isoformat()),
|
|
enrichment_type='WIKIDATA_IDENTIFIER',
|
|
enrichment_method=old_entry.get('enrichment_method', 'Wikidata verification'),
|
|
match_score=0.95, # Manual verification implies high confidence
|
|
verified=True, # Old enrichment_history was manually verified
|
|
enrichment_source='https://www.wikidata.org',
|
|
enrichment_notes=f"Matched to {verification} ({q_number})" if q_number else verification
|
|
)
|
|
entries.append(entry)
|
|
|
|
return entries
|
|
|
|
def _migrate_standalone_enrichment(self, prov: Dict[str, Any], inst: Dict[str, Any]) -> List[EnrichmentEntry]:
|
|
"""Migrate standalone enrichment_method field (later batches)"""
|
|
entries = []
|
|
|
|
# Extract Wikidata Q-number from identifiers
|
|
q_number = None
|
|
if 'identifiers' in inst:
|
|
for ident in inst['identifiers']:
|
|
if ident.get('identifier_scheme') == 'Wikidata':
|
|
q_number = ident.get('identifier_value')
|
|
break
|
|
|
|
if not q_number:
|
|
return entries
|
|
|
|
# Parse match confidence
|
|
match_confidence = prov.get('wikidata_match_confidence', 'unknown')
|
|
match_score = None
|
|
if match_confidence == 'high':
|
|
match_score = 0.95
|
|
elif match_confidence == 'partial':
|
|
match_score = 0.80
|
|
elif match_confidence == 'medium':
|
|
match_score = 0.75
|
|
|
|
# Determine if verified (high confidence = verified)
|
|
verified = match_confidence == 'high'
|
|
|
|
# Extract enrichment notes from notes field
|
|
enrichment_notes = None
|
|
if 'notes' in prov:
|
|
notes = prov['notes']
|
|
if isinstance(notes, list):
|
|
enrichment_notes = ' '.join(str(n) for n in notes)
|
|
else:
|
|
enrichment_notes = str(notes)
|
|
|
|
# Use enrichment_date if available, otherwise fall back to extraction_date
|
|
enrichment_date = prov.get('enrichment_date') or prov.get('extraction_date', datetime.now(timezone.utc).isoformat())
|
|
|
|
# Create entry
|
|
entry = EnrichmentEntry(
|
|
enrichment_date=enrichment_date,
|
|
enrichment_type='WIKIDATA_IDENTIFIER',
|
|
enrichment_method=prov.get('enrichment_method', 'Wikidata enrichment'),
|
|
match_score=match_score,
|
|
verified=verified,
|
|
enrichment_source='https://www.wikidata.org',
|
|
enrichment_notes=enrichment_notes or f"Matched to Wikidata entity {q_number}"
|
|
)
|
|
entries.append(entry)
|
|
|
|
return entries
|
|
|
|
def _parse_notes(self, notes: Any) -> List[EnrichmentEntry]:
|
|
"""Parse unstructured notes for enrichment information"""
|
|
entries = []
|
|
|
|
if not notes:
|
|
return entries
|
|
|
|
notes_text = notes if isinstance(notes, str) else ' '.join(str(n) for n in notes)
|
|
|
|
# Pattern: "Wikidata enriched 2025-11-10 (Q549445, match: 84%)"
|
|
match = self.WIKIDATA_NOTE_PATTERN.search(notes_text)
|
|
if match:
|
|
date_str, q_number, match_pct = match.groups()
|
|
|
|
entry = EnrichmentEntry(
|
|
enrichment_date=f"{date_str}T12:00:00+00:00", # Assume midday UTC
|
|
enrichment_type='WIKIDATA_IDENTIFIER',
|
|
enrichment_method='Wikidata SPARQL query with fuzzy matching',
|
|
match_score=int(match_pct) / 100.0,
|
|
verified=False,
|
|
enrichment_source='https://www.wikidata.org',
|
|
enrichment_notes=f"Matched to Wikidata entity {q_number}"
|
|
)
|
|
entries.append(entry)
|
|
|
|
return entries
|
|
|
|
def _entry_to_dict(self, entry: EnrichmentEntry) -> Dict[str, Any]:
|
|
"""Convert EnrichmentEntry to dict, omitting None values"""
|
|
return {
|
|
k: v for k, v in {
|
|
'enrichment_date': entry.enrichment_date,
|
|
'enrichment_type': entry.enrichment_type,
|
|
'enrichment_method': entry.enrichment_method,
|
|
'match_score': entry.match_score,
|
|
'verified': entry.verified,
|
|
'enrichment_source': entry.enrichment_source,
|
|
'enrichment_notes': entry.enrichment_notes
|
|
}.items() if v is not None
|
|
}
|
|
|
|
def migrate_file(self, input_path: Path, output_path: Optional[Path] = None) -> None:
|
|
"""Migrate all institutions in a YAML file"""
|
|
print(f"\n{'[DRY RUN] ' if self.dry_run else ''}Processing: {input_path}")
|
|
|
|
# Read file
|
|
with open(input_path, 'r', encoding='utf-8') as f:
|
|
data = yaml.safe_load(f)
|
|
|
|
# Handle different file structures
|
|
institutions = []
|
|
metadata = None
|
|
|
|
if isinstance(data, dict) and 'institutions' in data:
|
|
# Tunisia format with metadata
|
|
institutions = data['institutions']
|
|
metadata = {k: v for k, v in data.items() if k != 'institutions'}
|
|
elif isinstance(data, list):
|
|
# Direct list format
|
|
institutions = data
|
|
else:
|
|
print(f" ⚠️ Unknown file structure, skipping")
|
|
self.stats['errors'] += 1
|
|
return
|
|
|
|
# Migrate each institution
|
|
migrated_count = 0
|
|
for inst in institutions:
|
|
self.stats['processed'] += 1
|
|
|
|
if self.migrate_institution(inst):
|
|
migrated_count += 1
|
|
self.stats['migrated'] += 1
|
|
|
|
# Show example
|
|
if migrated_count == 1:
|
|
print(f"\n ✓ Example migration:")
|
|
print(f" Institution: {inst.get('name', 'Unknown')}")
|
|
if 'enrichment_history' in inst['provenance']:
|
|
print(f" New enrichment_history entries: {len(inst['provenance']['enrichment_history'])}")
|
|
else:
|
|
self.stats['skipped'] += 1
|
|
|
|
print(f"\n Processed: {len(institutions)} institutions")
|
|
print(f" Migrated: {migrated_count}")
|
|
print(f" Skipped: {len(institutions) - migrated_count}")
|
|
|
|
# Write output
|
|
if not self.dry_run:
|
|
output_path = output_path or input_path
|
|
|
|
# Create backup
|
|
backup_path = input_path.with_suffix('.yaml.pre_v0.2.2_backup')
|
|
if not backup_path.exists():
|
|
import shutil
|
|
shutil.copy2(input_path, backup_path)
|
|
print(f" 📦 Backup created: {backup_path.name}")
|
|
|
|
# Write migrated data
|
|
with open(output_path, 'w', encoding='utf-8') as f:
|
|
if metadata:
|
|
# Update metadata
|
|
if '_metadata' in metadata:
|
|
metadata['_metadata']['schema_version'] = '0.2.2'
|
|
metadata['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
|
|
if 'enhancements' not in metadata['_metadata']:
|
|
metadata['_metadata']['enhancements'] = []
|
|
metadata['_metadata']['enhancements'].append('Schema v0.2.2 enrichment_history migration')
|
|
|
|
output_data = {**metadata, 'institutions': institutions}
|
|
else:
|
|
output_data = institutions
|
|
|
|
yaml.dump(output_data, f, allow_unicode=True, sort_keys=False, width=120)
|
|
|
|
print(f" ✅ Written: {output_path}")
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, run the migration, print a summary."""
    arg_parser = argparse.ArgumentParser(
        description='Migrate old enrichment formats to schema v0.2.2',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    arg_parser.add_argument('--dry-run', action='store_true', default=True,
                            help='Preview changes without writing (default)')
    arg_parser.add_argument('--apply', action='store_true',
                            help='Apply changes and write files')
    arg_parser.add_argument('--files', nargs='+',
                            help='Specific files to migrate (default: all datasets)')
    options = arg_parser.parse_args()

    # Built-in dataset locations, used unless --files overrides them.
    instances_dir = Path(__file__).parent.parent / 'data' / 'instances'
    dataset_files = [
        instances_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        instances_dir / 'tunisia' / 'tunisian_institutions_enhanced.yaml',
        instances_dir / 'algeria' / 'algerian_institutions.yaml',
        instances_dir / 'libya' / 'libyan_institutions.yaml'
    ]

    targets = [Path(f) for f in options.files] if options.files else dataset_files

    # --apply wins; the (default-on) --dry-run flag is informational only.
    migrator = EnrichmentMigrator(dry_run=not options.apply)

    separator = "=" * 80
    print(separator)
    print("SCHEMA v0.2.2 ENRICHMENT MIGRATION")
    print(separator)
    mode_label = 'DRY RUN (preview only)' if migrator.dry_run else 'APPLY CHANGES'
    print(f"Mode: {mode_label}")
    print(f"Files: {len(targets)}")

    # Migrate each target, warning about (but not failing on) missing files.
    for target in targets:
        if target.exists():
            migrator.migrate_file(target)
        else:
            print(f"\n⚠️ File not found: {target}")

    # Summary
    print("\n" + separator)
    print("MIGRATION SUMMARY")
    print(separator)
    print(f"Total institutions processed: {migrator.stats['processed']}")
    print(f"Migrated to v0.2.2: {migrator.stats['migrated']}")
    print(f"Skipped (already migrated or no enrichment): {migrator.stats['skipped']}")
    print(f"Errors: {migrator.stats['errors']}")

    if migrator.dry_run:
        print("\n⚠️ DRY RUN MODE - No files were modified")
        print("Run with --apply to write changes")
    else:
        print("\n✅ Migration completed")
        print("Backups created with .pre_v0.2.2_backup extension")
|
|
|
|
|
|
# Script entry point: only run the migration when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|