glam/scripts/enrich_chilean_batch20_v0.2.2_test.py.bak
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

413 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Chilean GLAM Institutions - Batch 20 Wikidata Enrichment
FIRST PRODUCTION USE OF SCHEMA v0.2.2 enrichment_history
This script demonstrates the NEW structured enrichment tracking:
- Uses Provenance.enrichment_history (list of EnrichmentHistoryEntry)
- Replaces unstructured provenance.notes with queryable metadata
- Tracks enrichment_type, match_score, verified status
- Aligns with PROV-O, ADMS, and Dublin Core ontologies
Target: 19 unenriched institutions → 79% coverage goal
Schema: v0.2.2 (enrichment_history structure)
"""
import yaml
import time
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Optional
from SPARQLWrapper import SPARQLWrapper, JSON
# =============================================================================
# CONFIGURATION
# =============================================================================
INPUT_FILE = Path('data/instances/chile/chilean_institutions_batch19_enriched.yaml')
OUTPUT_FILE = Path('data/instances/chile/chilean_institutions_batch20_enriched.yaml')

WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

# Fuzzy match threshold (0.0-1.0): Wikidata candidates scoring below this
# against the local institution name are skipped.
MIN_MATCH_SCORE = 0.75

# TEST MODE: Set to a number to limit enrichment (e.g., 3 for testing)
# Set to None to process all unenriched institutions
# NOTE: this and QUERY_DELAY were previously defined twice (copy-paste
# duplication); a single definition is kept.
TEST_MODE_LIMIT = 3  # TEST MODE - Only process first 3 institutions

# Rate limiting (seconds between queries)
QUERY_DELAY = 2.0  # Respect Wikidata's rate limits (1 req/sec + buffer)
# =============================================================================
# WIKIDATA SPARQL QUERIES
# =============================================================================
def query_wikidata_by_name(institution_name: str, country: str = "Chile") -> Optional[Dict]:
    """
    Query Wikidata for heritage institutions matching a name.

    Args:
        institution_name: Name to match (case-insensitive substring match
            against the Wikidata label).
        country: Informational only -- the SPARQL query is hard-coded to
            Chile (wd:Q298) regardless of this value. Kept for interface
            compatibility; TODO: wire it into the query if ever needed.

    Returns:
        Dict with 'q_number', 'wikidata_label', 'description', 'viaf',
        'isil', 'founded' for the first match, or None when no match is
        found or the query fails.
    """
    # Escape backslashes and double quotes so the interpolated name cannot
    # break out of the SPARQL string literal (names like 'Bar "El" Museo').
    safe_name = institution_name.lower().replace('\\', '\\\\').replace('"', '\\"')

    # SPARQL query for Chilean heritage institutions
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?founded WHERE {{
      # Heritage institution types
      VALUES ?type {{
        wd:Q33506    # Museum
        wd:Q7075     # Library
        wd:Q166118   # Archive
        wd:Q207694   # Art gallery
        wd:Q2668072  # Cultural institution
      }}
      ?item wdt:P31/wdt:P279* ?type .  # Instance of (with subclasses)
      ?item wdt:P17 wd:Q298 .          # Country: Chile
      # Optional identifiers
      OPTIONAL {{ ?item wdt:P214 ?viaf }}
      OPTIONAL {{ ?item wdt:P791 ?isil }}
      OPTIONAL {{ ?item wdt:P571 ?founded }}
      # Filter by name (case-insensitive)
      FILTER(CONTAINS(LCASE(?itemLabel), "{safe_name}"))
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 5
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.2 (https://github.com/sct/glam)")
    try:
        results = sparql.query().convert()
        # Rate limiting - wait ONCE after each query. (The sleep was
        # previously duplicated, doubling the delay to 2x QUERY_DELAY.)
        time.sleep(QUERY_DELAY)
        if results['results']['bindings']:
            # Return first match; q_number is the URI tail, e.g. "Q12345"
            result = results['results']['bindings'][0]
            q_number = result['item']['value'].split('/')[-1]
            label = result['itemLabel']['value']
            description = result.get('itemDescription', {}).get('value', '')
            viaf = result.get('viaf', {}).get('value', None)
            isil = result.get('isil', {}).get('value', None)
            founded = result.get('founded', {}).get('value', None)
            return {
                'q_number': q_number,
                'wikidata_label': label,
                'description': description,
                'viaf': viaf,
                'isil': isil,
                'founded': founded
            }
    except Exception as e:
        print(f" ⚠️ SPARQL query failed: {e}")
        time.sleep(QUERY_DELAY)  # Wait even on error (previously slept twice)
    return None
# =============================================================================
# FUZZY MATCHING
# =============================================================================
def calculate_match_score(name1: str, name2: str) -> float:
    """
    Compute a fuzzy similarity score between two institution names.

    Both names are lower-cased and stripped before comparison. Identical
    normalized names short-circuit to 1.0; otherwise difflib's
    SequenceMatcher ratio is used (could be upgraded to rapidfuzz later).

    Returns:
        Similarity score in the range 0.0-1.0.
    """
    from difflib import SequenceMatcher

    # Normalize both sides before comparing
    left = name1.lower().strip()
    right = name2.lower().strip()
    if left == right:
        # Exact match after normalization
        return 1.0
    # Fall back to character-sequence similarity
    return SequenceMatcher(None, left, right).ratio()
# =============================================================================
# SCHEMA v0.2.2 ENRICHMENT FUNCTIONS
# =============================================================================
def create_enrichment_entry(
    enrichment_type: str,
    enrichment_method: str,
    match_score: Optional[float] = None,
    verified: bool = False,
    enrichment_source: Optional[str] = None,
    enrichment_notes: Optional[str] = None
) -> Dict:
    """
    Build an EnrichmentHistoryEntry dict conforming to schema v0.2.2.

    Args:
        enrichment_type: EnrichmentTypeEnum value (e.g., "WIKIDATA_IDENTIFIER")
        enrichment_method: Description of the enrichment method
        match_score: Fuzzy match confidence (0.0-1.0); omitted when None
            (e.g., for manual or derived enrichments)
        verified: Whether manually verified (default False)
        enrichment_source: Source URL (e.g., https://www.wikidata.org)
        enrichment_notes: Free-text notes about the enrichment

    Returns:
        Dict conforming to the EnrichmentHistoryEntry class; optional
        fields are only present when provided.
    """
    # Mandatory fields; timestamp is timezone-aware UTC in ISO-8601 form.
    record: Dict = {
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': enrichment_method,
        'enrichment_type': enrichment_type,
        'verified': verified,
    }
    # Optional fields: match_score is kept even at 0.0 (only None omits it);
    # source/notes are dropped when empty or None.
    candidates = {
        'match_score': round(match_score, 3) if match_score is not None else None,
        'enrichment_source': enrichment_source if enrichment_source else None,
        'enrichment_notes': enrichment_notes if enrichment_notes else None,
    }
    record.update({key: value for key, value in candidates.items() if value is not None})
    return record
def add_wikidata_identifier(inst: Dict, wikidata_data: Dict, match_score: float) -> bool:
    """
    Add a Wikidata identifier (and VIAF when available) plus schema v0.2.2
    enrichment_history entries to an institution record, mutating it in place.

    Args:
        inst: Institution dict; 'identifiers' and
            'provenance.enrichment_history' lists are created as needed.
        wikidata_data: Result dict from query_wikidata_by_name(); its
            'q_number' already includes the 'Q' prefix (e.g. "Q12345").
        match_score: Fuzzy-match confidence recorded in the history entry.

    Returns:
        True if enrichment was added, False if the record already has a
        Wikidata identifier.
    """
    q_number = wikidata_data['q_number']

    # Check if already has Wikidata
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ Already has Wikidata identifier")
        return False

    # Add Wikidata identifier
    wikidata_id = {
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}"
    }
    if 'identifiers' not in inst:
        inst['identifiers'] = []
    inst['identifiers'].append(wikidata_id)

    # Add VIAF if available
    if wikidata_data.get('viaf'):
        viaf_id = {
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_data['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_data['viaf']}"
        }
        inst['identifiers'].append(viaf_id)
        print(f" 📚 Added VIAF: {wikidata_data['viaf']}")

    # Create enrichment_history entry (SCHEMA v0.2.2)
    if 'provenance' not in inst:
        inst['provenance'] = {}
    if 'enrichment_history' not in inst['provenance']:
        inst['provenance']['enrichment_history'] = []

    # Wikidata enrichment entry.
    # BUG FIX: q_number already contains the 'Q' prefix; the previous
    # notes template prepended another 'Q', producing e.g. "(QQ12345)".
    wikidata_entry = create_enrichment_entry(
        enrichment_type='WIKIDATA_IDENTIFIER',
        enrichment_method='Wikidata SPARQL query with fuzzy name matching',
        match_score=match_score,
        verified=False,  # Automated enrichment
        enrichment_source='https://www.wikidata.org',
        enrichment_notes=f"Matched to '{wikidata_data['wikidata_label']}' ({q_number})"
    )
    inst['provenance']['enrichment_history'].append(wikidata_entry)

    # If VIAF was added, create separate enrichment entry
    if wikidata_data.get('viaf'):
        viaf_entry = create_enrichment_entry(
            enrichment_type='VIAF_IDENTIFIER',
            enrichment_method='VIAF identifier extracted from Wikidata entity',
            match_score=None,  # Derived data, not fuzzy matched
            verified=False,
            enrichment_source='https://viaf.org',
            enrichment_notes=f"Extracted from Wikidata {q_number}"  # was "Q{q_number}" -> "QQ..."
        )
        inst['provenance']['enrichment_history'].append(viaf_entry)

    print(f" ✅ Added Wikidata: {q_number} ({wikidata_data['wikidata_label']})")
    print(f" 📊 Match score: {match_score:.3f}")
    return True
# =============================================================================
# FILE I/O
# =============================================================================
def load_yaml(file_path: Path) -> List[Dict]:
    """Read a UTF-8 YAML file and return its deserialized content."""
    with file_path.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
def save_yaml(data: List[Dict], file_path: Path) -> None:
    """Serialize data to a UTF-8 YAML file, preserving key order."""
    # Block style, unicode kept as-is, insertion order preserved.
    dump_options = {
        'default_flow_style': False,
        'allow_unicode': True,
        'sort_keys': False,
        'width': 120,
        'indent': 2,
    }
    with file_path.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
# =============================================================================
# MAIN ENRICHMENT WORKFLOW
# =============================================================================
def main():
    """
    Run the batch-20 Wikidata enrichment workflow end to end.

    Loads the batch-19 YAML, finds institutions without a Wikidata
    identifier, queries Wikidata for each (rate-limited), gates matches on
    MIN_MATCH_SCORE, writes the enriched YAML to OUTPUT_FILE, and prints a
    summary. A backup of the input data is written before any mutation.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 20 ENRICHMENT (SCHEMA v0.2.2)")
    print("=" * 80)
    print()
    print("🆕 FIRST PRODUCTION USE OF enrichment_history STRUCTURE")
    print(" Schema: v0.2.2 (structured provenance tracking)")
    print(" Target: 19 unenriched institutions")
    print()

    # Load data
    print(f"📖 Loading: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Find institutions that do not yet carry a Wikidata identifier
    unenriched = [
        inst for inst in institutions
        if not any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    ]
    print(f"🔍 Found {len(unenriched)} institutions without Wikidata")

    # TEST MODE: Limit number of institutions to enrich
    if TEST_MODE_LIMIT is not None:
        print(f"⚠️ TEST MODE: Limiting to first {TEST_MODE_LIMIT} institutions")
        unenriched = unenriched[:TEST_MODE_LIMIT]
    print()  # BUG FIX: blank line was printed twice (copy-paste duplication)

    if not unenriched:
        print("✅ All institutions already enriched!")
        return

    # Create backup before mutating anything
    backup_file = INPUT_FILE.with_suffix('.yaml.batch20_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Enrich institutions
    print(f"🔧 Starting Wikidata enrichment...")
    print()
    enriched_count = 0
    skipped_count = 0
    for i, inst in enumerate(unenriched, 1):
        inst_name = inst['name']
        inst_city = inst.get('locations', [{}])[0].get('city', 'Unknown')
        print(f"{i}/{len(unenriched)}. {inst_name} ({inst_city})")

        # Query Wikidata (rate-limited inside the helper)
        wikidata_data = query_wikidata_by_name(inst_name)
        if not wikidata_data:
            print(f" ❌ No Wikidata match found")
            skipped_count += 1
            print()
            continue

        # Gate on fuzzy-match confidence
        match_score = calculate_match_score(inst_name, wikidata_data['wikidata_label'])
        if match_score < MIN_MATCH_SCORE:
            print(f" ⚠️ Match score too low: {match_score:.3f} < {MIN_MATCH_SCORE}")
            print(f" 📝 Wikidata label: {wikidata_data['wikidata_label']}")
            skipped_count += 1
            print()
            continue

        # Add enrichment (mutates inst in place)
        if add_wikidata_identifier(inst, wikidata_data, match_score):
            enriched_count += 1
        print()

    # Save enriched data
    print("=" * 80)
    print(f"💾 Saving enriched data to: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()

    # Statistics
    total_with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    # BUG FIX: guard against an empty input file (ZeroDivisionError)
    coverage = (total_with_wikidata / len(institutions) * 100) if institutions else 0.0
    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {enriched_count}")
    print(f"⏭️ Skipped (no match or low confidence): {skipped_count}")
    print(f"📊 Total with Wikidata: {total_with_wikidata}/{len(institutions)} ({coverage:.1f}%)")
    print()
    print("🎯 Schema v0.2.2 Features Used:")
    print(" - enrichment_history (list of EnrichmentHistoryEntry)")
    print(" - enrichment_type: WIKIDATA_IDENTIFIER, VIAF_IDENTIFIER")
    print(" - match_score: Fuzzy matching confidence (0.0-1.0)")
    print(" - verified: false (automated enrichment)")
    print(" - enrichment_source: Wikidata and VIAF URLs")
    print()
    print("✅ DONE")
# Script entry point: run the enrichment workflow only when executed directly.
if __name__ == '__main__':
    main()