- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
413 lines · 14 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Chilean GLAM Institutions - Batch 20 Wikidata Enrichment
|
|
FIRST PRODUCTION USE OF SCHEMA v0.2.2 enrichment_history
|
|
|
|
This script demonstrates the NEW structured enrichment tracking:
|
|
- Uses Provenance.enrichment_history (list of EnrichmentHistoryEntry)
|
|
- Replaces unstructured provenance.notes with queryable metadata
|
|
- Tracks enrichment_type, match_score, verified status
|
|
- Aligns with PROV-O, ADMS, and Dublin Core ontologies
|
|
|
|
Target: 19 unenriched institutions → 79% coverage goal
|
|
Schema: v0.2.2 (enrichment_history structure)
|
|
"""
|
|
|
|
import yaml
|
|
import time
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import List, Dict, Optional
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
|
|
# =============================================================================
# CONFIGURATION
# =============================================================================

INPUT_FILE = Path('data/instances/chile/chilean_institutions_batch19_enriched.yaml')
OUTPUT_FILE = Path('data/instances/chile/chilean_institutions_batch20_enriched.yaml')
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"

# Fuzzy match threshold (0.0-1.0); matches scoring below this are skipped.
MIN_MATCH_SCORE = 0.75

# TEST MODE: Set to a number to limit enrichment (e.g., 3 for testing)
# Set to None to process all unenriched institutions
# (BUG FIX: this constant and QUERY_DELAY were each defined twice.)
TEST_MODE_LIMIT = 3  # TEST MODE - Only process first 3 institutions

# Rate limiting (seconds between queries)
QUERY_DELAY = 2.0  # Respect Wikidata's rate limits (1 req/sec + buffer)
|
|
|
|
# =============================================================================
# WIKIDATA SPARQL QUERIES
# =============================================================================

def query_wikidata_by_name(institution_name: str, country: str = "Chile") -> Optional[Dict]:
    """
    Query Wikidata for Chilean heritage institutions matching a name.

    Args:
        institution_name: Name matched (case-insensitive, substring)
            against Spanish/English labels.
        country: Currently unused; the query is hard-wired to Chile (wd:Q298).

    Returns:
        Dict with q_number, wikidata_label, description, viaf, isil and
        founded for the first match, or None when nothing matched or the
        query failed.
    """
    # Escape characters that would break the embedded SPARQL string literal.
    safe_name = institution_name.lower().replace('\\', '\\\\').replace('"', '\\"')

    # BUG FIX: the name filter must use rdfs:label directly — the
    # wikibase:label service binds ?itemLabel only after WHERE evaluation,
    # so filtering on ?itemLabel saw an unbound variable and matched nothing.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?founded WHERE {{
      # Heritage institution types
      VALUES ?type {{
        wd:Q33506    # Museum
        wd:Q7075     # Library
        wd:Q166118   # Archive
        wd:Q207694   # Art gallery
        wd:Q2668072  # Cultural institution
      }}

      ?item wdt:P31/wdt:P279* ?type .  # Instance of (with subclasses)
      ?item wdt:P17 wd:Q298 .          # Country: Chile

      # Optional identifiers
      OPTIONAL {{ ?item wdt:P214 ?viaf }}
      OPTIONAL {{ ?item wdt:P791 ?isil }}
      OPTIONAL {{ ?item wdt:P571 ?founded }}

      # Filter by name (case-insensitive) on the actual labels
      ?item rdfs:label ?label .
      FILTER(LANG(?label) IN ("es", "en"))
      FILTER(CONTAINS(LCASE(?label), "{safe_name}"))

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "es,en" }}
    }}
    LIMIT 5
    """

    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.2 (https://github.com/sct/glam)")

    try:
        results = sparql.query().convert()

        # Rate limiting - wait once after each query.
        # BUG FIX: this sleep was duplicated, doubling the delay per query.
        time.sleep(QUERY_DELAY)

        if results['results']['bindings']:
            # Return first match
            result = results['results']['bindings'][0]

            return {
                'q_number': result['item']['value'].split('/')[-1],
                'wikidata_label': result['itemLabel']['value'],
                'description': result.get('itemDescription', {}).get('value', ''),
                'viaf': result.get('viaf', {}).get('value', None),
                'isil': result.get('isil', {}).get('value', None),
                'founded': result.get('founded', {}).get('value', None),
            }
    except Exception as e:
        print(f" ⚠️ SPARQL query failed: {e}")
        time.sleep(QUERY_DELAY)  # Wait even on error (was also duplicated)

    return None
|
|
|
|
# =============================================================================
# FUZZY MATCHING
# =============================================================================

def calculate_match_score(name1: str, name2: str) -> float:
    """
    Fuzzy similarity between two institution names.

    Both names are lower-cased and stripped before comparison. Identical
    normalized names score 1.0; otherwise difflib's SequenceMatcher ratio
    is returned (can be enhanced with rapidfuzz later).

    Returns:
        Score in the range 0.0-1.0.
    """
    from difflib import SequenceMatcher

    # Normalize both sides the same way.
    left, right = (s.lower().strip() for s in (name1, name2))

    # Exact normalized match short-circuits the fuzzy comparison.
    return 1.0 if left == right else SequenceMatcher(None, left, right).ratio()
|
|
|
|
# =============================================================================
# SCHEMA v0.2.2 ENRICHMENT FUNCTIONS
# =============================================================================

def create_enrichment_entry(
    enrichment_type: str,
    enrichment_method: str,
    match_score: Optional[float] = None,
    verified: bool = False,
    enrichment_source: Optional[str] = None,
    enrichment_notes: Optional[str] = None
) -> Dict:
    """
    Build an EnrichmentHistoryEntry dict conforming to schema v0.2.2.

    Args:
        enrichment_type: EnrichmentTypeEnum value (e.g., "WIKIDATA_IDENTIFIER")
        enrichment_method: Description of enrichment method
        match_score: Fuzzy match confidence (0.0-1.0), None for manual
        verified: Whether manually verified (default False)
        enrichment_source: Source URL (e.g., https://www.wikidata.org)
        enrichment_notes: Additional notes about enrichment

    Returns:
        Dict conforming to EnrichmentHistoryEntry; optional fields are
        present only when a value was supplied.
    """
    record = {
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_method': enrichment_method,
        'enrichment_type': enrichment_type,
        'verified': verified,
    }

    # Optional fields are emitted only when provided; match_score is
    # rounded to 3 decimals for stable serialization.
    if match_score is not None:
        record['match_score'] = round(match_score, 3)
    if enrichment_source:
        record['enrichment_source'] = enrichment_source
    if enrichment_notes:
        record['enrichment_notes'] = enrichment_notes

    return record
|
|
|
|
def add_wikidata_identifier(inst: Dict, wikidata_data: Dict, match_score: float) -> bool:
    """
    Add a Wikidata identifier (plus VIAF when available) and matching
    enrichment_history entries (schema v0.2.2) to an institution record.

    Args:
        inst: Institution dict, mutated in place.
        wikidata_data: Result dict from query_wikidata_by_name();
            q_number already carries the "Q" prefix (e.g. "Q12345").
        match_score: Fuzzy match confidence (0.0-1.0).

    Returns:
        True if enrichment was added, False if the institution already
        has a Wikidata identifier.
    """
    q_number = wikidata_data['q_number']  # e.g. "Q12345" (prefix included)

    # Check if already has Wikidata
    existing_ids = inst.get('identifiers', [])
    has_wikidata = any(
        id_obj.get('identifier_scheme') == 'Wikidata'
        for id_obj in existing_ids
    )
    if has_wikidata:
        print(f" ⚠️ Already has Wikidata identifier")
        return False

    # Add Wikidata identifier
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f"https://www.wikidata.org/wiki/{q_number}"
    })

    # Add VIAF if available
    if wikidata_data.get('viaf'):
        inst['identifiers'].append({
            'identifier_scheme': 'VIAF',
            'identifier_value': wikidata_data['viaf'],
            'identifier_url': f"https://viaf.org/viaf/{wikidata_data['viaf']}"
        })
        print(f" 📚 Added VIAF: {wikidata_data['viaf']}")

    # Create enrichment_history entries (SCHEMA v0.2.2)
    history = inst.setdefault('provenance', {}).setdefault('enrichment_history', [])

    # Wikidata enrichment entry.
    # BUG FIX: q_number already includes the "Q" prefix, so the old
    # f-string "(Q{q_number})" rendered as "(QQ12345)".
    history.append(create_enrichment_entry(
        enrichment_type='WIKIDATA_IDENTIFIER',
        enrichment_method='Wikidata SPARQL query with fuzzy name matching',
        match_score=match_score,
        verified=False,  # Automated enrichment
        enrichment_source='https://www.wikidata.org',
        enrichment_notes=f"Matched to '{wikidata_data['wikidata_label']}' ({q_number})"
    ))

    # If VIAF was added, create separate enrichment entry
    if wikidata_data.get('viaf'):
        history.append(create_enrichment_entry(
            enrichment_type='VIAF_IDENTIFIER',
            enrichment_method='VIAF identifier extracted from Wikidata entity',
            match_score=None,  # Derived data, not fuzzy matched
            verified=False,
            enrichment_source='https://viaf.org',
            enrichment_notes=f"Extracted from Wikidata {q_number}"  # BUG FIX: was "Q{q_number}" -> "QQ..."
        ))

    print(f" ✅ Added Wikidata: {q_number} ({wikidata_data['wikidata_label']})")
    print(f" 📊 Match score: {match_score:.3f}")

    return True
|
|
|
|
# =============================================================================
# FILE I/O
# =============================================================================

def load_yaml(file_path: Path) -> List[Dict]:
    """Read and parse a UTF-8 YAML file, returning the parsed data."""
    with file_path.open('r', encoding='utf-8') as handle:
        return yaml.safe_load(handle)
|
|
|
|
def save_yaml(data: List[Dict], file_path: Path) -> None:
    """Serialize *data* to a UTF-8 YAML file (block style, key order kept)."""
    dump_options = dict(
        default_flow_style=False,
        allow_unicode=True,
        sort_keys=False,
        width=120,
        indent=2,
    )
    with file_path.open('w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
|
|
|
|
# =============================================================================
# MAIN ENRICHMENT WORKFLOW
# =============================================================================

def main():
    """Run the batch-20 Wikidata enrichment workflow end to end.

    Loads the batch-19 YAML, finds institutions without a Wikidata
    identifier, enriches them via SPARQL (optionally limited by
    TEST_MODE_LIMIT), then writes the enriched file and a summary.
    """
    print("=" * 80)
    print("CHILEAN GLAM INSTITUTIONS - BATCH 20 ENRICHMENT (SCHEMA v0.2.2)")
    print("=" * 80)
    print()
    print("🆕 FIRST PRODUCTION USE OF enrichment_history STRUCTURE")
    print(" Schema: v0.2.2 (structured provenance tracking)")
    print(" Target: 19 unenriched institutions")
    print()

    # Load data
    print(f"📖 Loading: {INPUT_FILE}")
    institutions = load_yaml(INPUT_FILE)
    print(f" Loaded {len(institutions)} institutions")
    print()

    # Find institutions that still lack a Wikidata identifier
    unenriched = [
        inst for inst in institutions
        if not any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    ]

    print(f"🔍 Found {len(unenriched)} institutions without Wikidata")

    # TEST MODE: Limit number of institutions to enrich
    if TEST_MODE_LIMIT is not None:
        print(f"⚠️ TEST MODE: Limiting to first {TEST_MODE_LIMIT} institutions")
        unenriched = unenriched[:TEST_MODE_LIMIT]
    print()  # BUG FIX: this blank line was printed twice

    if not unenriched:
        print("✅ All institutions already enriched!")
        return

    # Create backup before mutating anything
    backup_file = INPUT_FILE.with_suffix('.yaml.batch20_backup')
    print(f"💾 Creating backup: {backup_file}")
    save_yaml(institutions, backup_file)
    print()

    # Enrich institutions
    print("🔧 Starting Wikidata enrichment...")
    print()

    enriched_count = 0
    skipped_count = 0

    for i, inst in enumerate(unenriched, 1):
        inst_name = inst['name']
        inst_city = inst.get('locations', [{}])[0].get('city', 'Unknown')

        print(f"{i}/{len(unenriched)}. {inst_name} ({inst_city})")

        # Query Wikidata
        wikidata_data = query_wikidata_by_name(inst_name)

        if not wikidata_data:
            print(f" ❌ No Wikidata match found")
            skipped_count += 1
            print()
            continue

        # Calculate match score against the Wikidata label
        match_score = calculate_match_score(inst_name, wikidata_data['wikidata_label'])

        if match_score < MIN_MATCH_SCORE:
            print(f" ⚠️ Match score too low: {match_score:.3f} < {MIN_MATCH_SCORE}")
            print(f" 📝 Wikidata label: {wikidata_data['wikidata_label']}")
            skipped_count += 1
            print()
            continue

        # Add enrichment (mutates inst in place)
        if add_wikidata_identifier(inst, wikidata_data, match_score):
            enriched_count += 1

        print()

    # Save enriched data
    print("=" * 80)
    print(f"💾 Saving enriched data to: {OUTPUT_FILE}")
    save_yaml(institutions, OUTPUT_FILE)
    print()

    # Statistics
    total_with_wikidata = sum(
        1 for inst in institutions
        if any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    # BUG FIX: guard against an empty input file (division by zero).
    coverage = (total_with_wikidata / len(institutions) * 100) if institutions else 0.0

    print("=" * 80)
    print("ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Newly enriched: {enriched_count}")
    print(f"⏭️ Skipped (no match or low confidence): {skipped_count}")
    print(f"📊 Total with Wikidata: {total_with_wikidata}/{len(institutions)} ({coverage:.1f}%)")
    print()
    print("🎯 Schema v0.2.2 Features Used:")
    print(" - enrichment_history (list of EnrichmentHistoryEntry)")
    print(" - enrichment_type: WIKIDATA_IDENTIFIER, VIAF_IDENTIFIER")
    print(" - match_score: Fuzzy matching confidence (0.0-1.0)")
    print(" - verified: false (automated enrichment)")
    print(" - enrichment_source: Wikidata and VIAF URLs")
    print()
    print("✅ DONE")
|
|
|
|
# Script entry point: run the enrichment workflow only when executed directly.
if __name__ == '__main__':
    main()
|