glam/scripts/enrich_libya_wikidata_fuzzy.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

378 lines
15 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Wikidata enrichment for Libyan heritage institutions using two-tier search strategy.
Two-Tier Search Strategy:
1. SPARQL query: Broad query for all Libyan heritage institutions → fuzzy match
2. MediaWiki API fallback: Text search via Wikidata's search API (catches edge cases)
The MediaWiki API search indexes labels, descriptions, aliases, and statement values
in all languages, complementing SPARQL's label-only matching.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
import time
import requests
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List
from rapidfuzz import fuzz
from glam_extractor.wikidata import search_wikidata_mediawiki
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
USER_AGENT = "GLAM-Libya-Wikidata-Enrichment/2.0"
def _pick_best_match(bindings: List[Dict[str, Any]], name: str,
                     city: Optional[str]) -> tuple:
    """Fuzzy-match *name* (and optionally *city*) against SPARQL bindings.

    Scores each candidate with the best of three rapidfuzz strategies
    (exact ratio, partial ratio, token-set ratio) and applies a mandatory
    geographic filter: candidates whose admin-unit label clearly differs
    from *city* are rejected outright; a good city match earns a +5 bonus.

    Returns (best_binding_or_None, best_score_0_to_100).
    """
    best_match = None
    best_score = 0
    name_lower = name.lower()
    city_lower = city.lower() if city else None
    for binding in bindings:
        item_label = binding.get("itemLabel", {}).get("value", "").lower()
        wd_city = binding.get("cityLabel", {}).get("value", "").lower()
        # Best of the three fuzzy match strategies
        score = max(
            fuzz.ratio(name_lower, item_label),
            fuzz.partial_ratio(name_lower, item_label),
            fuzz.token_set_ratio(name_lower, item_label),
        )
        # MANDATORY geographic filtering: reject if cities clearly don't match.
        # This prevents false matches like "Barce Museum" matching to Misrata
        # institutions.
        city_bonus = 0
        if city_lower and wd_city:
            city_match = fuzz.ratio(city_lower, wd_city)
            if city_match >= 70:  # Cities match well enough
                print(f" ✓ City match: {city} ≈ {wd_city} (boosting +5%)")
                city_bonus = 5  # Small bonus for city match
            elif city_match < 50:  # Cities are completely different
                print(f" ❌ City mismatch: {city} vs {wd_city} (rejecting - geographic filter)")
                continue  # Skip this candidate entirely
            else:
                print(f" ⚠️ City difference: {city} vs {wd_city} (neutral, no bonus)")
        # Apply city bonus (but don't exceed 100)
        final_score = min(score + city_bonus, 100)
        if final_score > best_score:
            best_score = final_score
            best_match = binding
    return best_match, best_score


def _binding_to_result(binding: Dict[str, Any], score: float) -> Optional[Dict[str, Any]]:
    """Flatten the winning SPARQL binding into an enrichment dict.

    Returns None when the item URI does not yield a well-formed QID.
    """
    item_uri = binding.get("item", {}).get("value", "")
    qid = item_uri.split("/")[-1] if item_uri else None
    if not qid or not qid.startswith("Q"):
        return None
    result = {
        "qid": qid,
        "name": binding.get("itemLabel", {}).get("value", ""),
        "description": binding.get("itemDescription", {}).get("value", ""),
        "match_score": score
    }
    if "viaf" in binding:
        result["viaf"] = binding["viaf"]["value"]
    if "isil" in binding:
        result["isil"] = binding["isil"]["value"]
    if "website" in binding:
        result["website"] = binding["website"]["value"]
    if "inception" in binding:
        # Keep only the date portion of the xsd:dateTime literal
        result["founded_date"] = binding["inception"]["value"].split("T")[0]
    if "coords" in binding:
        coords_str = binding["coords"]["value"]
        if coords_str.startswith("Point("):
            # WKT literal is "Point(longitude latitude)"
            lon, lat = coords_str[6:-1].split()
            result["latitude"] = float(lat)
            result["longitude"] = float(lon)
    return result


def search_wikidata_fuzzy(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Libyan heritage institutions using broader criteria.
    Returns best fuzzy match from results (dict with qid/name/description/
    match_score plus optional viaf/isil/website/founded_date/coordinates),
    or None when neither tier finds a confident match.
    City matching gives a bonus but doesn't penalize mild mismatches;
    clear mismatches are rejected. Fixes vs. previous version: the MediaWiki
    API fallback is now also tried when SPARQL returns zero bindings
    (previously that case returned None without trying tier two), and the
    unused `queries` list / `item_desc` local were removed.
    """
    # Strategy 1: get all heritage institutions in Libya via SPARQL,
    # then fuzzy match client-side.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel
    WHERE {
      # Must be in Libya
      ?item wdt:P17 wd:Q1016 .
      # Must be heritage institution type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q33506     # Museum
        wd:Q7075      # Library
        wd:Q166118    # Archive
        wd:Q1030034   # Archaeological museum
        wd:Q473972    # Art museum
        wd:Q570116    # Public library
        wd:Q22687     # Synagogue
        wd:Q7840289   # Art gallery
        wd:Q2668072   # National library
      }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }
      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """
    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }
    try:
        time.sleep(1.5)  # Rate limiting (be polite to the public endpoint)
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()
        bindings = response.json().get("results", {}).get("bindings", [])
        # Fuzzy match against all results (empty bindings → score 0 → tier two)
        best_match, best_score = _pick_best_match(bindings, name, city)
        # Require minimum 85% match for better precision (raised from 80% to
        # prevent false positives). Geographic filtering now handles edge
        # cases, so we can be more conservative.
        if best_match is None or best_score < 85:
            # SPARQL didn't find a good match - try MediaWiki API fallback
            print(f" SPARQL best match {best_score:.1f}% below threshold, trying MediaWiki API fallback...")
            fallback_result = search_wikidata_mediawiki(
                name=name,
                city=city,
                country_qid="Q1016",  # Libya
                fuzzy_threshold=85,
                timeout=timeout
            )
            if fallback_result:
                # MediaWiki API found a match!
                fallback_result["enrichment_method"] = "MediaWiki API search (fallback)"
                return fallback_result
            # Neither method found a match
            return None
        return _binding_to_result(best_match, best_score)
    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Best-effort enrichment: log and move on rather than abort the run
        print(f" ❌ Error: {e}")
        return None
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Merge Wikidata-derived identifiers and a provenance note into *institution*.

    Mutates the record in place: appends Wikidata / VIAF / ISIL identifier
    entries (skipping any scheme the record already carries) and appends a
    dated enrichment note to ``provenance.notes``.
    """
    identifier_list = institution.setdefault('identifiers', [])
    # Snapshot the schemes present BEFORE appending, so pre-existing
    # VIAF/ISIL entries are never duplicated.
    known_schemes = {entry.get('identifier_scheme') for entry in identifier_list}
    qid = wikidata_result['qid']
    if 'Wikidata' not in known_schemes:
        identifier_list.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f"https://www.wikidata.org/wiki/{qid}"
        })
    viaf = wikidata_result.get('viaf')
    if viaf and 'VIAF' not in known_schemes:
        identifier_list.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': viaf,
            'identifier_url': f"https://viaf.org/viaf/{viaf}"
        })
    isil = wikidata_result.get('isil')
    if isil and 'ISIL' not in known_schemes:
        identifier_list.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil,
            'identifier_url': f"https://isil.org/{isil}"
        })
    # Record how and when this record was enriched (SPARQL vs MediaWiki API)
    provenance = institution.setdefault('provenance', {})
    enrichment_method = wikidata_result.get('enrichment_method', 'SPARQL query')
    notes = provenance.get('notes', '')
    enrich_note = f" Wikidata enriched {datetime.now(timezone.utc).strftime('%Y-%m-%d')} ({wikidata_result['qid']}, match: {wikidata_result.get('match_score', 0):.0f}%, method: {enrichment_method})."
    provenance['notes'] = (notes + enrich_note).strip()
def save_checkpoint(data, input_file: Path, stats: dict):
    """Write the (partially) enriched dataset back to *input_file* as YAML.

    Prints a short progress line, refreshes generation metadata for
    dict-format datasets, and overwrites the input file in place.

    Fix: previously a dict dataset whose ``_metadata`` lacked an
    ``'enhancements'`` key raised KeyError on append (the membership test
    used ``.get('enhancements', [])`` but the append did not); ``setdefault``
    now creates the list when missing.
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {stats['already_enriched'] + stats['enriched']}/{stats['total']})")
    # Handle metadata for dict format (Tunisia) vs list format (Algeria/Libya)
    if isinstance(data, dict) and '_metadata' in data:
        data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
        enhancements = data['_metadata'].setdefault('enhancements', [])
        if 'Wikidata enrichment' not in enhancements:
            enhancements.append('Wikidata enrichment')
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
def main():
    """Enrich the Libyan institutions dataset with Wikidata identifiers.

    Loads the YAML dataset, skips records that already carry a Wikidata
    identifier, runs the two-tier search for each remaining record,
    prevents the same QID from being assigned twice in a run, checkpoints
    progress every 10 records, and prints summary statistics.

    Fixes vs. previous version: the loop variable no longer shadows the
    builtin ``id``; the redundant double save at the end of the run is
    gone (one final save remains); and coverage reporting no longer
    divides by zero on an empty dataset.
    """
    input_file = Path('data/instances/libya/libyan_institutions.yaml')
    print("Libya Wikidata Enrichment (Two-Tier Search)")
    print("=" * 60)
    print("Features:")
    print(" - Two-tier strategy: SPARQL → MediaWiki API fallback")
    print(" - SPARQL: Broad query (all Libyan heritage institutions)")
    print(" - MediaWiki API: Text search (labels, descriptions, aliases)")
    print(" - Client-side fuzzy matching (85% threshold)")
    print(" - Geographic filtering (city matching with bonus)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Multiple match strategies (exact, partial, token)")
    print("=" * 60)
    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    # Handle both list format and dict with 'institutions' key
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"Total institutions: {len(institutions)}")
    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'low_confidence': 0,
        'duplicate_prevented': 0
    }
    # Track Q-numbers used in this enrichment run (prevent duplicates),
    # seeded with QIDs from already-enriched institutions so reruns stay
    # duplicate-free.
    used_qids = set()
    for inst in institutions:
        for ident in inst.get('identifiers', []):
            if ident.get('identifier_scheme') == 'Wikidata':
                used_qids.add(ident['identifier_value'])
    # Process each institution
    checkpoint_interval = 10
    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        locations = inst.get('locations') or [{}]
        city = locations[0].get('city', '')
        # Check if already has Wikidata
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}
        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue
        # Search Wikidata with fuzzy matching
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1
        result = search_wikidata_fuzzy(name, city, timeout=60)
        if result:
            stats['found'] += 1
            match_score = result.get('match_score', 0)
            qid = result['qid']
            print(f" ✅ Found: {qid} - {result.get('name', '')} (match: {match_score:.0f}%)")
            # Check if Q-number already used
            if qid in used_qids:
                stats['duplicate_prevented'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Q-number {qid} already assigned to another institution, skipping")
            # Accept matches above 80% (function already filters, but double-check)
            elif match_score >= 80:
                add_wikidata_to_institution(inst, result)
                used_qids.add(qid)  # Track this Q-number
                stats['enriched'] += 1
                print(f" ✅ Enriched")
            else:
                stats['low_confidence'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Match score too low (<80%), skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")
        # Checkpoint every N institutions (final save below covers the tail)
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)
    # Final save
    save_checkpoint(data, input_file, stats)
    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f" - Low confidence: {stats['low_confidence']}")
    print(f" - Duplicate Q-numbers prevented: {stats['duplicate_prevented']}")
    covered = stats['already_enriched'] + stats['enriched']
    # Guard against an empty dataset (total == 0) when reporting coverage
    pct = 100 * covered / stats['total'] if stats['total'] else 0.0
    print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({pct:.1f}%)")
    if stats['enriched'] > 0:
        print(f"✨ Added {stats['enriched']} new Wikidata identifiers!")
    print("\n✅ Wikidata enrichment complete!")


if __name__ == '__main__':
    main()