- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
340 lines
13 KiB
Python
340 lines
13 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Wikidata enrichment for Algerian heritage institutions using fuzzy search.
|
|
|
|
Searches Wikidata by CONTAINS search rather than exact label match,
|
|
then uses fuzzy matching to verify results.
|
|
|
|
GLAM Data Extraction Project
|
|
Schema: LinkML v0.2.1
|
|
"""
|
|
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List
|
|
from rapidfuzz import fuzz
|
|
|
|
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
USER_AGENT = "GLAM-Algeria-Wikidata-Enrichment/2.0"
|
|
|
|
def search_wikidata_fuzzy(name: str, city: Optional[str] = None, timeout: int = 60) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Algerian heritage institutions using broader criteria.

    Runs one broad SPARQL query (all heritage-type institutions located in
    Algeria), then fuzzy-matches the candidate labels client-side and returns
    the best match.  Improved with city verification and higher threshold (85%).

    Args:
        name: Institution name to match against Wikidata labels.
        city: Optional city name; a city mismatch halves a candidate's score.
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        Dict with 'qid', 'name', 'description', 'match_score' plus optional
        'viaf', 'isil', 'website', 'founded_date', 'latitude'/'longitude',
        or None when no candidate reaches the 85% threshold or the request
        fails.  (Errors are printed, not raised — callers treat None as
        "no match".)
    """
    # Single strategy: fetch every Algerian museum/library/archive, then
    # fuzzy match client-side.  (Removed the unused `queries` list left
    # over from an abandoned multi-strategy design.)
    query1 = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil ?website ?coords ?inception ?itemAltLabel ?cityLabel
    WHERE {
      # Must be in Algeria
      ?item wdt:P17 wd:Q262 .

      # Must be heritage institution type
      ?item wdt:P31 ?type .
      VALUES ?type {
        wd:Q33506    # Museum
        wd:Q7075     # Library
        wd:Q166118   # Archive
        wd:Q1030034  # Archaeological museum
        wd:Q473972   # Art museum
        wd:Q570116   # Public library
        wd:Q22687    # Synagogue
        wd:Q7840289  # Art gallery
        wd:Q2668072  # National library
      }

      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item wdt:P131 ?city . }
      OPTIONAL { ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }

      SERVICE wikibase:label { bd:serviceParam wikibase:language "fr,ar,en" . }
    }
    LIMIT 100
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query1,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting (be polite to the public WDQS endpoint)
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])

        if not bindings:
            return None

        # Fuzzy match against all results
        best_match = None
        best_score = 0

        name_lower = name.lower()
        city_lower = city.lower() if city else None

        for binding in bindings:
            item_label = binding.get("itemLabel", {}).get("value", "").lower()
            wd_city = binding.get("cityLabel", {}).get("value", "").lower()
            # (Removed unused `item_desc` — the description was extracted per
            # candidate but never used in scoring.)

            # Three complementary fuzzy strategies; take the best
            label_score = fuzz.ratio(name_lower, item_label)
            partial_score = fuzz.partial_ratio(name_lower, item_label)
            token_score = fuzz.token_set_ratio(name_lower, item_label)
            score = max(label_score, partial_score, token_score)

            # City verification: if both have cities and they don't match, penalize score
            if city_lower and wd_city:
                city_match = fuzz.ratio(city_lower, wd_city)
                if city_match < 80:  # Cities don't match
                    print(f" ⚠️ City mismatch: {city} vs {wd_city} - penalizing match")
                    score *= 0.5  # Penalize heavily

            if score > best_score:
                best_score = score
                best_match = binding

        # Require minimum 85% match (raised from 70%)
        if best_score < 85:
            return None

        # Extract data from best match
        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "match_score": best_score
        }

        if "viaf" in best_match:
            result["viaf"] = best_match["viaf"]["value"]

        if "isil" in best_match:
            result["isil"] = best_match["isil"]["value"]

        if "website" in best_match:
            result["website"] = best_match["website"]["value"]

        if "inception" in best_match:
            # Wikidata returns a full ISO timestamp; keep only the date part
            result["founded_date"] = best_match["inception"]["value"].split("T")[0]

        if "coords" in best_match:
            coords_str = best_match["coords"]["value"]
            # WKT literal, e.g. "Point(3.05 36.77)" — longitude comes first
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)

        return result

    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Best-effort: any parsing surprise is logged and treated as "no match"
        print(f" ❌ Error: {e}")
        return None
|
|
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Attach Wikidata-derived identifiers and a provenance note to a record.

    Appends Wikidata / VIAF / ISIL identifier entries (skipping any scheme
    the record already carries) and stamps an enrichment note with the
    match score onto provenance.notes.  Mutates *institution* in place.
    """
    qid = wikidata_result['qid']
    identifiers = institution.setdefault('identifiers', [])

    # Snapshot the schemes present *before* enrichment, so the VIAF/ISIL
    # checks below are unaffected by the Wikidata entry we may add first.
    present = {entry.get('identifier_scheme') for entry in identifiers}

    if 'Wikidata' not in present:
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f"https://www.wikidata.org/wiki/{qid}"
        })

    viaf = wikidata_result.get('viaf')
    if viaf and 'VIAF' not in present:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': viaf,
            'identifier_url': f"https://viaf.org/viaf/{viaf}"
        })

    isil = wikidata_result.get('isil')
    if isil and 'ISIL' not in present:
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil,
            'identifier_url': f"https://isil.org/{isil}"
        })

    # Record the enrichment (date, Q-number, match score) in provenance notes
    provenance = institution.setdefault('provenance', {})
    stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d')
    enrich_note = f" Wikidata enriched {stamp} ({qid}, match: {wikidata_result.get('match_score', 0):.0f}%)."
    provenance['notes'] = (provenance.get('notes', '') + enrich_note).strip()
|
def save_checkpoint(data, input_file: Path, stats: dict):
    """Save progress checkpoint by rewriting the input YAML file in place.

    Args:
        data: The loaded YAML payload — either a list of institutions
            (Algeria/Libya format) or a dict with '_metadata' (Tunisia format).
        input_file: Path of the YAML file to overwrite.
        stats: Running counters; only used for the progress line printed here.
    """
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {stats['already_enriched'] + stats['enriched']}/{stats['total']})")

    # Handle metadata for dict format (Tunisia) vs list format (Algeria/Libya)
    if isinstance(data, dict) and '_metadata' in data:
        data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
        # Fix: the old code checked membership with .get('enhancements', [])
        # but then appended to data['_metadata']['enhancements'] directly,
        # raising KeyError when the key was absent.  setdefault creates the
        # list on demand and keeps both operations on the same object.
        enhancements = data['_metadata'].setdefault('enhancements', [])
        if 'Wikidata enrichment' not in enhancements:
            enhancements.append('Wikidata enrichment')

    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
def main():
    """Enrich the Algerian institutions YAML file with Wikidata identifiers.

    Loads the records, searches Wikidata for every institution lacking a
    Wikidata identifier, applies high-confidence matches in place (with
    duplicate-QID protection), checkpoints every 10 records, and prints a
    summary at the end.
    """
    input_file = Path('data/instances/algeria/algerian_institutions.yaml')

    print("Algeria Wikidata Enrichment (Fuzzy Search)")
    print("=" * 60)
    print("Features:")
    print(" - Broad SPARQL query (all Algerian heritage institutions)")
    print(" - Client-side fuzzy matching (85% threshold)")
    print(" - City verification (prevents false matches)")
    print(" - Checkpoint saving every 10 institutions")
    print(" - Multiple match strategies (exact, partial, token)")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both list format and dict with 'institutions' key
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'low_confidence': 0,
        'duplicate_prevented': 0
    }

    # Q-numbers already assigned in the file (or during this run) — prevents
    # two institutions from claiming the same Wikidata item.
    used_qids = set()
    for inst in institutions:
        for ident in inst.get('identifiers', []):  # 'ident', not the builtin 'id'
            if ident.get('identifier_scheme') == 'Wikidata':
                used_qids.add(ident['identifier_value'])

    # Process each institution
    checkpoint_interval = 10

    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''

        # Check if already has Wikidata
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata with fuzzy matching
        print(f"[{i}/{len(institutions)}] Searching: {name} ({city})")
        stats['searched'] += 1

        result = search_wikidata_fuzzy(name, city, timeout=60)

        if result:
            stats['found'] += 1
            match_score = result.get('match_score', 0)
            qid = result['qid']
            print(f" ✅ Found: {qid} - {result.get('name', '')} (match: {match_score:.0f}%)")

            # Check if Q-number already used
            if qid in used_qids:
                stats['duplicate_prevented'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Q-number {qid} already assigned to another institution, skipping")
            # Accept matches above 85% (function already filters, but double-check)
            elif match_score >= 85:
                add_wikidata_to_institution(inst, result)
                used_qids.add(qid)  # Track this Q-number
                stats['enriched'] += 1
                print(f" ✅ Enriched")
            else:
                stats['low_confidence'] += 1
                stats['failed'] += 1
                print(f" ⚠️ Match score too low (<85%), skipping")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found")

        # Periodic checkpoint.  (The unconditional save below already covers
        # the final iteration, so the old `or i == len(institutions)` clause
        # double-wrote the file on the last record — removed.)
        if i % checkpoint_interval == 0:
            save_checkpoint(data, input_file, stats)

    # Final save
    save_checkpoint(data, input_file, stats)

    # Print statistics
    print("\n" + "=" * 60)
    print("WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 60)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found: {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f" - Low confidence: {stats['low_confidence']}")
    print(f" - Duplicate Q-numbers prevented: {stats['duplicate_prevented']}")

    # Guard against ZeroDivisionError when the institutions file is empty
    covered = stats['already_enriched'] + stats['enriched']
    if stats['total']:
        print(f"\nFinal Wikidata coverage: {covered}/{stats['total']} ({100*covered/stats['total']:.1f}%)")
    else:
        print(f"\nFinal Wikidata coverage: 0/0 (n/a)")

    if stats['enriched'] > 0:
        print(f"✨ Added {stats['enriched']} new Wikidata identifiers!")

    print("\n✅ Wikidata enrichment complete!")
|
# Script entry point: run the enrichment only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()