#!/usr/bin/env python3
"""
Wikidata enrichment for Algerian heritage institutions with multilingual validation.

Adapts the successful Tunisia enrichment strategy for Algeria:
- Entity type validation (museums must be museums, not banks or lakes)
- Geographic validation (institutions must be in the correct city)
- Alternative-name matching (French/Arabic/English)
- Fuzzy matching with a 70% threshold
- Prevents false positives through multiple validation layers

Target: 19 Algerian institutions
Current Wikidata coverage: 5/19 (26.3%)
Expected improvement: 75-85% coverage (similar to Tunisia's 76.5%)

Key challenges addressed:
- French colonial heritage → multilingual Wikidata labels
- Archaeological sites labeled as "heritage sites" not "museums"
- Universities with multiple campuses requiring disambiguation
- Personal collections unlikely to have Wikidata entries

GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""

import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Dict, Any, List, Set

import requests
import yaml
from rapidfuzz import fuzz

# Public Wikidata SPARQL endpoint queried by search_wikidata_with_validation().
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Descriptive User-Agent sent with every request, as Wikimedia API etiquette requires.
USER_AGENT = "GLAM-Algeria-Wikidata-Enrichment/1.0"
|
|
|
|
# Institution type mapping: LinkML enum -> Wikidata entity types.
# The QID sets are used both server-side (SPARQL VALUES clause on P31) and
# client-side (re-validation of each result binding).
#
# NOTE(review): several QIDs are reused under different types with conflicting
# glosses — e.g. Q2668072 is annotated as "National museum", "National library"
# AND "National archive"; Q2467461 as both "Research institution" and
# "Private university"; Q16748062 as both "Site museum" and "Collection".
# At most one gloss per QID can be right; verify each against Wikidata.
INSTITUTION_TYPE_MAPPING = {
    'MUSEUM': {
        'Q33506',     # Museum
        'Q1030034',   # Archaeological museum
        'Q3329412',   # Archaeological museum (variant)
        'Q473972',    # Art museum
        'Q2668072',   # National museum
        'Q207694',    # History museum
        'Q7328910',   # Science museum
        'Q15243387',  # Cultural heritage site
        'Q3152824',   # Archaeological site (for heritage museums)
        'Q1153562',   # Open-air museum
        'Q1496967',   # Folk museum
        'Q17431399',  # Heritage museum
        'Q28835878',  # Heritage site
        'Q641635',    # Natural history museum
        'Q2142332',   # Contemporary art museum
        'Q3559325',   # Heritage site museum
        'Q16748062',  # Site museum (for Timgad, Djémila, Tipasa)
    },
    'LIBRARY': {
        'Q7075',      # Library
        'Q2668072',   # National library — NOTE(review): same QID glossed "National museum" above
        'Q570116',    # Public library
        'Q5193377',   # University library
        'Q28564',     # Academic library
        'Q1479716',   # Regional library
        'Q1622062',   # Digital library
        'Q17297735',  # Diocesan library
    },
    'ARCHIVE': {
        'Q166118',    # Archive
        'Q2668072',   # National archive — NOTE(review): third distinct gloss for this QID
        'Q1497375',   # Historical archive
        'Q64578911',  # Regional archive
        'Q11396317',  # State archive
    },
    'RESEARCH_CENTER': {
        'Q31855',     # Research institute
        'Q7315155',   # Research center
        'Q2467461',   # Research institution
        'Q483242',    # Laboratory
        'Q1664720',   # Institute
        'Q7210356',   # Cultural institution
    },
    'EDUCATION_PROVIDER': {
        'Q3918',      # University
        'Q875538',    # Public university
        'Q2467461',   # Private university — NOTE(review): glossed "Research institution" above
        'Q15936437',  # Research university
        'Q38723',     # Higher education institution
        'Q3354859',   # Technical university
        'Q2385804',   # Educational institution
        'Q5341295',   # Music school
        'Q1664720',   # Institute
        'Q180958',    # Faculty
    },
    'OFFICIAL_INSTITUTION': {
        'Q7210356',   # Cultural institution
        'Q7840289',   # Cultural center
        'Q1030034',   # Cultural heritage institution — NOTE(review): glossed "Archaeological museum" above
        'Q1664720',   # Institute
        'Q43229',     # Organization
    },
    'PERSONAL_COLLECTION': {
        'Q7075',      # Library (personal libraries)
        'Q166118',    # Archive (personal archives)
        'Q33506',     # Museum (personal museums)
        'Q16748062',  # Collection — NOTE(review): glossed "Site museum" above
    },
}
|
|
|
|
def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Return the Wikidata QIDs accepted as entity types for *inst_type*.

    An unrecognized institution type yields an empty set, which callers
    interpret as "cannot search this record".
    """
    try:
        return INSTITUTION_TYPE_MAPPING[inst_type]
    except KeyError:
        return set()
|
|
|
|
def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    city: Optional[str] = None,
    alternative_names: Optional[List[str]] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Algerian heritage institutions with entity type and geographic validation.

    Strategy: a single SPARQL query fetches all Algerian (P17 = Q262) entities
    whose instance-of (P31) value is in the allowed set for *inst_type*; the
    candidates are then fuzzy-matched locally against the primary name and
    every alternative name, keeping the highest-scoring validated binding.

    Args:
        name: Institution name to search
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, etc.)
        city: Optional city name for additional filtering
        alternative_names: List of alternative names to try (French/Arabic/English)
        timeout: Query timeout in seconds

    Returns:
        Dict with keys qid, name, description, entity_type, match_score,
        matched_name — plus viaf / isil / website / founded_date /
        latitude / longitude when present on the entity — if a validated
        match scored >= 70; None otherwise (also on timeout/network error).
    """

    # Get valid Wikidata entity types for this institution type
    valid_types = get_valid_types_for_institution(inst_type)

    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None

    # Build VALUES clause for SPARQL query - filter by institution type server-side
    type_values = " ".join([f"wd:{qid}" for qid in valid_types])

    # Build SPARQL query - fetch Algerian institutions matching the specific type.
    # Note: the OPTIONAL altLabel clause can multiply result rows per entity;
    # LIMIT 200 bounds the response size.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
    ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
    ?location ?locationLabel
    WHERE {{
    # Must be in Algeria
    ?item wdt:P17 wd:Q262 .

    # Must have an instance-of type matching our institution type
    ?item wdt:P31 ?type .

    # Filter to relevant types for this institution (server-side filtering)
    VALUES ?type {{ {type_values} }}

    # Add location (P131: located in administrative territorial entity)
    OPTIONAL {{ ?item wdt:P131 ?location . }}

    OPTIONAL {{ ?item wdt:P214 ?viaf . }}
    OPTIONAL {{ ?item wdt:P791 ?isil . }}
    OPTIONAL {{ ?item wdt:P856 ?website . }}
    OPTIONAL {{ ?item wdt:P625 ?coords . }}
    OPTIONAL {{ ?item wdt:P571 ?inception . }}
    OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ("fr", "ar", "en")) }}

    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "fr,ar,en" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting: stay polite to the public endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])

        if not bindings:
            return None

        # Fuzzy match against results WITH entity type AND geographic validation
        best_match = None
        best_score = 0
        matched_name = name  # Track which name produced the match

        # Prepare all names to try (primary + alternatives)
        names_to_try = [name]
        if alternative_names:
            names_to_try.extend(alternative_names)

        city_lower = city.lower() if city else None

        # Location-specific institution types require stricter geographic matching
        requires_city_match = inst_type in {'EDUCATION_PROVIDER', 'RESEARCH_CENTER'}

        # Try each name variation against every candidate binding
        for name_variant in names_to_try:
            name_lower = name_variant.lower()

            for binding in bindings:
                # CRITICAL: Validate entity type FIRST (re-check client-side even
                # though the VALUES clause filtered server-side)
                entity_type_uri = binding.get("type", {}).get("value", "")
                entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None

                # Skip if entity type doesn't match our institution type
                if entity_type_qid not in valid_types:
                    continue

                # GEOGRAPHIC VALIDATION: Check location match for location-specific institutions
                if city_lower and requires_city_match:
                    location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""

                    # Must have location data
                    if not location_label:
                        continue

                    # Location must match expected city (fuzzy match for spelling variations)
                    location_match = fuzz.ratio(city_lower, location_label)
                    if location_match < 70:  # Location mismatch - skip this result
                        continue

                # Now do fuzzy matching on validated entities only
                item_label = binding.get("itemLabel", {}).get("value", "").lower()

                # Calculate match score using multiple strategies
                label_score = fuzz.ratio(name_lower, item_label)
                partial_score = fuzz.partial_ratio(name_lower, item_label)
                token_score = fuzz.token_set_ratio(name_lower, item_label)

                # Best of the three fuzzy match strategies
                score = max(label_score, partial_score, token_score)

                if score > best_score:
                    best_score = score
                    best_match = binding
                    matched_name = name_variant  # Record which name variation matched

        # Require minimum 70% match (capturing multilingual variations)
        if best_score < 70:
            return None

        # Extract data from best match
        if not best_match:
            return None

        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        # Defensive: the item URI should always end in a Q-number
        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "entity_type": best_match.get("typeLabel", {}).get("value", ""),
            "match_score": best_score,
            "matched_name": matched_name  # Record which name variant matched
        }

        # Add optional fields if present
        viaf_data = best_match.get("viaf")
        if viaf_data and isinstance(viaf_data, dict):
            result["viaf"] = viaf_data.get("value", "")

        isil_data = best_match.get("isil")
        if isil_data and isinstance(isil_data, dict):
            result["isil"] = isil_data.get("value", "")

        website_data = best_match.get("website")
        if website_data and isinstance(website_data, dict):
            result["website"] = website_data.get("value", "")

        inception_data = best_match.get("inception")
        if inception_data and isinstance(inception_data, dict):
            # Keep only the date part of the ISO timestamp (before 'T')
            result["founded_date"] = inception_data.get("value", "").split("T")[0]

        coords_data = best_match.get("coords")
        if coords_data and isinstance(coords_data, dict):
            coords_str = coords_data.get("value", "")
            if coords_str and coords_str.startswith("Point("):
                # WKT literal: first value is treated as longitude, second as latitude
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)

        return result

    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # NOTE(review): broad catch keeps a batch run alive on any per-item
        # failure (malformed JSON, unexpected binding shape); errors are
        # reported but not re-raised.
        print(f" ❌ Error: {e}")
        return None
|
|
|
|
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Attach Wikidata-derived identifiers and an enrichment-history entry.

    Mutates *institution* in place: appends Wikidata/VIAF/ISIL identifier
    records (skipping any scheme the institution already carries) and adds a
    provenance entry describing how the match was made.
    """
    identifiers = institution.setdefault('identifiers', [])

    # Snapshot of schemes present BEFORE this enrichment pass
    known_schemes = {entry.get('identifier_scheme') for entry in identifiers}

    qid = wikidata_result['qid']
    if 'Wikidata' not in known_schemes:
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f"https://www.wikidata.org/wiki/{qid}"
        })

    viaf = wikidata_result.get('viaf')
    if viaf and 'VIAF' not in known_schemes:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': viaf,
            'identifier_url': f"https://viaf.org/viaf/{viaf}"
        })

    isil = wikidata_result.get('isil')
    if isil and 'ISIL' not in known_schemes:
        # ISIL codes have no universal resolver, so no identifier_url is stored.
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil,
        })

    # Make sure the provenance structure exists before appending history
    provenance = institution.setdefault('provenance', {})
    history = provenance.setdefault('enrichment_history', [])

    entity_type = wikidata_result.get('entity_type', 'unknown')
    matched_name = wikidata_result.get('matched_name', institution.get('name'))
    # Only annotate the note when an alternative name (not the primary) matched
    match_note = f" [matched: {matched_name}]" if matched_name != institution.get('name') else ""

    history.append({
        'enrichment_date': datetime.now(timezone.utc).isoformat(),
        'enrichment_type': 'WIKIDATA_IDENTIFIER',
        'enrichment_method': 'Wikidata SPARQL query with fuzzy matching and entity type validation',
        'match_score': wikidata_result.get('match_score', 0) / 100.0,  # Convert percentage to 0-1
        'verified': False,
        'enrichment_source': 'https://www.wikidata.org',
        'enrichment_notes': f"Matched to Wikidata entity {qid} [{entity_type}]{match_note}. Entity type and geographic location validated."
    })
|
|
|
|
def save_checkpoint(institutions: list, output_file: Path, stats: dict):
    """Write the (partially) enriched institution list back to YAML.

    A progress line is printed first so an interrupted run still shows how
    far the enrichment got before the last save.
    """
    total_covered = stats['already_enriched'] + stats['enriched']
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {total_covered}/{stats['total']})")

    with open(output_file, 'w', encoding='utf-8') as handle:
        yaml.dump(institutions, handle, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
def main():
    """Run the Algeria Wikidata enrichment pipeline end to end.

    Loads the institution YAML, searches Wikidata for every record lacking a
    Wikidata identifier (entity-type and geographic validation applied),
    checkpoints progress every few records, and prints summary statistics.

    Fixes over the previous version:
    - no longer shadows the builtin ``id`` in the identifier comprehensions;
    - the last loop iteration no longer triggers a checkpoint immediately
      followed by the identical final save (double write);
    - an empty/absent YAML document (``yaml.safe_load`` -> ``None``) is
      handled with an early return instead of crashing on ``len(None)`` and
      a ZeroDivisionError in the statistics section.
    """
    input_file = Path('data/instances/algeria/algerian_institutions.yaml')
    output_file = Path('data/instances/algeria/algerian_institutions.yaml')  # Overwrite in place

    print("Algeria Wikidata Enrichment (Multilingual + Validated)")
    print("=" * 70)
    print("Target: 19 Algerian institutions")
    print("Current coverage: 5/19 (26.3%)")
    print("Expected: 14-16/19 (75-85%) - similar to Tunisia's 76.5%")
    print()
    print("Features:")
    print(" ✅ Entity type validation (museums must be museums, not banks)")
    print(" ✅ Geographic validation (universities must be in correct city)")
    print(" ✅ Multilingual matching (French/Arabic/English)")
    print(" ✅ Fuzzy matching (70% threshold)")
    print(" ✅ Checkpoint saving every 5 institutions")
    print(" ✅ Prevents false positives (wrong institutions, wrong cities)")
    print("=" * 70)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # yaml.safe_load returns None for an empty document; bail out early so the
    # statistics section below cannot divide by zero.
    if not institutions:
        print("No institutions found - nothing to enrich.")
        return

    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'by_type': {}
    }

    # Process each institution
    checkpoint_interval = 5

    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        inst_type = inst.get('institution_type', 'MIXED')
        city = inst.get('locations', [{}])[0].get('city', '') if inst.get('locations') else ''

        # Track by institution type
        if inst_type not in stats['by_type']:
            stats['by_type'][inst_type] = {'searched': 0, 'found': 0, 'enriched': 0}

        # Check if already has Wikidata ('ident' avoids shadowing the builtin 'id')
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers
                        if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata with type validation
        print(f"[{i}/{len(institutions)}] Searching: {name} [{inst_type}] ({city})")
        stats['searched'] += 1
        stats['by_type'][inst_type]['searched'] += 1

        # Extract alternative names for multilingual matching
        alt_names = inst.get('alternative_names', [])
        print(f" Alternative names: {len(alt_names)} ({', '.join(alt_names[:2])}{'...' if len(alt_names) > 2 else ''})")

        result = search_wikidata_with_validation(name, inst_type, city, alternative_names=alt_names, timeout=60)

        if result:
            stats['found'] += 1
            stats['by_type'][inst_type]['found'] += 1
            match_score = result.get('match_score', 0)
            entity_type = result.get('entity_type', 'unknown')
            matched_name = result.get('matched_name', name)

            # Show which name variant was used for matching
            name_note = f" [matched: {matched_name}]" if matched_name != name else ""
            print(f" ✅ Found: {result['qid']} [{entity_type}] - {result.get('name', '')} (match: {match_score:.0f}%{name_note})")

            add_wikidata_to_institution(inst, result)
            stats['enriched'] += 1
            stats['by_type'][inst_type]['enriched'] += 1
            print(f" ✅ Enriched with validated match")
        else:
            stats['failed'] += 1
            print(f" ❌ Not found or type mismatch")

        # Checkpoint every N institutions. The final save after the loop covers
        # the tail, so the last iteration no longer saves twice in a row.
        if i % checkpoint_interval == 0:
            save_checkpoint(institutions, output_file, stats)

    # Final save
    save_checkpoint(institutions, output_file, stats)

    # Print statistics
    print("\n" + "=" * 70)
    print("ALGERIA WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 70)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found (validated): {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")
    print(f"\nFinal Wikidata coverage: {stats['already_enriched'] + stats['enriched']}/{stats['total']} ({100*(stats['already_enriched'] + stats['enriched'])/stats['total']:.1f}%)")

    print("\nBy institution type:")
    for inst_type, type_stats in sorted(stats['by_type'].items()):
        if type_stats['searched'] > 0:
            print(f" {inst_type}: searched {type_stats['searched']}, found {type_stats['found']}, enriched {type_stats['enriched']}")

    if stats['enriched'] > 0:
        improvement = stats['enriched']
        old_coverage = stats['already_enriched']
        new_coverage = stats['already_enriched'] + stats['enriched']
        print(f"\n✨ Added {improvement} new validated Wikidata identifiers!")
        print(f"✨ Coverage improved from {old_coverage}/{stats['total']} ({100*old_coverage/stats['total']:.1f}%) to {new_coverage}/{stats['total']} ({100*new_coverage/stats['total']:.1f}%)")
        print(f"✅ All matches validated against correct entity types and geographic locations")

    print("\n✅ Wikidata enrichment complete!")
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the enrichment pipeline when executed directly.
    main()
|