glam/scripts/enrich_georgia_batch1.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

440 lines
16 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Phase 1 Proof of Concept
Target: 14 Georgian institutions with 0% Wikidata coverage
Goal: Achieve 50%+ Wikidata coverage (7+ institutions matched)
Strategy:
1. Query Wikidata for museums/libraries/archives in Georgia (Q230)
2. Fuzzy match institution names with 0.85+ threshold
3. Verify type compatibility (museum, library, archive)
4. Enrich with Wikidata Q-numbers, VIAF, coordinates, websites
5. Geocode remaining institutions using Nominatim
CRITICAL: This follows the Chilean enrichment success pattern (78.9% coverage)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
import time
import yaml
from difflib import SequenceMatcher
import re
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Canonicalize an institution name for fuzzy comparison.

    Lower-cases the name, strips GLAM-type words (English and Georgian
    transliterations) from the start and end, drops generic organizational
    words, replaces punctuation with spaces, and collapses whitespace.
    """
    text = name.lower()
    # Leading GLAM-type word followed by whitespace/hyphen.
    text = re.sub(r'^(museum|muzeum|museu|library|biblioteka|archive|arkivi)[\s\-]+', '', text)
    # Trailing GLAM-type or country word preceded by whitespace/hyphen.
    text = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian)$', '', text)
    # Generic organizational forms anywhere in the name.
    text = re.sub(r'\b(foundation|institute|state|national|central)\b', '', text)
    # Punctuation becomes whitespace, then runs of whitespace collapse to one space.
    text = re.sub(r'[^\w\s]', ' ', text)
    return ' '.join(text.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between two institution names.

    Both names are run through normalize_name() first so that type words,
    punctuation, and casing do not affect the comparison.
    """
    return SequenceMatcher(
        None,
        normalize_name(name1),
        normalize_name(name2),
    ).ratio()
def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Check whether a local institution and a Wikidata candidate could be the
    same kind of GLAM organization (museum / archive / library).

    A local record that is recognizably one of the three kinds (via its
    declared type code or a keyword in its name/type text) is only
    compatible with a candidate whose label/description mentions the same
    kind. Prevents mismatches like museum -> archive.
    """
    keyword_sets = {
        'MUSEUM': ['museum', 'muzeum', 'museu'],
        'ARCHIVE': ['archive', 'arkivi', 'archiv'],
        'LIBRARY': ['library', 'biblioteka', 'bibliothek'],
    }
    local_text = (inst_name + ' ' + inst_type).lower()
    candidate_text = (wd_name + ' ' + wd_desc).lower()
    for type_code, keywords in keyword_sets.items():
        local_has_type = inst_type == type_code or any(kw in local_text for kw in keywords)
        candidate_has_type = any(kw in candidate_text for kw in keywords)
        # A typed local institution requires the same type on the candidate.
        if local_has_type and not candidate_has_type:
            return False
    return True
def query_georgian_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions located in Georgia (wd:Q230).

    Parameters:
        sparql: a SPARQLWrapper pointed at the Wikidata endpoint with JSON
                return format already configured.

    Returns:
        Dict keyed by Q-number with name, description, type, identifiers
        (ISIL/VIAF/Website), optional founding_date and coordinates.
        Returns an empty dict if the query fails.

    NOTE: the SPARQL result has one row per (item, P31 type, optional
    value) combination, so the same QID appears in several bindings.
    Rows are MERGED per QID (first value wins) instead of overwritten —
    the previous implementation replaced the whole record on every row,
    which could drop ISIL/VIAF/coordinates captured on an earlier row.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Institution is in Georgia
      ?item wdt:P17 wd:Q230 .
      # Institution is a GLAM type
      VALUES ?type {
        wd:Q7075      # library
        wd:Q166118    # archive
        wd:Q33506     # museum
        wd:Q1007870   # art gallery
        wd:Q28564     # public library
        wd:Q11396180  # academic library
        wd:Q207694    # art museum
        wd:Q2772772   # history museum
        wd:Q768717    # ethnographic museum
        wd:Q7406919   # state museum
      }
      ?item wdt:P31 ?type .
      # Optional enrichment data
      OPTIONAL { ?item wdt:P791 ?isil . }      # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf . }      # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coords . }    # Coordinates
      OPTIONAL { ?item wdt:P856 ?website . }   # Official website
      OPTIONAL { ?item wdt:P571 ?inception . } # Founding date
      # Get labels (English, Georgian, Russian)
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
    except Exception as e:
        # Network/endpoint failure: report and degrade gracefully.
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
    bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
    results: Dict[str, Dict[str, Any]] = {}
    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            continue
        # Merge repeated rows for the same item; keep the first-seen labels.
        result = results.setdefault(qid, {
            "qid": qid,
            "name": binding.get("itemLabel", {}).get("value", ""),
            "description": binding.get("itemDescription", {}).get("value", ""),
            "type": binding.get("typeLabel", {}).get("value", ""),
            "identifiers": {}
        })
        # First-seen value wins for each optional field.
        if "isil" in binding:
            result["identifiers"].setdefault("ISIL", binding["isil"]["value"])
        if "viaf" in binding:
            result["identifiers"].setdefault("VIAF", binding["viaf"]["value"])
        if "website" in binding:
            result["identifiers"].setdefault("Website", binding["website"]["value"])
        if "inception" in binding and "founding_date" not in result:
            # xsd:dateTime like "1852-01-01T00:00:00Z" -> keep the date part.
            result["founding_date"] = binding["inception"]["value"].split("T")[0]
        if "coords" in binding and "latitude" not in result:
            coords_str = binding["coords"]["value"]
            # WKT literal "Point(lon lat)" — note longitude comes first.
            if coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)
    return results
def geocode_institution(name: str, country: str = "Georgia") -> Optional[Dict[str, Any]]:
    """
    Geocode an institution using the Nominatim API.

    Parameters:
        name: institution name to look up.
        country: country name appended to the search query (default
                 "Georgia"); results are also restricted to country
                 code "ge".

    Returns:
        Dict with latitude, longitude, and display_name of the top hit,
        or None when nothing is found or the request fails.

    Always sleeps ~1.1s before returning so repeated calls respect
    Nominatim's 1 request/second usage policy. (Previously the sleep sat
    after the successful return, so the delay was skipped exactly when a
    hit was found.)
    """
    try:
        import requests
        # Try with institution name + country
        search_query = f"{name}, {country}"
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={
                "q": search_query,
                "format": "json",
                "limit": 1,
                "countrycodes": "ge"  # Georgia ISO code
            },
            headers={"User-Agent": "GLAM-Dataset-Enrichment/1.0"},
            timeout=15  # avoid hanging the whole batch on a stalled connection
        )
        if response.status_code == 200:
            results = response.json()
            if results:
                location = results[0]
                return {
                    "latitude": float(location["lat"]),
                    "longitude": float(location["lon"]),
                    "display_name": location.get("display_name", "")
                }
    except Exception as e:
        # Best-effort fallback: log and return None rather than abort the batch.
        print(f" ⚠️ Geocoding error: {e}")
    finally:
        # Rate limit: 1 request per second, enforced on EVERY exit path.
        time.sleep(1.1)
    return None
def load_georgia_institutions(yaml_path: Path) -> List[Dict[str, Any]]:
    """
    Load Georgian institutions from the unified YAML dataset.

    Parameters:
        yaml_path: path to a YAML file containing a list of institution
                   records, each with an optional 'locations' list.

    Returns:
        Records whose first location has country == 'GE'. Records with a
        missing or empty 'locations' list are skipped. (The previous
        version indexed locations[0] unconditionally and raised
        IndexError on `locations: []`; it also crashed on an empty file
        where safe_load returns None.)
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []
    georgian = []
    for inst in data:
        # `or [{}]` covers both a missing key and an explicit empty list.
        locations = inst.get('locations') or [{}]
        if locations[0].get('country') == 'GE':
            georgian.append(inst)
    return georgian
def enrich_institution(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    fuzzy_threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Find the best Wikidata record for an institution.

    An already-present Wikidata identifier on the record wins outright.
    Otherwise the candidates are fuzzy-matched by name — restricted to
    type-compatible ones — and the highest-scoring hit is returned if its
    score reaches the threshold (returned as a copy with an extra
    'match_score' key). Returns None when nothing qualifies.
    """
    inst_name = inst.get('name', '')
    inst_type = inst.get('institution_type', '')

    # 1) Exact match through an existing Wikidata identifier.
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') != 'Wikidata':
            continue
        known_qid = identifier.get('identifier_value', '')
        if known_qid in wikidata_results:
            return wikidata_results[known_qid]

    # 2) Fuzzy name matching over type-compatible candidates.
    top_score, top_candidate = 0.0, None
    for candidate in wikidata_results.values():
        candidate_name = candidate.get('name', '')
        if not institution_type_compatible(
            inst_type, inst_name, candidate_name, candidate.get('description', '')
        ):
            continue
        score = similarity_score(inst_name, candidate_name)
        if score > top_score:
            top_score, top_candidate = score, candidate

    # 3) Accept only above-threshold matches; annotate with the score.
    if top_candidate is not None and top_score >= fuzzy_threshold:
        match = dict(top_candidate)
        match["match_score"] = top_score
        return match
    return None
def main():
    """
    Run the Georgia batch-1 enrichment pipeline end to end:

    1. Load Georgian institutions from the unified YAML dataset.
    2. Query Wikidata for GLAM institutions located in Georgia.
    3. Fuzzy-match each record against Wikidata and enrich it
       (identifiers, coordinates, founding date, description,
       provenance); fall back to Nominatim geocoding for misses.
    4. Save the enriched batch and report coverage statistics.
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print()
    print("Target: 14 institutions with 0% Wikidata coverage")
    print("Goal: Achieve 50%+ coverage (7+ institutions)")
    print()
    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = data_dir / "all" / "globalglam-20251111.yaml"
    output_file = data_dir / "georgia" / "georgian_institutions_enriched_batch1.yaml"
    output_file.parent.mkdir(parents=True, exist_ok=True)
    # Step 1: Load Georgian institutions
    print("📂 Loading Georgian institutions...")
    institutions = load_georgia_institutions(input_file)
    print(f" ✅ Loaded {len(institutions)} Georgian institutions")
    print()
    if not institutions:
        # Guard: the statistics below divide by len(institutions);
        # bail out early instead of raising ZeroDivisionError.
        print("⚠️ No Georgian institutions found in the dataset - nothing to enrich.")
        return
    # Step 2: Query Wikidata
    print("🌐 Querying Wikidata for Georgian GLAM institutions...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()
    # Step 3: Fuzzy matching and enrichment
    print("🔍 Matching institutions with Wikidata (threshold: 0.85)...")
    print()
    enriched_count = 0
    geocoded_count = 0
    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')
        print(f"{i:2d}. {inst_name} ({inst_type})")
        # Try Wikidata enrichment
        enrichment = enrich_institution(inst, wikidata_results)
        if enrichment:
            match_score = enrichment.get('match_score', 0.0)
            qid = enrichment.get('qid', '')
            print(f" ✅ Matched: {enrichment.get('name')} ({qid}) - Score: {match_score:.2f}")
            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []
            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })
            # Add other identifiers (Website carries its own URL)
            for scheme, value in enrichment.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })
            # Add coordinates if available (first location, creating it if needed)
            if 'latitude' in enrichment and 'longitude' in enrichment:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = enrichment['latitude']
                inst['locations'][0]['longitude'] = enrichment['longitude']
                print(f" 📍 Coordinates: {enrichment['latitude']:.4f}, {enrichment['longitude']:.4f}")
            # Add founding date if available
            if 'founding_date' in enrichment:
                inst['founding_date'] = enrichment['founding_date']
                print(f" 📅 Founded: {enrichment['founding_date']}")
            # Add description from Wikidata — never overwrite an existing one
            if enrichment.get('description'):
                if not inst.get('description'):
                    inst['description'] = enrichment['description']
                    print(f" 📝 Description: {enrichment['description'][:60]}...")
            # Update provenance with an unverified enrichment event
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
                'match_score': match_score,
                'verified': False
            })
            enriched_count += 1
        else:
            print(f" ⚠️ No Wikidata match found")
            # Try geocoding as fallback (coordinates only, no identifiers)
            geocode_result = geocode_institution(inst_name)
            if geocode_result:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = geocode_result['latitude']
                inst['locations'][0]['longitude'] = geocode_result['longitude']
                print(f" 📍 Geocoded: {geocode_result['latitude']:.4f}, {geocode_result['longitude']:.4f}")
                geocoded_count += 1
        print()
    # Step 4: Save enriched data
    print("💾 Saving enriched dataset...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()
    # Step 5: Report results
    print("=" * 80)
    print("📊 ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata matches: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Geocoded (fallback): {geocoded_count}")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()
    if enriched_count >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - enriched_count} more matches needed for 50% coverage")
    print()
    print("Next steps:")
    print("1. Review matches manually (verify institution identities)")
    print("2. Update unified dataset with enriched Georgian records")
    print("3. Proceed with other critical countries (GB, BE, US, LU)")
    print()


if __name__ == "__main__":
    main()