glam/scripts/enrich_georgia_batch2_alternative_names.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

343 lines
12 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Enrich Georgian heritage institutions - Batch 2 (Alternative Names)
Strategy: Use alternative names (including Georgian names) for fuzzy matching
Target: 10 remaining institutions without Wikidata matches
Goal: Achieve 50%+ total coverage (7+ institutions)
Improvements over Batch 1:
1. Include alternative names in fuzzy matching
2. Try partial name matching (e.g., "Stalin Museum" → "Joseph Stalin Museum")
3. Lower fuzzy threshold to 0.80 for specific matches
4. Manual review of close matches (0.75-0.85)
"""
import sys
from pathlib import Path
from typing import Any, Optional, Dict, List
from datetime import datetime, timezone
import yaml
from difflib import SequenceMatcher
import re
# Make the project's src/ directory importable when running this script directly.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
def normalize_name(name: str) -> str:
    """Lower-case a name and strip common GLAM affixes and punctuation for fuzzy matching."""
    lowered = name.lower()
    # Drop a leading institution-type/qualifier word ("Museum ...", "State ...", etc.).
    lowered = re.sub(r'^(museum|muzeum|library|biblioteka|archive|arkivi|state|national|central)[\s\-]+', '', lowered)
    # Drop a trailing type/country word ("... Museum", "... of Georgia", etc.).
    lowered = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian|of georgia)$', '', lowered)
    # Replace punctuation with spaces, then collapse whitespace runs.
    lowered = re.sub(r'[^\w\s]', ' ', lowered)
    return ' '.join(lowered.split())
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0-1 similarity ratio between the normalized forms of two names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
def query_georgian_institutions(sparql: "SPARQLWrapper") -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for GLAM institutions located in Georgia (wd:Q230).

    The SPARQL OPTIONAL clauses make the endpoint return one binding per
    combination of optional values, so the same item (QID) usually appears
    in several rows. Rows are merged into a single entry per QID; alternative
    labels are de-duplicated (the previous version appended the same altLabel
    once per extra row, inflating `alternative_names`).

    Args:
        sparql: A SPARQLWrapper-like object (setQuery/query().convert())
            already configured for JSON results against the Wikidata endpoint.

    Returns:
        Mapping of QID -> institution dict with keys: qid, name, description,
        type, alternative_names, identifiers, and optionally founding_date,
        latitude, longitude. Empty dict if the query fails.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?altLabel ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      ?item wdt:P17 wd:Q230 .
      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q33506 wd:Q1007870 wd:Q28564
        wd:Q11396180 wd:Q207694 wd:Q2772772 wd:Q768717 wd:Q7406919
      }
      ?item wdt:P31 ?type .
      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """
    sparql.setQuery(query)
    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []
        results: Dict[str, Dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None
            # Skip malformed rows whose item URI does not end in a QID.
            if not qid or not qid.startswith("Q"):
                continue
            if qid not in results:
                results[qid] = {
                    "qid": qid,
                    "name": binding.get("itemLabel", {}).get("value", ""),
                    "description": binding.get("itemDescription", {}).get("value", ""),
                    "type": binding.get("typeLabel", {}).get("value", ""),
                    "alternative_names": [],
                    "identifiers": {}
                }
            entry = results[qid]
            # Merge alternative labels, skipping duplicates caused by the
            # row-per-optional-combination result shape.
            if "altLabel" in binding:
                alt_name = binding["altLabel"]["value"]
                if alt_name not in entry["alternative_names"]:
                    entry["alternative_names"].append(alt_name)
            if "isil" in binding:
                entry["identifiers"]["ISIL"] = binding["isil"]["value"]
            if "viaf" in binding:
                entry["identifiers"]["VIAF"] = binding["viaf"]["value"]
            if "website" in binding:
                entry["identifiers"]["Website"] = binding["website"]["value"]
            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                entry["founding_date"] = binding["inception"]["value"].split("T")[0]
            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT literal: "Point(lon lat)" — note longitude comes first.
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    entry["latitude"] = float(lat)
                    entry["longitude"] = float(lon)
        return results
    except Exception as e:
        # Best-effort script: report the failure and let the caller proceed
        # with no matches rather than crashing the whole batch run.
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
def find_best_match(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Optional[tuple[Dict[str, Any], float, str]]:
    """
    Find the best Wikidata match for an institution, comparing its primary
    and alternative names against each candidate's primary and alternative
    names with fuzzy similarity.

    Args:
        inst: Institution record with 'name' and optional 'alternative_names'.
        wikidata_results: Mapping of QID -> Wikidata entry as produced by
            query_georgian_institutions().
        threshold: Minimum similarity score (0-1) required to accept a match.

    Returns:
        (wikidata_entry, score, matched_name) for the highest-scoring name
        pair at or above the threshold, otherwise None. matched_name records
        which local name matched which Wikidata name ("local → wikidata").
    """
    inst_names = [inst.get('name', '')]
    if 'alternative_names' in inst:
        inst_names.extend(inst['alternative_names'])
    best_match = None
    best_score = 0.0
    matched_name = ""
    for inst_name in inst_names:
        for qid, wd_data in wikidata_results.items():
            # Candidate name pool: primary label plus alternative labels.
            wd_names = [wd_data.get('name', '')]
            if 'alternative_names' in wd_data:
                wd_names.extend(wd_data['alternative_names'])
            for wd_name in wd_names:
                score = similarity_score(inst_name, wd_name)
                if score > best_score:
                    best_score = score
                    best_match = wd_data
                    # Bug fix: join the two names with " → " — the separator
                    # was missing, producing a garbled "AB" concatenation in
                    # the match report.
                    matched_name = f"{inst_name} → {wd_name}"
    if best_score >= threshold and best_match:
        return (best_match, best_score, matched_name)
    return None
def main():
    """Run Batch 2 enrichment end-to-end.

    Loads the Batch 1 YAML, fuzzy-matches the still-unmatched institutions
    against Wikidata (alternative names included, threshold 0.80), merges
    identifiers/coordinates/founding dates plus provenance into the records,
    saves the Batch 2 YAML, and prints a coverage report.
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 2")
    print("=" * 80)
    print()
    print("Strategy: Alternative name matching with lower threshold (0.80)")
    print("Target: 10 institutions without Wikidata matches")
    print()
    # Paths
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch1.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch2.yaml"
    # Load previous batch results
    print("📂 Loading Batch 1 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # Filter for institutions without a Wikidata identifier. The filtered
    # list holds references to the same dicts as `institutions`, so in-place
    # enrichment below is reflected in the full list that gets saved.
    needs_enrichment = []
    already_enriched = 0
    for inst in institutions:
        has_wikidata = False
        if 'identifiers' in inst:
            for identifier in inst['identifiers']:
                if identifier.get('identifier_scheme') == 'Wikidata':
                    has_wikidata = True
                    already_enriched += 1
                    break
        if not has_wikidata:
            needs_enrichment.append(inst)
    print(f" ✅ Already enriched: {already_enriched} institutions")
    print(f" ⏳ Need enrichment: {len(needs_enrichment)} institutions")
    print()
    # Query Wikidata once up front; matching below is purely local.
    print("🌐 Querying Wikidata with alternative names support...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()
    # Fuzzy matching with alternative names
    print("🔍 Matching with alternative names (threshold: 0.80)...")
    print()
    new_matches = 0
    for i, inst in enumerate(needs_enrichment, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')
        print(f"{i:2d}. {inst_name} ({inst_type})")
        # Check for alternative names
        alt_names = inst.get('alternative_names', [])
        if alt_names:
            print(f" Alternative names: {len(alt_names)}")
        # Try matching (uses the default 0.80 threshold)
        match_result = find_best_match(inst, wikidata_results)
        if match_result:
            wd_data, score, matched_name = match_result
            qid = wd_data.get('qid', '')
            print(f" ✅ Matched: {wd_data.get('name')} ({qid})")
            print(f" Match: {matched_name}")
            print(f" Score: {score:.2f}")
            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []
            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })
            # Add other identifiers (Website gets a clickable URL as well)
            for scheme, value in wd_data.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })
            # Add coordinates to the first location, creating a minimal
            # GE-country location if none exists.
            if 'latitude' in wd_data and 'longitude' in wd_data:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]
                inst['locations'][0]['latitude'] = wd_data['latitude']
                inst['locations'][0]['longitude'] = wd_data['longitude']
                print(f" 📍 Coordinates: {wd_data['latitude']:.4f}, {wd_data['longitude']:.4f}")
            # Add founding date
            if 'founding_date' in wd_data:
                inst['founding_date'] = wd_data['founding_date']
                print(f" 📅 Founded: {wd_data['founding_date']}")
            # Record provenance of this enrichment; matches are flagged
            # unverified pending manual review.
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL + alternative name fuzzy matching',
                'match_score': score,
                'verified': False
            })
            new_matches += 1
        else:
            print(f" ⚠️ No match found (tried {1 + len(alt_names)} name variants)")
        print()
    # Save the full institution list (enriched entries were mutated in place).
    print("💾 Saving Batch 2 results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()
    # Report
    total_enriched = already_enriched + new_matches
    total_institutions = len(institutions)
    print("=" * 80)
    print("📊 BATCH 2 RESULTS")
    print("=" * 80)
    print()
    print(f"Batch 1 enriched: {already_enriched}")
    print(f"Batch 2 new matches: {new_matches}")
    print(f"Total enriched: {total_enriched} ({total_enriched/total_institutions*100:.1f}%)")
    print(f"Still need enrichment: {total_institutions - total_enriched}")
    print()
    # 7 matched institutions corresponds to the 50%+ coverage goal.
    if total_enriched >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - total_enriched} more matches needed")
    print()
if __name__ == "__main__":
    main()