- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
343 lines · 12 KiB · Python · Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Georgian heritage institutions - Batch 2 (Alternative Names)
|
|
|
|
Strategy: Use alternative names (including Georgian names) for fuzzy matching
|
|
Target: 10 remaining institutions without Wikidata matches
|
|
Goal: Achieve 50%+ total coverage (7+ institutions)
|
|
|
|
Improvements over Batch 1:
|
|
1. Include alternative names in fuzzy matching
|
|
2. Try partial name matching (e.g., "Stalin Museum" → "Joseph Stalin Museum")
|
|
3. Lower fuzzy threshold to 0.80 for specific matches
|
|
4. Manual review of close matches (0.75-0.85)
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional, Dict, List
|
|
from datetime import datetime, timezone
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Return a lowercased, de-noised form of *name* for fuzzy comparison.

    Strips one leading and one trailing GLAM-related keyword (museum,
    library, archive, ... including transliterated forms), replaces
    punctuation with spaces, and collapses runs of whitespace.
    """
    lowered = name.lower()

    # Drop a single leading institution-type / qualifier word.
    stripped = re.sub(
        r'^(museum|muzeum|library|biblioteka|archive|arkivi|state|national|central)[\s\-]+',
        '',
        lowered,
    )
    # Drop a single trailing institution-type word or country qualifier.
    stripped = re.sub(
        r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian|of georgia)$',
        '',
        stripped,
    )

    # Punctuation becomes whitespace so adjacent tokens stay separated.
    depunctuated = re.sub(r'[^\w\s]', ' ', stripped)

    # Collapse internal whitespace and trim both ends.
    return ' '.join(depunctuated.split())
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Fuzzy similarity in [0, 1] between two institution names.

    Both inputs are run through normalize_name() first, then compared
    with difflib's SequenceMatcher ratio.
    """
    left = normalize_name(name1)
    right = normalize_name(name2)
    return SequenceMatcher(None, left, right).ratio()
|
|
|
|
|
|
def query_georgian_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """Query Wikidata for GLAM institutions in Georgia.

    Runs one SPARQL query for museum/library/archive-like types located
    in Georgia (wd:Q230) and folds the row-per-binding result set into
    one record per Wikidata QID.

    Args:
        sparql: A SPARQLWrapper-like endpoint with JSON return format
            already configured (must expose setQuery/query/convert).

    Returns:
        Mapping of QID -> record with keys: qid, name, description,
        type, alternative_names, identifiers, and optionally
        founding_date / latitude / longitude. Empty dict on query error.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?altLabel ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      ?item wdt:P17 wd:Q230 .

      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q33506 wd:Q1007870 wd:Q28564
        wd:Q11396180 wd:Q207694 wd:Q2772772 wd:Q768717 wd:Q7406919
      }
      ?item wdt:P31 ?type .

      OPTIONAL { ?item wdt:P791 ?isil . }
      OPTIONAL { ?item wdt:P214 ?viaf . }
      OPTIONAL { ?item wdt:P625 ?coords . }
      OPTIONAL { ?item wdt:P856 ?website . }
      OPTIONAL { ?item wdt:P571 ?inception . }
      OPTIONAL { ?item skos:altLabel ?altLabel . FILTER(LANG(?altLabel) = "en") }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """

    sparql.setQuery(query)

    try:
        raw_results = sparql.query().convert()
        bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

        results: Dict[str, Dict[str, Any]] = {}
        for binding in bindings:
            item_uri = binding.get("item", {}).get("value", "")
            qid = item_uri.split("/")[-1] if item_uri else None

            # Skip rows whose URI tail is not a Q-identifier.
            if not qid or not qid.startswith("Q"):
                continue

            if qid not in results:
                results[qid] = {
                    "qid": qid,
                    "name": binding.get("itemLabel", {}).get("value", ""),
                    "description": binding.get("itemDescription", {}).get("value", ""),
                    "type": binding.get("typeLabel", {}).get("value", ""),
                    "alternative_names": [],
                    "identifiers": {}
                }

            # Each OPTIONAL clause multiplies result rows, so the same
            # altLabel can appear in several bindings for one item;
            # de-duplicate while preserving first-seen order.
            if "altLabel" in binding:
                alt = binding["altLabel"]["value"]
                if alt not in results[qid]["alternative_names"]:
                    results[qid]["alternative_names"].append(alt)

            if "isil" in binding:
                results[qid]["identifiers"]["ISIL"] = binding["isil"]["value"]

            if "viaf" in binding:
                results[qid]["identifiers"]["VIAF"] = binding["viaf"]["value"]

            if "website" in binding:
                results[qid]["identifiers"]["Website"] = binding["website"]["value"]

            if "inception" in binding:
                # Keep only the date part of the xsd:dateTime literal.
                results[qid]["founding_date"] = binding["inception"]["value"].split("T")[0]

            if "coords" in binding:
                coords_str = binding["coords"]["value"]
                # WKT point literal, e.g. "Point(44.79 41.71)" = (lon lat).
                if coords_str.startswith("Point("):
                    lon, lat = coords_str[6:-1].split()
                    results[qid]["latitude"] = float(lat)
                    results[qid]["longitude"] = float(lon)

        return results

    except Exception as e:
        # Endpoint/network failures are reported but non-fatal: callers
        # treat an empty dict as "no Wikidata data available".
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
|
|
|
|
|
|
def find_best_match(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    threshold: float = 0.80
) -> Optional[tuple[Dict[str, Any], float, str]]:
    """
    Find the best-scoring Wikidata candidate for an institution.

    Every name variant of the institution (primary + alternatives) is
    scored against every name variant of every Wikidata entry; the
    single highest similarity wins.

    Returns: (wikidata_entry, score, "local name → wikidata name") when
    the best score reaches *threshold*, otherwise None.
    """
    local_variants = [inst.get('name', '')]
    if 'alternative_names' in inst:
        local_variants.extend(inst['alternative_names'])

    winner: Optional[Dict[str, Any]] = None
    winner_score = 0.0
    winner_pair = ""

    for local_name in local_variants:
        for wd_entry in wikidata_results.values():
            # Candidate's primary label plus any alternative labels.
            wd_variants = [wd_entry.get('name', '')]
            if 'alternative_names' in wd_entry:
                wd_variants.extend(wd_entry['alternative_names'])

            for wd_name in wd_variants:
                current = similarity_score(local_name, wd_name)
                if current > winner_score:
                    winner_score = current
                    winner = wd_entry
                    winner_pair = f"{local_name} → {wd_name}"

    if winner is not None and winner_score >= threshold:
        return (winner, winner_score, winner_pair)

    return None
|
|
|
|
|
|
def main():
    """Batch-2 enrichment entry point.

    Loads the Batch 1 YAML, queries Wikidata once, fuzzy-matches the
    still-unmatched institutions using alternative names, writes the
    merged result to a new YAML file, and prints a coverage report.
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 2")
    print("=" * 80)
    print()
    print("Strategy: Alternative name matching with lower threshold (0.80)")
    print("Target: 10 institutions without Wikidata matches")
    print()

    # Paths: input is the Batch 1 output; Batch 2 writes a new file so
    # the previous batch's data is never overwritten.
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch1.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch2.yaml"

    # Load previous batch results
    print("📂 Loading Batch 1 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # Filter for institutions without Wikidata
    # NOTE(review): assumes inst['identifiers'] is a list of dicts with an
    # 'identifier_scheme' key — confirm against the Batch 1 file schema.
    needs_enrichment = []
    already_enriched = 0

    for inst in institutions:
        has_wikidata = False
        if 'identifiers' in inst:
            for identifier in inst['identifiers']:
                if identifier.get('identifier_scheme') == 'Wikidata':
                    has_wikidata = True
                    already_enriched += 1
                    break

        if not has_wikidata:
            needs_enrichment.append(inst)

    print(f" ✅ Already enriched: {already_enriched} institutions")
    print(f" ⏳ Need enrichment: {len(needs_enrichment)} institutions")
    print()

    # Query Wikidata (single bulk query; matching happens locally below)
    print("🌐 Querying Wikidata with alternative names support...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()

    # Fuzzy matching with alternative names
    print("🔍 Matching with alternative names (threshold: 0.80)...")
    print()

    new_matches = 0

    for i, inst in enumerate(needs_enrichment, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')

        print(f"{i:2d}. {inst_name} ({inst_type})")

        # Check for alternative names
        alt_names = inst.get('alternative_names', [])
        if alt_names:
            print(f" Alternative names: {len(alt_names)}")

        # Try matching (uses the default 0.80 threshold of find_best_match)
        match_result = find_best_match(inst, wikidata_results)

        if match_result:
            wd_data, score, matched_name = match_result
            qid = wd_data.get('qid', '')

            print(f" ✅ Matched: {wd_data.get('name')} ({qid})")
            print(f" Match: {matched_name}")
            print(f" Score: {score:.2f}")

            # Add Wikidata identifier (mutates inst in place; the same
            # objects are serialized below via the `institutions` list)
            if 'identifiers' not in inst:
                inst['identifiers'] = []

            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })

            # Add other identifiers (Website is the only scheme whose
            # value doubles as the identifier URL)
            for scheme, value in wd_data.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })

            # Add coordinates to the first location entry, creating a
            # stub Georgian location when none exists yet
            if 'latitude' in wd_data and 'longitude' in wd_data:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]

                inst['locations'][0]['latitude'] = wd_data['latitude']
                inst['locations'][0]['longitude'] = wd_data['longitude']
                print(f" 📍 Coordinates: {wd_data['latitude']:.4f}, {wd_data['longitude']:.4f}")

            # Add founding date
            if 'founding_date' in wd_data:
                inst['founding_date'] = wd_data['founding_date']
                print(f" 📅 Founded: {wd_data['founding_date']}")

            # Update provenance: append an (unverified) enrichment event
            # with a UTC timestamp and the fuzzy-match score
            if 'provenance' not in inst:
                inst['provenance'] = {}

            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL + alternative name fuzzy matching',
                'match_score': score,
                'verified': False
            })

            new_matches += 1

        else:
            print(f" ⚠️ No match found (tried {1 + len(alt_names)} name variants)")

        print()

    # Save results (full institution list, including unmatched entries)
    print("💾 Saving Batch 2 results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print(f" ✅ Saved to: {output_file}")
    print()

    # Report
    # NOTE(review): assumes a non-empty institutions list — the coverage
    # percentage below divides by len(institutions).
    total_enriched = already_enriched + new_matches
    total_institutions = len(institutions)

    print("=" * 80)
    print("📊 BATCH 2 RESULTS")
    print("=" * 80)
    print()
    print(f"Batch 1 enriched: {already_enriched}")
    print(f"Batch 2 new matches: {new_matches}")
    print(f"Total enriched: {total_enriched} ({total_enriched/total_institutions*100:.1f}%)")
    print(f"Still need enrichment: {total_institutions - total_enriched}")
    print()

    # 7 institutions corresponds to the 50%+ coverage goal stated in the
    # module docstring.
    if total_enriched >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - total_enriched} more matches needed")

    print()
|
|
|
|
|
|
# Script entry point: run the Batch 2 enrichment pipeline.
if __name__ == "__main__":
    main()
|