- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
440 lines
16 KiB
Python
Executable file
440 lines
16 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Georgian heritage institutions - Phase 1 Proof of Concept
|
|
|
|
Target: 14 Georgian institutions with 0% Wikidata coverage
|
|
Goal: Achieve 50%+ Wikidata coverage (7+ institutions matched)
|
|
|
|
Strategy:
|
|
1. Query Wikidata for museums/libraries/archives in Georgia (Q230)
|
|
2. Fuzzy match institution names with 0.85+ threshold
|
|
3. Verify type compatibility (museum, library, archive)
|
|
4. Enrich with Wikidata Q-numbers, VIAF, coordinates, websites
|
|
5. Geocode remaining institutions using Nominatim
|
|
|
|
CRITICAL: This follows the Chilean enrichment success pattern (78.9% coverage)
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any, Optional, Dict, List
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import yaml
|
|
from difflib import SequenceMatcher
|
|
import re
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
|
|
|
|
def normalize_name(name: str) -> str:
    """Lower-case an institution name and strip boilerplate for fuzzy matching.

    Removes leading/trailing GLAM-type words (English + Georgian
    transliterations), generic organizational words, punctuation, and
    collapses whitespace.
    """
    cleaned = name.lower()

    # Leading/trailing institution-type words and country suffixes.
    cleaned = re.sub(r'^(museum|muzeum|museu|library|biblioteka|archive|arkivi)[\s\-]+', '', cleaned)
    cleaned = re.sub(r'[\s\-]+(museum|muzeum|library|biblioteka|archive|arkivi|georgia|georgian)$', '', cleaned)

    # Generic organizational-form words carry no matching signal.
    cleaned = re.sub(r'\b(foundation|institute|state|national|central)\b', '', cleaned)

    # Punctuation becomes whitespace, then whitespace runs collapse to one space.
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
    return ' '.join(cleaned.split())
|
|
|
|
|
|
def similarity_score(name1: str, name2: str) -> float:
    """Return a 0–1 similarity ratio between two normalized institution names."""
    return SequenceMatcher(None, normalize_name(name1), normalize_name(name2)).ratio()
|
|
|
|
|
|
def institution_type_compatible(inst_type: str, inst_name: str, wd_name: str, wd_desc: str) -> bool:
    """
    Return True when the local record and the Wikidata candidate look like
    the same kind of institution (museum / archive / library).

    Prevents mismatches like museum → archive: if the local side signals a
    category (by keyword or explicit type code) that the Wikidata side does
    not, the pair is rejected.
    """
    categories = {
        'MUSEUM': ['museum', 'muzeum', 'museu'],
        'ARCHIVE': ['archive', 'arkivi', 'archiv'],
        'LIBRARY': ['library', 'biblioteka', 'bibliothek'],
    }

    local_text = (inst_name + ' ' + inst_type).lower()
    remote_text = (wd_name + ' ' + wd_desc).lower()

    for type_code, keywords in categories.items():
        local_has = inst_type == type_code or any(kw in local_text for kw in keywords)
        remote_has = any(kw in remote_text for kw in keywords)
        # A category asserted locally must also appear on the Wikidata side.
        if local_has and not remote_has:
            return False

    return True
|
|
|
|
|
|
def query_georgian_institutions(sparql: SPARQLWrapper) -> Dict[str, Dict[str, Any]]:
    """
    Query Wikidata for GLAM institutions in Georgia (Q230).

    Returns: dict keyed by QID with institution data (label, description,
    type, plus optional ISIL/VIAF/website identifiers, coordinates and
    founding date). Returns an empty dict on any query/parsing failure.
    """
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?typeLabel ?isil ?viaf ?coords ?website ?inception
    WHERE {
      # Institution is in Georgia
      ?item wdt:P17 wd:Q230 .

      # Institution is a GLAM type
      VALUES ?type {
        wd:Q7075       # library
        wd:Q166118     # archive
        wd:Q33506      # museum
        wd:Q1007870    # art gallery
        wd:Q28564      # public library
        wd:Q11396180   # academic library
        wd:Q207694     # art museum
        wd:Q2772772    # history museum
        wd:Q768717     # ethnographic museum
        wd:Q7406919    # state museum
      }
      ?item wdt:P31 ?type .

      # Optional enrichment data
      OPTIONAL { ?item wdt:P791 ?isil . }       # ISIL code
      OPTIONAL { ?item wdt:P214 ?viaf . }       # VIAF ID
      OPTIONAL { ?item wdt:P625 ?coords . }     # Coordinates
      OPTIONAL { ?item wdt:P856 ?website . }    # Official website
      OPTIONAL { ?item wdt:P571 ?inception . }  # Founding date

      # Get labels (English, Georgian, Russian)
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ka,ru" .
      }
    }
    LIMIT 500
    """

    sparql.setQuery(query)

    try:
        raw_response = sparql.query().convert()
        if isinstance(raw_response, dict):
            bindings = raw_response.get("results", {}).get("bindings", [])
        else:
            bindings = []

        institutions: Dict[str, Dict[str, Any]] = {}
        for row in bindings:
            item_uri = row.get("item", {}).get("value", "")
            qid = item_uri.rsplit("/", 1)[-1] if item_uri else None

            # Skip malformed rows that lack a proper Q-number.
            if not qid or not qid.startswith("Q"):
                continue

            record: Dict[str, Any] = {
                "qid": qid,
                "name": row.get("itemLabel", {}).get("value", ""),
                "description": row.get("itemDescription", {}).get("value", ""),
                "type": row.get("typeLabel", {}).get("value", ""),
                "identifiers": {}
            }

            # External identifiers are optional bindings.
            for field, scheme in (("isil", "ISIL"), ("viaf", "VIAF"), ("website", "Website")):
                if field in row:
                    record["identifiers"][scheme] = row[field]["value"]

            # Inception arrives as an ISO timestamp; keep only the date part.
            if "inception" in row:
                record["founding_date"] = row["inception"]["value"].split("T")[0]

            # Coordinates arrive as a WKT literal: "Point(lon lat)".
            if "coords" in row:
                wkt = row["coords"]["value"]
                if wkt.startswith("Point("):
                    lon, lat = wkt[6:-1].split()
                    record["latitude"] = float(lat)
                    record["longitude"] = float(lon)

            institutions[qid] = record

        return institutions

    except Exception as e:
        print(f"\n❌ Error querying Wikidata: {e}")
        return {}
|
|
|
|
|
|
def geocode_institution(name: str, country: str = "Georgia") -> Optional[Dict[str, Any]]:
    """
    Geocode an institution via the Nominatim search API.

    Args:
        name: Institution name to look up.
        country: Country appended to the search query (default "Georgia").

    Returns:
        Dict with ``latitude``, ``longitude`` and ``display_name`` on
        success, None when nothing was found or any error occurred.

    FIX: the original slept only on the no-result path because the
    ``time.sleep(1.1)`` sat after an early ``return`` — so successful
    lookups skipped Nominatim's mandatory 1 req/sec rate limit. The pause
    now runs in a ``finally`` so it applies on every request. A request
    timeout is also added so a stalled request cannot hang the run.
    """
    try:
        import requests  # local import: keeps requests an optional dependency

        # Try with institution name + country
        search_query = f"{name}, {country}"

        try:
            response = requests.get(
                "https://nominatim.openstreetmap.org/search",
                params={
                    "q": search_query,
                    "format": "json",
                    "limit": 1,
                    "countrycodes": "ge"  # Georgia ISO code
                },
                headers={"User-Agent": "GLAM-Dataset-Enrichment/1.0"},
                timeout=15,  # avoid hanging the batch on a stalled request
            )

            if response.status_code == 200:
                results = response.json()
                if results:
                    location = results[0]
                    return {
                        "latitude": float(location["lat"]),
                        "longitude": float(location["lon"]),
                        "display_name": location.get("display_name", "")
                    }
        finally:
            # Rate limit: 1 request per second (Nominatim usage policy) —
            # must run even when a result is returned above.
            time.sleep(1.1)

    except Exception as e:
        print(f" ⚠️ Geocoding error: {e}")

    return None
|
|
|
|
|
|
def load_georgia_institutions(yaml_path: Path) -> List[Dict[str, Any]]:
    """
    Load Georgian institutions (country code 'GE') from the unified dataset.

    Args:
        yaml_path: Path to the unified YAML dataset — a list of institution
            records, each with an optional ``locations`` list of dicts.

    Returns:
        Records whose first location has country 'GE'.

    FIX: the original ``inst.get('locations', [{}])[0]`` raised IndexError
    when ``locations`` existed but was an empty list (the default only
    applies when the key is absent), and crashed with TypeError on an empty
    YAML file (``safe_load`` returns None). Both cases are now handled.
    """
    with open(yaml_path, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f) or []

    georgian = []
    for inst in data:
        # Treat a missing OR empty locations list the same way.
        locations = inst.get('locations') or [{}]
        if locations[0].get('country') == 'GE':
            georgian.append(inst)
    return georgian
|
|
|
|
|
|
def enrich_institution(
    inst: Dict[str, Any],
    wikidata_results: Dict[str, Dict[str, Any]],
    fuzzy_threshold: float = 0.85
) -> Optional[Dict[str, Any]]:
    """
    Attempt to match one institution record against the Wikidata results.

    An exact match on an already-recorded Wikidata Q-number wins outright;
    otherwise the best type-compatible fuzzy name match at or above
    ``fuzzy_threshold`` is returned with a ``match_score`` key added.

    Returns None when no acceptable match exists.
    """
    name = inst.get('name', '')
    itype = inst.get('institution_type', '')

    # 1) Exact match via an existing Wikidata identifier.
    for ident in inst.get('identifiers', []):
        if ident.get('identifier_scheme') != 'Wikidata':
            continue
        known_qid = ident.get('identifier_value', '')
        if known_qid in wikidata_results:
            return wikidata_results[known_qid]

    # 2) Fuzzy name matching over type-compatible candidates.
    top_candidate = None
    top_score = 0.0

    for candidate in wikidata_results.values():
        cand_name = candidate.get('name', '')
        cand_desc = candidate.get('description', '')

        # Reject cross-category pairs (e.g. museum vs. archive) up front.
        if not institution_type_compatible(itype, name, cand_name, cand_desc):
            continue

        score = similarity_score(name, cand_name)
        if score > top_score:
            top_score, top_candidate = score, candidate

    if top_candidate and top_score >= fuzzy_threshold:
        matched = dict(top_candidate)
        matched["match_score"] = top_score
        return matched

    return None
|
|
|
|
|
|
def main():
    """Orchestrate the batch-1 Georgian enrichment run.

    Pipeline: load GE records → query Wikidata → fuzzy-match & enrich →
    geocode leftovers via Nominatim → save enriched YAML → report coverage.

    FIX: the coverage report divided by ``len(institutions)`` without a
    guard, so a run that loaded zero Georgian records crashed with
    ZeroDivisionError at the very end (after all network work was done).
    """
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print()
    print("Target: 14 institutions with 0% Wikidata coverage")
    print("Goal: Achieve 50%+ coverage (7+ institutions)")
    print()

    # Paths (relative to the repository root, one level above this script)
    data_dir = Path(__file__).parent.parent / "data" / "instances"
    input_file = data_dir / "all" / "globalglam-20251111.yaml"
    output_file = data_dir / "georgia" / "georgian_institutions_enriched_batch1.yaml"

    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Step 1: Load Georgian institutions
    print("📂 Loading Georgian institutions...")
    institutions = load_georgia_institutions(input_file)
    print(f" ✅ Loaded {len(institutions)} Georgian institutions")
    print()

    # Step 2: Query Wikidata
    print("🌐 Querying Wikidata for Georgian GLAM institutions...")
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    wikidata_results = query_georgian_institutions(sparql)
    print(f" ✅ Found {len(wikidata_results)} institutions in Wikidata")
    print()

    # Step 3: Fuzzy matching and enrichment
    print("🔍 Matching institutions with Wikidata (threshold: 0.85)...")
    print()

    enriched_count = 0
    geocoded_count = 0

    for i, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', 'Unknown')
        inst_type = inst.get('institution_type', 'MIXED')

        print(f"{i:2d}. {inst_name} ({inst_type})")

        # Try Wikidata enrichment first; geocoding is only the fallback.
        enrichment = enrich_institution(inst, wikidata_results)

        if enrichment:
            match_score = enrichment.get('match_score', 0.0)
            qid = enrichment.get('qid', '')
            print(f" ✅ Matched: {enrichment.get('name')} ({qid}) - Score: {match_score:.2f}")

            # Add Wikidata identifier
            if 'identifiers' not in inst:
                inst['identifiers'] = []

            inst['identifiers'].append({
                'identifier_scheme': 'Wikidata',
                'identifier_value': qid,
                'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
            })

            # Add other identifiers found on Wikidata (ISIL, VIAF, Website);
            # only websites carry a usable identifier_url.
            for scheme, value in enrichment.get('identifiers', {}).items():
                if scheme == 'Website':
                    inst['identifiers'].append({
                        'identifier_scheme': 'Website',
                        'identifier_value': value,
                        'identifier_url': value
                    })
                else:
                    inst['identifiers'].append({
                        'identifier_scheme': scheme,
                        'identifier_value': value
                    })

            # Add coordinates if available
            if 'latitude' in enrichment and 'longitude' in enrichment:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]

                inst['locations'][0]['latitude'] = enrichment['latitude']
                inst['locations'][0]['longitude'] = enrichment['longitude']
                print(f" 📍 Coordinates: {enrichment['latitude']:.4f}, {enrichment['longitude']:.4f}")

            # Add founding date if available
            if 'founding_date' in enrichment:
                inst['founding_date'] = enrichment['founding_date']
                print(f" 📅 Founded: {enrichment['founding_date']}")

            # Add description from Wikidata — never overwrite an existing one
            if enrichment.get('description'):
                if not inst.get('description'):
                    inst['description'] = enrichment['description']
                    print(f" 📝 Description: {enrichment['description'][:60]}...")

            # Record provenance; matches stay unverified until human review
            if 'provenance' not in inst:
                inst['provenance'] = {}

            inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
            inst['provenance']['enrichment_history'].append({
                'enrichment_date': datetime.now(timezone.utc).isoformat(),
                'enrichment_method': 'Wikidata SPARQL query + fuzzy name matching',
                'match_score': match_score,
                'verified': False
            })

            enriched_count += 1

        else:
            print(f" ⚠️ No Wikidata match found")

            # Try geocoding as fallback (coordinates only, no identifiers)
            geocode_result = geocode_institution(inst_name)

            if geocode_result:
                if 'locations' not in inst or not inst['locations']:
                    inst['locations'] = [{'country': 'GE'}]

                inst['locations'][0]['latitude'] = geocode_result['latitude']
                inst['locations'][0]['longitude'] = geocode_result['longitude']
                print(f" 📍 Geocoded: {geocode_result['latitude']:.4f}, {geocode_result['longitude']:.4f}")
                geocoded_count += 1

        print()

    # Step 4: Save enriched data
    print("💾 Saving enriched dataset...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print(f" ✅ Saved to: {output_file}")
    print()

    # Step 5: Report results
    print("=" * 80)
    print("📊 ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    # Guard against ZeroDivisionError when no Georgian records were loaded.
    coverage_pct = (enriched_count / len(institutions) * 100) if institutions else 0.0
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata matches: {enriched_count} ({coverage_pct:.1f}%)")
    print(f"Geocoded (fallback): {geocoded_count}")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()

    if enriched_count >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print(f"⚠️ Below target: {7 - enriched_count} more matches needed for 50% coverage")

    print()
    print("Next steps:")
    print("1. Review matches manually (verify institution identities)")
    print("2. Update unified dataset with enriched Georgian records")
    print("3. Proceed with other critical countries (GB, BE, US, LU)")
    print()


if __name__ == "__main__":
    main()
|