- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
244 lines
8.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Great Britain Heritage Institutions Enrichment - Batch 1
|
|
==========================================================
|
|
|
|
Strategy: Fuzzy name matching with Wikidata SPARQL queries
|
|
Threshold: 0.85 (same as Georgia Batch 1)
|
|
|
|
Target: 4 GB institutions (0% current coverage)
|
|
Goal: Achieve 50%+ Wikidata coverage
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
from rapidfuzz import fuzz
|
|
import time
|
|
|
|
# Wikidata SPARQL endpoint
|
|
WIKIDATA_ENDPOINT = "https://query.wikidata.org/sparql"
|
|
|
|
def query_wikidata_gb_institutions():
    """Query the Wikidata SPARQL endpoint for British heritage institutions.

    Fetches archives, research databases/institutes/centers, and universities
    (via P31/P279* subclass closure) located in the United Kingdom (P17=Q145),
    together with optional enrichment fields: coordinates (P625), website
    (P856), VIAF (P214), ISIL (P791), and inception date (P571).

    Returns:
        list: SPARQL JSON result bindings (one dict per row), or an empty
        list if the query fails for any reason.
    """
    sparql = SPARQLWrapper(WIKIDATA_ENDPOINT)

    # NOTE(review): label languages "en,ar" look copied from another batch
    # script — Arabic is an odd fallback for GB institutions; "en" (perhaps
    # plus "cy,gd" for Welsh/Gaelic) would fit better. TODO confirm.
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?coord ?website ?viaf ?isil ?inception WHERE {
      # Archives, research centers, databases in Great Britain
      VALUES ?type {
        wd:Q166118    # archives
        wd:Q21045422  # research database
        wd:Q31855     # research institute
        wd:Q7315155   # research center
        wd:Q3918      # university (for university-based archives/research centers)
      }

      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q145 .  # Country: United Kingdom

      OPTIONAL { ?item wdt:P625 ?coord }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P571 ?inception }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en,ar" .
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    LIMIT 1000
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for British heritage institutions...")
    try:
        results = sparql.query().convert()
        institutions = results['results']['bindings']
        print(f" ✅ Found {len(institutions)} British institutions in Wikidata\n")
        return institutions
    except Exception as e:
        # Broad catch is deliberate: any endpoint/network failure degrades to
        # an empty result so main() can still run and report zero matches.
        print(f" ❌ Query failed: {e}")
        return []
|
|
|
|
def fuzzy_match_institutions(our_institutions, wikidata_institutions, threshold=0.85):
    """Match our institutions to Wikidata entries via fuzzy name matching.

    Each of our institution names is compared against every Wikidata row's
    main label AND each alternative label. The Wikidata label service joins
    all alt labels into one comma-separated string; scoring the joined blob
    (as a naive implementation would) depresses similarity scores, so the
    string is split back into individual labels before scoring.

    Args:
        our_institutions: list of institution dicts, each expected to carry
            a 'name' key (missing names are treated as empty strings).
        wikidata_institutions: SPARQL JSON bindings as returned by
            query_wikidata_gb_institutions().
        threshold: minimum similarity (0-1 scale) to accept a match.

    Returns:
        list of dicts with keys 'institution' (our dict), 'wikidata'
        (the matched binding), and 'score' (0-1).
    """
    matches = []

    for our_inst in our_institutions:
        display_name = our_inst.get('name', '')
        our_name = display_name.lower()
        best_match = None
        best_score = 0

        for wd_inst in wikidata_institutions:
            # Candidate names: main label plus every individual alt label.
            candidates = [wd_inst.get('itemLabel', {}).get('value', '').lower()]
            if 'itemAltLabel' in wd_inst:
                # Label service concatenates alt labels with ", " — split so
                # each alternative is scored on its own.
                candidates.extend(
                    alt.strip().lower()
                    for alt in wd_inst['itemAltLabel']['value'].split(',')
                )

            for candidate in candidates:
                score = fuzz.ratio(our_name, candidate)
                if score > best_score:
                    best_score = score
                    best_match = wd_inst

        if best_score >= threshold * 100:  # rapidfuzz returns 0-100
            matches.append({
                'institution': our_inst,
                'wikidata': best_match,
                'score': best_score / 100
            })
            print(f" ✅ Match (score={best_score/100:.2f}): {display_name}")
            print(f" → {best_match['itemLabel']['value']} ({best_match['item']['value'].split('/')[-1]})")
        else:
            print(f" ❌ No match: {display_name} (best score: {best_score/100:.2f})")

    return matches
|
|
|
|
def _add_identifier(institution, scheme, value, url=None):
    """Append an identifier entry to institution unless the scheme exists.

    Creates the 'identifiers' list on first use; skips silently when an
    entry with the same identifier_scheme is already present.
    """
    identifiers = institution.setdefault('identifiers', [])
    if any(i.get('identifier_scheme') == scheme for i in identifiers):
        return
    entry = {'identifier_scheme': scheme, 'identifier_value': value}
    if url is not None:
        entry['identifier_url'] = url
    identifiers.append(entry)


def enrich_with_wikidata(institution, wikidata_data, match_score):
    """Add Wikidata identifiers and metadata to an institution (in place).

    Args:
        institution: institution dict from our dataset; mutated in place.
        wikidata_data: one SPARQL JSON binding for the matched Wikidata item
            (must contain 'item'; 'viaf'/'isil'/'coord'/'inception' optional).
        match_score: fuzzy-match score (0-1), recorded in provenance notes.

    Returns:
        dict: the same (mutated) institution, for chaining convenience.
    """
    q_id = wikidata_data['item']['value'].split('/')[-1]

    # Identifiers: Wikidata Q-id, then VIAF / ISIL when the query had them.
    # Duplicate-scheme checks live in _add_identifier.
    _add_identifier(institution, 'Wikidata', q_id,
                    f"https://www.wikidata.org/wiki/{q_id}")
    if 'viaf' in wikidata_data:
        viaf = wikidata_data['viaf']['value']
        _add_identifier(institution, 'VIAF', viaf,
                        f"https://viaf.org/viaf/{viaf}")
    if 'isil' in wikidata_data:
        _add_identifier(institution, 'ISIL', wikidata_data['isil']['value'])

    # Coordinates: Wikidata returns WKT "Point(lon lat)". Fill any GB
    # location that does not already carry a latitude (never overwrite).
    if 'coord' in wikidata_data:
        coord_str = wikidata_data['coord']['value']
        coord_str = coord_str.replace('Point(', '').replace(')', '')
        lon, lat = map(float, coord_str.split())
        for location in institution.get('locations', []):
            if location.get('country') == 'GB' and 'latitude' not in location:
                location['latitude'] = lat
                location['longitude'] = lon

    # Founding date: keep only the date part of the xsd:dateTime value.
    # Unlike the original, do not clobber a founding_date our dataset
    # already has (consistent with the coordinate handling above).
    if 'inception' in wikidata_data and not institution.get('founding_date'):
        institution['founding_date'] = wikidata_data['inception']['value'].split('T')[0]

    # Provenance trail for this enrichment batch.
    provenance = institution.setdefault('provenance', {})
    provenance.setdefault('notes', []).append(
        f"Batch 1: Fuzzy name match (score={match_score:.2f}) - Wikidata {q_id}"
    )
    provenance['last_updated'] = datetime.now(timezone.utc).isoformat()
    provenance['wikidata_verified'] = True

    return institution
|
|
|
|
def main():
    """Run the Batch 1 GB enrichment pipeline end to end.

    Steps: load the unified dataset, filter GB institutions, query Wikidata,
    fuzzy-match, enrich the matches in place, save the GB subset to YAML,
    and print a coverage summary against the 50% goal.
    """
    import os

    print("=" * 80)
    print("🇬🇧 Great Britain Heritage Institutions Enrichment - Batch 1")
    print("=" * 80)
    print("\nStrategy: Fuzzy name matching (threshold 0.85)\n")

    # Load our dataset
    print("📂 Loading unified global dataset...")
    with open('data/instances/all/globalglam-20251111.yaml', 'r', encoding='utf-8') as f:
        all_institutions = yaml.safe_load(f)

    # Filter GB institutions
    gb_institutions = [
        inst for inst in all_institutions
        if any(loc.get('country') == 'GB' for loc in inst.get('locations', []))
    ]
    print(f" ✅ Found {len(gb_institutions)} GB institutions\n")

    # Guard: with no GB institutions there is nothing to match, and the
    # summary below would divide by zero.
    if not gb_institutions:
        print("⚠️ No GB institutions found in the dataset; nothing to do.")
        return

    # Query Wikidata
    wikidata_institutions = query_wikidata_gb_institutions()
    time.sleep(1)  # Be nice to Wikidata

    # Fuzzy matching
    print("🔗 Matching institutions (threshold=0.85)...\n")
    matches = fuzzy_match_institutions(gb_institutions, wikidata_institutions, threshold=0.85)

    print(f"\n📊 Found {len(matches)} matches\n")

    # Enrich institutions
    if matches:
        print("✨ Enriching institutions with Wikidata metadata...\n")
        for match in matches:
            enrich_with_wikidata(
                match['institution'],
                match['wikidata'],
                match['score']
            )
            print(f" ✅ Enriched: {match['institution']['name']}")

    # Save results
    output_path = 'data/instances/great_britain/gb_institutions_enriched_batch1.yaml'
    print(f"\n💾 Saving Batch 1 results to {output_path}...")

    os.makedirs('data/instances/great_britain', exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(gb_institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print(" ✅ Saved\n")

    # Summary
    enriched_count = sum(
        1 for inst in gb_institutions
        if any(i.get('identifier_scheme') == 'Wikidata' for i in inst.get('identifiers', []))
    )

    print("=" * 80)
    print("📊 BATCH 1 RESULTS")
    print("=" * 80)
    print(f"Total institutions: {len(gb_institutions)}")
    print(f"Wikidata enriched: {enriched_count} ({enriched_count/len(gb_institutions)*100:.1f}%)")
    print(f"Still need enrichment: {len(gb_institutions) - enriched_count}")

    if enriched_count >= len(gb_institutions) * 0.5:
        print("\n✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
    else:
        print("\n⚠️ Below 50% goal. Batch 2 (alternative names) recommended.")

    print("\n")


if __name__ == '__main__':
    main()
|