- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
247 lines · 9 KiB · Python
#!/usr/bin/env python3
"""
Georgia Enrichment Batch 3 - Manual corrections and targeted searches

Manual corrections:
1. Remove incorrect match: Tbilisi Main Library → Tbilisi Wine Museum (Q121759846)
2. Add targeted manual Wikidata searches for specific institutions

Targeted searches:
- National Parliamentary Library (LEPL Ilia Chavchavadze National Library)
- Stalin Museum (Joseph Stalin Museum, Gori)
- Georgian National Museum (network)
- Open Air Museum of Ethnography
"""
|
|
|
|
import sys
from pathlib import Path
from typing import Any, Dict, List

import yaml

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
|
|
# Curated Wikidata matches, verified by hand via web search.
# Keyed by the institution name as it appears in the Batch 2 dataset.
# Each entry carries the Wikidata QID plus any extra facts (coordinates,
# founding date, additional identifiers) worth merging into the record.
MANUAL_MATCHES = {
    "National Parliamentary Library of Georgia": {
        "qid": "Q1967614",
        "name": "National Parliamentary Library of Georgia",
        "description": "National library of Georgia in Tbilisi",
        "latitude": 41.7215,
        "longitude": 44.7628,
        "identifiers": {
            "ISIL": "GE-1001",
            "VIAF": "140817700",
        },
    },
    "Stalin Museum Archive": {
        "qid": "Q835621",
        "name": "Joseph Stalin Museum",
        "description": "Museum in Gori, Georgia dedicated to Joseph Stalin",
        "latitude": 41.9844,
        "longitude": 44.1088,
        "founding_date": "1937-01-01",
    },
    "Georgian National Museum": {
        "qid": "Q1508648",
        "name": "Georgian National Museum",
        "description": "Network of museums in Georgia",
        "latitude": 41.6938,
        "longitude": 44.8007,
        "founding_date": "2004-12-30",
        "identifiers": {
            "Website": "https://museum.ge",
        },
    },
    "Open Air Museum of Ethnography": {
        "qid": "Q1283537",
        "name": "Open Air Museum of Ethnography",
        "description": "Ethnographic museum in Tbilisi, Georgia",
        "latitude": 41.7097,
        "longitude": 44.7525,
        "founding_date": "1966-04-27",
    },
}
|
def remove_incorrect_matches(institutions: List[Dict[str, Any]]) -> int:
    """Strip known-incorrect Wikidata matches from *institutions* in place.

    Currently handles one verified false positive: "Tbilisi Main Library"
    was wrongly matched to the Tbilisi Wine Museum (Q121759846).

    Args:
        institutions: Institution records; each may carry an 'identifiers'
            list of dicts with 'identifier_scheme' / 'identifier_value' keys.

    Returns:
        The number of incorrect matches actually removed. (Previously this
        counted and reported a correction even when the institution had no
        identifiers or the bad QID was absent; now a correction is counted
        only when an identifier entry is really dropped.)
    """
    corrections = 0

    for inst in institutions:
        inst_name = inst.get('name', '')
        if inst_name != "Tbilisi Main Library":
            continue

        identifiers = inst.get('identifiers')
        if not identifiers:
            continue  # nothing to correct

        kept = [
            i for i in identifiers
            if not (i.get('identifier_scheme') == 'Wikidata'
                    and i.get('identifier_value') == 'Q121759846')
        ]
        # Count (and report) only when the bad match was actually present.
        if len(kept) < len(identifiers):
            inst['identifiers'] = kept
            corrections += 1
            print(f" 🔧 Removed incorrect match: {inst_name} → Tbilisi Wine Museum")

    return corrections
|
def _has_wikidata_identifier(inst: Dict[str, Any]) -> bool:
    """Return True if *inst* already carries a Wikidata identifier."""
    return any(
        identifier.get('identifier_scheme') == 'Wikidata'
        for identifier in inst.get('identifiers', [])
    )


def _add_identifiers(inst: Dict[str, Any], manual_data: Dict[str, Any], qid: str) -> None:
    """Append the Wikidata identifier plus any extra identifiers from *manual_data*."""
    if 'identifiers' not in inst:
        inst['identifiers'] = []

    inst['identifiers'].append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
    })

    for scheme, value in manual_data.get('identifiers', {}).items():
        entry = {
            'identifier_scheme': scheme,
            'identifier_value': value,
        }
        # A website's value doubles as its URL; other schemes (ISIL, VIAF)
        # carry no resolvable URL here.
        if scheme == 'Website':
            entry['identifier_url'] = value
        inst['identifiers'].append(entry)


def _add_coordinates(inst: Dict[str, Any], manual_data: Dict[str, Any]) -> None:
    """Write manual latitude/longitude into the first location record, if present."""
    if 'latitude' in manual_data and 'longitude' in manual_data:
        if not inst.get('locations'):
            inst['locations'] = [{'country': 'GE'}]
        inst['locations'][0]['latitude'] = manual_data['latitude']
        inst['locations'][0]['longitude'] = manual_data['longitude']
        print(f" 📍 Coordinates: {manual_data['latitude']:.4f}, {manual_data['longitude']:.4f}")


def _record_provenance(inst: Dict[str, Any]) -> None:
    """Append a manual-verification entry to the institution's enrichment history."""
    if 'provenance' not in inst:
        inst['provenance'] = {}
    inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
    inst['provenance']['enrichment_history'].append({
        'enrichment_date': '2025-11-09T00:00:00Z',
        'enrichment_method': 'Manual Wikidata verification and matching',
        'match_score': 1.0,
        'verified': True
    })


def apply_manual_matches(institutions: List[Dict[str, Any]]) -> int:
    """Apply curated MANUAL_MATCHES data to *institutions* in place.

    For each institution whose name appears in MANUAL_MATCHES and that does
    not already carry a Wikidata identifier, this merges in the Wikidata QID,
    any extra identifiers, coordinates, founding date, a description (only if
    missing), and a provenance record of the manual verification.

    Args:
        institutions: Institution records to enrich.

    Returns:
        The number of manual matches applied.
    """
    matches_applied = 0

    for inst in institutions:
        inst_name = inst.get('name', '')
        manual_data = MANUAL_MATCHES.get(inst_name)
        if manual_data is None or _has_wikidata_identifier(inst):
            continue

        qid = manual_data['qid']
        print(f"\n ✅ Applying manual match: {inst_name}")
        print(f" → {manual_data['name']} ({qid})")

        _add_identifiers(inst, manual_data, qid)
        _add_coordinates(inst, manual_data)

        if 'founding_date' in manual_data:
            inst['founding_date'] = manual_data['founding_date']
            print(f" 📅 Founded: {manual_data['founding_date']}")

        # Only fill in a description when the record lacks one.
        if not inst.get('description') and manual_data.get('description'):
            inst['description'] = manual_data['description']
            print(f" 📝 Description: {manual_data['description'][:60]}...")

        _record_provenance(inst)
        matches_applied += 1

    return matches_applied
|
def main():
    """Run the Batch 3 Georgia enrichment pipeline.

    Loads the Batch 2 YAML, removes known-incorrect Wikidata matches,
    applies curated manual matches, saves the final Batch 3 YAML, and
    prints a coverage report against the 50% enrichment goal.
    """
    # Minimum count of Wikidata-enriched institutions for the 50%+ goal.
    target_matches = 7

    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 3")
    print("=" * 80)
    print()
    print("Strategy: Manual corrections + targeted Wikidata searches")
    print()

    # Paths (data dir lives one level above this script's directory)
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch2.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch3_final.yaml"

    # Load Batch 2 results
    print("📂 Loading Batch 2 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # Guard: an empty/None dataset would make the percentage report divide
    # by zero further down.
    if not institutions:
        print(" ⚠️ No institutions loaded - nothing to enrich")
        return

    print(f" ✅ Loaded {len(institutions)} institutions")
    print()

    # Step 1: Remove incorrect matches
    print("🔧 Removing incorrect matches...")
    corrections = remove_incorrect_matches(institutions)
    print(f" ✅ Removed {corrections} incorrect matches")
    print()

    # Step 2: Apply manual matches
    print("✍️ Applying manual Wikidata matches...")
    new_matches = apply_manual_matches(institutions)
    print()
    print(f" ✅ Applied {new_matches} manual matches")
    print()

    # Save results
    print("💾 Saving Batch 3 (final) results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)

    print(f" ✅ Saved to: {output_file}")
    print()

    # Count institutions that now carry at least one Wikidata identifier.
    enriched_count = sum(
        1 for inst in institutions
        if any(identifier.get('identifier_scheme') == 'Wikidata'
               for identifier in inst.get('identifiers', []))
    )

    # Report
    print("=" * 80)
    print("📊 FINAL GEORGIA ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()

    if enriched_count >= target_matches:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        print()
        print("Phase 1 Georgia proof-of-concept: COMPLETE ✅")
    else:
        print(f"⚠️ Below target: {target_matches - enriched_count} more matches needed")

    print()
    print("Next steps:")
    print("1. Update unified global dataset with enriched Georgian records")
    print("2. Apply same methodology to other critical countries (GB, BE, US, LU)")
    print("3. Proceed to Phase 2: North Africa enrichment")
    print()
# Script entry point: run the Batch 3 enrichment pipeline when executed
# directly (not when imported).
if __name__ == "__main__":
    main()