glam/scripts/enrich_georgia_batch3_manual.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

247 lines
9 KiB
Python

#!/usr/bin/env python3
"""
Georgia Enrichment Batch 3 - Manual corrections and targeted searches
Manual corrections:
1. Remove incorrect match: Tbilisi Main Library → Tbilisi Wine Museum (Q121759846)
2. Add targeted manual Wikidata searches for specific institutions
Targeted searches:
- National Parliamentary Library (LEPL Ilia Chavchavadze National Library)
- Stalin Museum (Joseph Stalin Museum, Gori)
- Georgian National Museum (network)
- Open Air Museum of Ethnography
"""
import sys
from pathlib import Path
from typing import Any, Dict, List
import yaml
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
# Manual Wikidata matches found through web search
# Keyed by the institution's exact `name` field as it appears in the Batch 2
# YAML — apply_manual_matches() looks records up by that key, so the keys must
# match the dataset spelling (which may differ from the Wikidata label, e.g.
# "Stalin Museum Archive" vs. "Joseph Stalin Museum").
# Optional per-entry fields: latitude/longitude (WGS84 decimal degrees),
# founding_date (ISO 8601), identifiers (extra scheme → value pairs).
MANUAL_MATCHES = {
    "National Parliamentary Library of Georgia": {
        "qid": "Q1967614",
        "name": "National Parliamentary Library of Georgia",
        "description": "National library of Georgia in Tbilisi",
        "latitude": 41.7215,
        "longitude": 44.7628,
        "identifiers": {
            "ISIL": "GE-1001",
            "VIAF": "140817700"
        }
    },
    # Dataset name differs from the Wikidata label for this record.
    "Stalin Museum Archive": {
        "qid": "Q835621",
        "name": "Joseph Stalin Museum",
        "description": "Museum in Gori, Georgia dedicated to Joseph Stalin",
        "latitude": 41.9844,
        "longitude": 44.1088,
        "founding_date": "1937-01-01"
    },
    "Georgian National Museum": {
        "qid": "Q1508648",
        "name": "Georgian National Museum",
        "description": "Network of museums in Georgia",
        "latitude": 41.6938,
        "longitude": 44.8007,
        "founding_date": "2004-12-30",
        "identifiers": {
            "Website": "https://museum.ge"
        }
    },
    "Open Air Museum of Ethnography": {
        "qid": "Q1283537",
        "name": "Open Air Museum of Ethnography",
        "description": "Ethnographic museum in Tbilisi, Georgia",
        "latitude": 41.7097,
        "longitude": 44.7525,
        "founding_date": "1966-04-27"
    }
}
def remove_incorrect_matches(institutions: List[Dict[str, Any]]) -> int:
    """Strip known-incorrect Wikidata matches from institution records.

    Currently removes the single known false positive: "Tbilisi Main Library"
    wrongly matched to the Tbilisi Wine Museum (Q121759846).

    Args:
        institutions: Institution dicts, mutated in place.

    Returns:
        Number of incorrect identifiers actually removed.
    """
    corrections = 0
    for inst in institutions:
        inst_name = inst.get('name', '')
        # Remove Tbilisi Main Library → Tbilisi Wine Museum match
        if inst_name != "Tbilisi Main Library":
            continue
        identifiers = inst.get('identifiers')
        if not identifiers:
            # Bug fix: previously a "correction" was counted/printed even when
            # the record had no identifiers list and nothing was removed.
            continue
        kept = [
            i for i in identifiers
            if not (i.get('identifier_scheme') == 'Wikidata'
                    and i.get('identifier_value') == 'Q121759846')
        ]
        # Count a correction only if the bad identifier was actually present.
        if len(kept) != len(identifiers):
            inst['identifiers'] = kept
            corrections += 1
            print(f" 🔧 Removed incorrect match: {inst_name} → Tbilisi Wine Museum")
    return corrections
def _has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if the institution already carries a Wikidata identifier."""
    return any(
        identifier.get('identifier_scheme') == 'Wikidata'
        for identifier in inst.get('identifiers', [])
    )


def _add_identifiers(inst: Dict[str, Any], manual_data: Dict[str, Any], qid: str) -> None:
    """Append the Wikidata identifier plus any extra manual identifiers."""
    identifiers = inst.setdefault('identifiers', [])
    identifiers.append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': qid,
        'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
    })
    for scheme, value in manual_data.get('identifiers', {}).items():
        entry = {'identifier_scheme': scheme, 'identifier_value': value}
        if scheme == 'Website':
            # Website values are themselves URLs.
            entry['identifier_url'] = value
        identifiers.append(entry)


def _apply_coordinates(inst: Dict[str, Any], manual_data: Dict[str, Any]) -> None:
    """Set latitude/longitude on the first location, creating one if needed."""
    if 'latitude' not in manual_data or 'longitude' not in manual_data:
        return
    if 'locations' not in inst or not inst['locations']:
        inst['locations'] = [{'country': 'GE'}]
    inst['locations'][0]['latitude'] = manual_data['latitude']
    inst['locations'][0]['longitude'] = manual_data['longitude']
    print(f" 📍 Coordinates: {manual_data['latitude']:.4f}, {manual_data['longitude']:.4f}")


def _apply_details(inst: Dict[str, Any], manual_data: Dict[str, Any]) -> None:
    """Copy founding date and (if missing) description onto the record."""
    if 'founding_date' in manual_data:
        inst['founding_date'] = manual_data['founding_date']
        print(f" 📅 Founded: {manual_data['founding_date']}")
    # Only fill in a description when the record has none.
    if not inst.get('description') and manual_data.get('description'):
        inst['description'] = manual_data['description']
        print(f" 📝 Description: {manual_data['description'][:60]}...")


def _record_provenance(inst: Dict[str, Any]) -> None:
    """Append a manual-verification entry to the enrichment history."""
    if 'provenance' not in inst:
        inst['provenance'] = {}
    inst['provenance']['enrichment_history'] = inst['provenance'].get('enrichment_history', [])
    inst['provenance']['enrichment_history'].append({
        'enrichment_date': '2025-11-09T00:00:00Z',
        'enrichment_method': 'Manual Wikidata verification and matching',
        'match_score': 1.0,
        'verified': True
    })


def apply_manual_matches(institutions: List[Dict[str, Any]]) -> int:
    """Apply curated Wikidata matches from MANUAL_MATCHES to institutions.

    Records that already have a Wikidata identifier are skipped so a manual
    match never overwrites an existing one. Matched records gain identifiers,
    coordinates, founding date, description (if absent), and a provenance
    entry.

    Args:
        institutions: Institution dicts, mutated in place.

    Returns:
        Number of institutions that received a new manual match.
    """
    matches_applied = 0
    for inst in institutions:
        inst_name = inst.get('name', '')
        if inst_name not in MANUAL_MATCHES:
            continue
        # Check if already has Wikidata
        if _has_wikidata(inst):
            continue
        manual_data = MANUAL_MATCHES[inst_name]
        qid = manual_data['qid']
        print(f"\n ✅ Applying manual match: {inst_name}")
        print(f"{manual_data['name']} ({qid})")
        _add_identifiers(inst, manual_data, qid)
        _apply_coordinates(inst, manual_data)
        _apply_details(inst, manual_data)
        _record_provenance(inst)
        matches_applied += 1
    return matches_applied
def main():
    """Run Batch 3: corrections, manual matches, save, and final report."""
    print("=" * 80)
    print("🇬🇪 Georgia Heritage Institutions Enrichment - Batch 3")
    print("=" * 80)
    print()
    print("Strategy: Manual corrections + targeted Wikidata searches")
    print()
    # Paths — input is the Batch 2 output; this script writes a new file and
    # never modifies its input.
    data_dir = Path(__file__).parent.parent / "data" / "instances" / "georgia"
    input_file = data_dir / "georgian_institutions_enriched_batch2.yaml"
    output_file = data_dir / "georgian_institutions_enriched_batch3_final.yaml"
    # Load Batch 2 results (expected: top-level YAML list of institution dicts)
    print("📂 Loading Batch 2 results...")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f" ✅ Loaded {len(institutions)} institutions")
    print()
    # Step 1: Remove incorrect matches (mutates `institutions` in place)
    print("🔧 Removing incorrect matches...")
    corrections = remove_incorrect_matches(institutions)
    print(f" ✅ Removed {corrections} incorrect matches")
    print()
    # Step 2: Apply manual matches (also mutates in place)
    print("✍️ Applying manual Wikidata matches...")
    new_matches = apply_manual_matches(institutions)
    print()
    print(f" ✅ Applied {new_matches} manual matches")
    print()
    # Save results; sort_keys=False preserves the field order of the records
    print("💾 Saving Batch 3 (final) results...")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved to: {output_file}")
    print()
    # Count final enrichment: institutions with at least one Wikidata identifier
    enriched_count = 0
    for inst in institutions:
        if 'identifiers' in inst:
            for identifier in inst['identifiers']:
                if identifier.get('identifier_scheme') == 'Wikidata':
                    enriched_count += 1
                    break  # count each institution at most once
    # Report
    print("=" * 80)
    print("📊 FINAL GEORGIA ENRICHMENT RESULTS")
    print("=" * 80)
    print()
    print(f"Total institutions: {len(institutions)}")
    print(f"Wikidata enriched: {enriched_count} ({enriched_count/len(institutions)*100:.1f}%)")
    print(f"Still need enrichment: {len(institutions) - enriched_count}")
    print()
    # NOTE(review): 7 is the hard-coded match count for the "50%+ coverage"
    # goal — presumably ~13-14 institutions in the dataset; confirm against
    # the Batch 2 file before reusing this threshold elsewhere.
    if enriched_count >= 7:
        print("✅ SUCCESS: Achieved 50%+ Wikidata coverage goal!")
        print()
        print("Phase 1 Georgia proof-of-concept: COMPLETE ✅")
    else:
        print(f"⚠️ Below target: {7 - enriched_count} more matches needed")
    print()
    print("Next steps:")
    print("1. Update unified global dataset with enriched Georgian records")
    print("2. Apply same methodology to other critical countries (GB, BE, US, LU)")
    print("3. Proceed to Phase 2: North Africa enrichment")
    print()
# Script entry point — only runs when executed directly, not on import.
if __name__ == "__main__":
    main()