glam/scripts/fix_ghriba_geocoding.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

85 lines
3.3 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix El Ghriba Synagogue geocoding in Tunisia dataset.
The village "Hara Seghira" (now Erriadh) on Djerba island failed initial
geocoding. This script adds the correct coordinates.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
# Values applied by the fix. Kept in one place so the console summary cannot
# drift from what is actually written to the dataset.
_OLD_CITY = 'Hara Seghira'
_NEW_CITY = 'Erriadh'
# Coordinates from Nominatim (El Ghriba Synagogue direct query).
_LATITUDE = 33.8139230
_LONGITUDE = 10.8593929
_LEGACY_NAME = 'El Ghriba Synagogue, Hara Seghira'


def _apply_geocoding_fix(institutions, timestamp):
    """Patch the first 'Ghriba' record that still has a 'Hara Seghira' location.

    The matching institution is mutated in place: the location receives the
    new coordinates and city name, the legacy name is recorded under
    ``alternative_names``, and a note is appended to ``provenance['notes']``
    when the record has a provenance mapping.

    Args:
        institutions: Iterable of institution mappings loaded from the YAML.
        timestamp: ISO-8601 string embedded in the provenance note.

    Returns:
        True if a record was patched, False if no record qualified.
    """
    for i, inst in enumerate(institutions):
        # `or ''` tolerates an explicit null name, not just a missing key.
        if 'Ghriba' not in (inst.get('name') or ''):
            continue
        print(f"\nFound at index {i}: {inst['name']}")

        for loc in inst.get('locations') or []:
            if loc.get('city') != _OLD_CITY:
                continue

            print(f" Current city: {loc['city']}")
            print(f" Current coords: {loc.get('latitude')}, {loc.get('longitude')}")

            loc['latitude'] = _LATITUDE
            loc['longitude'] = _LONGITUDE
            # Switch to the modern name; the old one is kept as an alternative.
            loc['city'] = _NEW_CITY

            alt_names = inst.get('alternative_names')
            if alt_names is None:
                # Covers both a missing key and an explicit null value.
                alt_names = inst['alternative_names'] = []
            if _LEGACY_NAME not in alt_names:
                alt_names.append(_LEGACY_NAME)

            print(f" Updated city: {loc['city']}")
            print(f" Updated coords: {loc['latitude']}, {loc['longitude']}")

            provenance = inst.get('provenance')
            if isinstance(provenance, dict):
                notes = provenance.get('notes') or ''
                fix_note = f" Geocoding fixed on {timestamp} ({_OLD_CITY} → {_NEW_CITY}, Djerba)."
                provenance['notes'] = notes + fix_note

            return True

        # Name matched but nothing to fix here (e.g. the script already ran);
        # keep scanning the remaining records.
        print(f" No location with city '{_OLD_CITY}' on this record; skipping")

    return False


def main():
    """Load the Tunisia dataset, fix the El Ghriba location, and save it back.

    Reads the YAML file at a path relative to the current working directory,
    patches the first qualifying record, refreshes ``_metadata.generated``
    when that mapping exists, and overwrites the same file. Prints progress to
    stdout and returns None; when no record qualifies the file is left
    untouched.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Fixing El Ghriba Synagogue Geocoding")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # One timestamp per run so the provenance note and metadata agree.
    timestamp = datetime.now(timezone.utc).isoformat()

    # An empty or non-mapping document is treated as "nothing to fix".
    institutions = data.get('institutions') if isinstance(data, dict) else None
    if not _apply_geocoding_fix(institutions or [], timestamp):
        print("\n❌ El Ghriba Synagogue not found!")
        return

    # Update metadata only when the block exists; its absence must not
    # prevent the fix from being saved.
    metadata = data.get('_metadata')
    if isinstance(metadata, dict):
        metadata['generated'] = timestamp

    # Serialize before opening the file for writing: if dumping fails, the
    # existing dataset is not truncated.
    serialized = yaml.dump(data, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print(f"\nWriting updated data: {input_file}")
    with open(input_file, 'w', encoding='utf-8') as f:
        f.write(serialized)

    print("\n✅ Geocoding fixed for El Ghriba Synagogue")
    print(f" Location: {_NEW_CITY} (formerly {_OLD_CITY}), Djerba, Tunisia")
    print(f" Coordinates: {_LATITUDE:.7f}, {_LONGITUDE:.7f}")
# Run the fix only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()