- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
85 lines
3.3 KiB
Python
Executable file
85 lines
3.3 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
Fix El Ghriba Synagogue geocoding in the Tunisia dataset.

The village "Hara Seghira" (now Erriadh) on Djerba island failed initial
geocoding. This script adds the correct coordinates, renames the city to
its modern name, keeps the historical name as an alternative name, and
records the fix in the institution's provenance notes.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
def _apply_ghriba_fix(inst, loc):
    """Rewrite one stale location of *inst* in place and log the change.

    Args:
        inst: Institution mapping that owns *loc*; its ``alternative_names``
            and ``provenance`` entries may be updated.
        loc: Location mapping whose ``city`` is ``'Hara Seghira'``; receives
            the corrected coordinates and the modern city name.
    """
    print(f" Current city: {loc['city']}")
    print(f" Current coords: {loc.get('latitude')}, {loc.get('longitude')}")

    # Coordinates from Nominatim (El Ghriba Synagogue direct query)
    loc['latitude'] = 33.8139230
    loc['longitude'] = 10.8593929

    # Update city name to its modern name
    loc['city'] = 'Erriadh'

    # Keep the historical place name discoverable. `is None` covers both a
    # missing key and an explicit YAML null, which would otherwise make the
    # membership test below raise TypeError.
    if inst.get('alternative_names') is None:
        inst['alternative_names'] = []
    historical_name = 'El Ghriba Synagogue, Hara Seghira'
    if historical_name not in inst['alternative_names']:
        inst['alternative_names'].append(historical_name)

    print(f" Updated city: {loc['city']}")
    print(f" Updated coords: {loc['latitude']}, {loc['longitude']}")

    # Append to the provenance notes only when a provenance mapping exists;
    # `or ''` tolerates a null `notes` value instead of failing on None + str.
    provenance = inst.get('provenance')
    if isinstance(provenance, dict):
        notes = provenance.get('notes') or ''
        fix_note = f" Geocoding fixed on {datetime.now(timezone.utc).isoformat()} (Hara Seghira → Erriadh, Djerba)."
        provenance['notes'] = notes + fix_note


def main():
    """Patch the El Ghriba Synagogue location in the Tunisia YAML dataset.

    Loads the dataset, looks for the first institution whose name contains
    'Ghriba' and that has a location with city 'Hara Seghira', applies the
    coordinate/city fix to that location, refreshes the dataset's
    ``_metadata.generated`` timestamp when that mapping exists, and writes
    the result back to the same file. Nothing is written when no matching
    location is found.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')

    print("Fixing El Ghriba Synagogue Geocoding")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # An empty YAML document loads as None; treat it (and a missing
    # 'institutions' key) as "nothing to search" rather than crashing.
    institutions = data.get('institutions') if isinstance(data, dict) else None

    # Find El Ghriba Synagogue
    found = False
    name_matched = False
    for i, inst in enumerate(institutions or []):
        # `or ''` guards against an explicit null name in the YAML.
        if 'Ghriba' not in (inst.get('name') or ''):
            continue
        name_matched = True
        print(f"\nFound at index {i}: {inst['name']}")

        # Only the first location still carrying the old city name is fixed.
        for loc in inst.get('locations') or []:
            if loc.get('city') == 'Hara Seghira':
                _apply_ghriba_fix(inst, loc)
                found = True
                break

        if found:
            break

    if not found:
        if name_matched:
            # The record exists but has nothing left to fix (e.g. the script
            # already ran); say so instead of claiming it is missing.
            print("\n❌ El Ghriba Synagogue found, but no 'Hara Seghira' location to fix!")
        else:
            print("\n❌ El Ghriba Synagogue not found!")
        return

    # Update metadata; a dataset without a `_metadata` mapping must not
    # prevent the geocoding fix from being saved.
    metadata = data.get('_metadata')
    if isinstance(metadata, dict):
        metadata['generated'] = datetime.now(timezone.utc).isoformat()

    # Save updated data
    print(f"\nWriting updated data: {input_file}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    print("\n✅ Geocoding fixed for El Ghriba Synagogue")
    print(" Location: Erriadh (formerly Hara Seghira), Djerba, Tunisia")
    print(" Coordinates: 33.8139230, 10.8593929")
|
|
|
|
# Script entry point: apply the fix only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|