glam/scripts/deduplicate_tunisia.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

124 lines
4.4 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Deduplicate Tunisia heritage dataset by merging duplicate Chemtou records.
GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
def merge_chemtou_records(record1, record2):
    """
    Merge two duplicate Chemtou Archaeological Museum records into one.

    record2 is used as the base (it carries the better description and
    digital-platform data); 'collections' and 'change_history' entries that
    exist only in record1 are appended to it, and a provenance note records
    the merge.

    Args:
        record1: First duplicate record (dict); donor of extra entries.
        record2: Second duplicate record (dict); base of the merge.

    Returns:
        A new dict (shallow copy of record2) with merged 'collections',
        'change_history', and an updated 'provenance' notes string.
        Neither input record is mutated.
    """
    merged = record2.copy()

    # Merge collections, keyed by 'collection_name'.  Tracking seen keys in
    # a set makes each membership test O(1) and also blocks duplicates
    # *within* record1 (the original nested-loop check only compared each
    # record1 entry against record2's entries).
    all_collections = list(record2.get('collections', []))
    seen_names = {c.get('collection_name') for c in all_collections}
    for coll in record1.get('collections', []):
        name = coll.get('collection_name')
        if name not in seen_names:
            seen_names.add(name)
            all_collections.append(coll)
    merged['collections'] = all_collections

    # Merge change_history, keyed by 'event_date', same dedup strategy.
    all_history = list(record2.get('change_history', []))
    seen_dates = {h.get('event_date') for h in all_history}
    for event in record1.get('change_history', []):
        date = event.get('event_date')
        if date not in seen_dates:
            seen_dates.add(date)
            all_history.append(event)
    merged['change_history'] = all_history

    # Record the merge in provenance so downstream users can audit it.
    if 'provenance' in merged:
        # Copy before mutating: 'merged' is a shallow copy of record2, so
        # without this the note would also be written into record2's own
        # provenance dict (shared-argument mutation bug in the original).
        provenance = dict(merged['provenance'])
        notes = provenance.get('notes', '')
        merge_note = f" Merged duplicate record from index 11 on {datetime.now(timezone.utc).isoformat()}."
        provenance['notes'] = notes + merge_note
        merged['provenance'] = provenance
    return merged
def main():
    """
    Deduplicate the Tunisia heritage dataset in place.

    Loads the YAML dataset, backs up the ORIGINAL file, merges the two
    Chemtou Archaeological Museum duplicate records into one, refreshes
    the dataset metadata, and writes the result back to the same file.
    Aborts (without writing) if the number of Chemtou records is not
    exactly two.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    output_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    backup_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml')

    print("Tunisia Heritage Dataset Deduplication")
    print("=" * 60)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Back up the ORIGINAL file before any mutation.  (Previously the
    # backup was dumped AFTER deduplication, so it was identical to the
    # output file and useless for recovery.)  A raw byte copy also
    # preserves the original formatting exactly.
    print(f"\nCreating backup: {backup_file}")
    backup_file.write_bytes(input_file.read_bytes())

    institutions = data['institutions']
    original_count = len(institutions)
    print(f"Original count: {original_count} institutions")

    # Locate the duplicate records by substring match on the name.
    chemtou_indices = []
    for i, inst in enumerate(institutions):
        if 'Chemtou' in inst.get('name', ''):
            chemtou_indices.append(i)
            print(f" Found at index {i}: {inst['name']} (GHCID: {inst.get('ghcid')})")

    if len(chemtou_indices) != 2:
        print(f"\nError: Expected 2 Chemtou records, found {len(chemtou_indices)}")
        return

    # Merge records
    print(f"\nMerging records at indices {chemtou_indices[0]} and {chemtou_indices[1]}...")
    record1 = institutions[chemtou_indices[0]]
    record2 = institutions[chemtou_indices[1]]
    merged_record = merge_chemtou_records(record1, record2)

    # Replace the second occurrence with the merged record, then drop the
    # first.  (Previously merged_record was computed but never written
    # back, so the merge result was silently discarded.)  Assigning before
    # popping is safe because chemtou_indices[1] > chemtou_indices[0].
    institutions[chemtou_indices[1]] = merged_record
    institutions.pop(chemtou_indices[0])

    # Refresh dataset metadata to reflect the merge.
    data['_metadata']['count'] = len(institutions)
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    if 'enhancements' not in data['_metadata']:
        data['_metadata']['enhancements'] = []
    data['_metadata']['enhancements'].append('Deduplication (Chemtou)')

    # Save deduplicated data
    print(f"Writing deduplicated data: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    final_count = len(institutions)
    print(f"\nDeduplication complete:")
    print(f" Before: {original_count} institutions")
    print(f" After: {final_count} institutions")
    print(f" Removed: {original_count - final_count} duplicate(s)")
    print(f"\n✅ Dataset saved to: {output_file}")
# Script entry point: run the deduplication only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()