- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
124 lines
4.4 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Deduplicate Tunisia heritage dataset by merging duplicate Chemtou records.

GLAM Data Extraction Project
Schema: LinkML v0.2.1
"""

import yaml
from datetime import datetime, timezone
from pathlib import Path

|
def merge_chemtou_records(record1, record2):
    """Merge two Chemtou Archaeological Museum records into one.

    record2 is used as the base (it has the better description and digital
    platform data); collections and change_history entries from record1
    that are not already present in record2 are appended after record2's.

    Args:
        record1: Institution dict whose unique collections/history entries
            are folded into the result.
        record2: Institution dict used as the base of the merge.

    Returns:
        A new merged dict. Neither input record is mutated (the nested
        provenance dict is copied before the merge note is appended).
    """
    merged = record2.copy()

    # Union the collections, deduplicating by 'collection_name'.
    # record2's entries come first; record1 contributes only new names.
    base_collections = record2.get('collections', [])
    known_names = {c.get('collection_name') for c in base_collections}
    merged['collections'] = base_collections + [
        c for c in record1.get('collections', [])
        if c.get('collection_name') not in known_names
    ]

    # Union the change history, deduplicating by 'event_date'.
    base_history = record2.get('change_history', [])
    known_dates = {h.get('event_date') for h in base_history}
    merged['change_history'] = base_history + [
        h for h in record1.get('change_history', [])
        if h.get('event_date') not in known_dates
    ]

    # Append a merge note to the provenance. Copy the nested dict first so
    # record2 is not mutated through the shallow copy taken above.
    if 'provenance' in merged:
        merged['provenance'] = dict(merged['provenance'])
        notes = merged['provenance'].get('notes', '')
        merge_note = f" Merged duplicate record from index 11 on {datetime.now(timezone.utc).isoformat()}."
        merged['provenance']['notes'] = notes + merge_note

    return merged
|
|
|
|
def main():
    """Deduplicate the Tunisia dataset by merging its two Chemtou records.

    Reads the enhanced institutions YAML, merges the two records whose
    name contains 'Chemtou', writes a backup of the ORIGINAL file content,
    and overwrites the input file with the deduplicated data.
    """
    input_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    output_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.yaml')
    backup_file = Path('data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml')

    print("Tunisia Heritage Dataset Deduplication")
    print("=" * 60)

    # Load data. Keep the raw text so the backup preserves the original
    # file exactly — the previous version wrote the backup *after*
    # modifying `data`, so the "backup" was a copy of the new data.
    print(f"\nReading: {input_file}")
    raw_text = input_file.read_text(encoding='utf-8')
    data = yaml.safe_load(raw_text)

    institutions = data['institutions']
    original_count = len(institutions)
    print(f"Original count: {original_count} institutions")

    # Find Chemtou duplicates by name substring.
    chemtou_indices = []
    for i, inst in enumerate(institutions):
        if 'Chemtou' in inst.get('name', ''):
            chemtou_indices.append(i)
            print(f" Found at index {i}: {inst['name']} (GHCID: {inst.get('ghcid')})")

    if len(chemtou_indices) != 2:
        print(f"\nError: Expected 2 Chemtou records, found {len(chemtou_indices)}")
        return

    # Merge records
    print(f"\nMerging records at indices {chemtou_indices[0]} and {chemtou_indices[1]}...")
    record1 = institutions[chemtou_indices[0]]
    record2 = institutions[chemtou_indices[1]]

    merged_record = merge_chemtou_records(record1, record2)

    # Store the merged record in place of the second occurrence, then drop
    # the first. (The previous version discarded merged_record entirely,
    # leaving the unmerged record2 in the list.) Assign BEFORE popping:
    # popping the earlier index shifts the later one down by one.
    institutions[chemtou_indices[1]] = merged_record
    institutions.pop(chemtou_indices[0])

    # Update metadata
    data['_metadata']['count'] = len(institutions)
    data['_metadata']['generated'] = datetime.now(timezone.utc).isoformat()
    data['_metadata'].setdefault('enhancements', []).append('Deduplication (Chemtou)')

    # Create backup from the untouched original text before overwriting.
    print(f"\nCreating backup: {backup_file}")
    backup_file.write_text(raw_text, encoding='utf-8')

    # Save deduplicated data
    print(f"Writing deduplicated data: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

    final_count = len(institutions)
    print(f"\nDeduplication complete:")
    print(f" Before: {original_count} institutions")
    print(f" After: {final_count} institutions")
    print(f" Removed: {original_count - final_count} duplicate(s)")
    print(f"\n✅ Dataset saved to: {output_file}")
|
# Run only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()
|