- Introduced `test_nlp_extractor.py` with unit tests for the `InstitutionExtractor`, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive); a sketch of one such test follows below.
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
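A minimal sketch of one such extractor test, assuming a hypothetical `extract()` entry point and entity fields `identifier_type` / `institution_type` (the `InstitutionExtractor` name comes from the changelog above; the module path, method, and field names are illustrative, not the project's actual API):

```python
# Hypothetical sketch: module path, method, and field names are assumptions.
from nlp_extractor import InstitutionExtractor


def test_extracts_wikidata_id_and_classifies_museum():
    extractor = InstitutionExtractor()
    text = "The Louvre (Wikidata: Q19675) is a museum in Paris."
    entities = extractor.extract(text)
    # One entity should carry the Wikidata identifier pattern...
    assert any(e.identifier_type == "wikidata" for e in entities)
    # ...and be classified as a museum.
    assert any(e.institution_type == "museum" for e in entities)
```

The RDF integration test in `test_partnership_rdf_integration.py` would follow the same shape, asserting on the exported graph (partnership triples and PROV-O metadata) rather than on extracted entities.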
112 lines
4.2 KiB
Python
#!/usr/bin/env python3
"""
Fix duplicate 'temporal_coverage:' keys caused by batch fix script.

This script removes the duplicate prefix pattern:
    temporal_coverage: temporal_coverage: 1500-01-01/1899-12-31
    → temporal_coverage: 1500-01-01/1899-12-31
"""

import re
from pathlib import Path

# Files with duplicate temporal_coverage keys (from rg search)
AFFECTED_FILES = [
    "data/instances/algeria/algerian_institutions.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/libya/libyan_institutions_backup_20251111.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch1.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch2.yaml",
    "data/instances/georgia_glam_institutions.yaml",
    "data/instances/all/globalglam-20251111-batch16-pre-fix-20251111-230522.yaml",
    "data/instances/all/globalglam-20251111-batch16-fixed.yaml",
    "data/instances/brazil/brazilian_institutions_final.yaml",
    "data/instances/all/globalglam-20251111-batch16.yaml",
    "data/instances/brazil/brazilian_institutions_batch7_enriched.yaml",
    "data/instances/brazil/brazilian_institutions_batch6_enriched.yaml",
    "data/instances/all/globalglam-20251111.yaml",
    "data/instances/all/globalglam-20251111-pre-batch16-20251111-230249.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144746.yaml",
    "data/instances/norway/museums_southern_norway.yaml",
    "data/instances/norway/museums_trondelag.yaml",
    "data/instances/norway/museums_eastern_norway.yaml",
    "data/instances/brazil/brazilian_institutions_batch8_enriched.yaml",
    "data/instances/norway/museums_northern_norway.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/norway/national_aggregators.yaml",
    "data/instances/norway/museums_western_norway.yaml",
    "data/instances/tunisia/tunisian_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/georgia_glam_institutions_enriched.pre_enrichment_backfill_20251111_100230.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144624.yaml",
]


def fix_duplicate_temporal_coverage(content: str) -> tuple[str, int]:
    """
    Remove duplicate 'temporal_coverage:' prefix.

    Returns:
        (fixed_content, num_fixes)
    """
    # Pattern: temporal_coverage: temporal_coverage: <value>
    # Replace with: temporal_coverage: <value>
    pattern = r'^(\s+)temporal_coverage:\s+temporal_coverage:\s+(.+)$'

    lines = content.split('\n')
    fixed_lines = []
    fix_count = 0

    for line in lines:
        match = re.match(pattern, line)
        if match:
            indent = match.group(1)
            value = match.group(2)
            fixed_line = f"{indent}temporal_coverage: {value}"
            fixed_lines.append(fixed_line)
            fix_count += 1
        else:
            fixed_lines.append(line)

    return '\n'.join(fixed_lines), fix_count


def main():
    """Fix duplicate temporal_coverage keys in all affected files."""
    repo_root = Path(__file__).parent.parent

    total_fixed = 0
    fixed_files = []

    for file_path_str in AFFECTED_FILES:
        file_path = repo_root / file_path_str

        if not file_path.exists():
            print(f"⚠️ File not found: {file_path_str}")
            continue

        # Read file
        content = file_path.read_text(encoding='utf-8')

        # Fix duplicates
        fixed_content, fix_count = fix_duplicate_temporal_coverage(content)

        if fix_count > 0:
            # Write back
            file_path.write_text(fixed_content, encoding='utf-8')
            print(f"✅ Fixed {fix_count} duplicate(s) in {file_path_str}")
            total_fixed += fix_count
            fixed_files.append(file_path_str)
        else:
            print(f"ℹ️ No duplicates found in {file_path_str}")

    print(f"\n{'='*70}")
    print(f"✅ Total fixes: {total_fixed} duplicates across {len(fixed_files)} files")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()