glam/scripts/fix_duplicate_temporal_coverage.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

112 lines
4.2 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Fix duplicate 'temporal_coverage:' keys caused by batch fix script.
This script removes the duplicate prefix pattern:
temporal_coverage: temporal_coverage: 1500-01-01/1899-12-31
→ temporal_coverage: 1500-01-01/1899-12-31
"""
import re
from pathlib import Path
# Files with duplicate temporal_coverage keys (from rg search)
# Paths are relative to the repository root; main() resolves them
# against Path(__file__).parent.parent before reading/writing.
AFFECTED_FILES: list[str] = [
    "data/instances/algeria/algerian_institutions.yaml",
    "data/instances/libya/libyan_institutions.yaml",
    "data/instances/libya/libyan_institutions_backup_20251111.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch3_final.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch1.yaml",
    "data/instances/georgia/georgian_institutions_enriched_batch2.yaml",
    "data/instances/georgia_glam_institutions.yaml",
    "data/instances/all/globalglam-20251111-batch16-pre-fix-20251111-230522.yaml",
    "data/instances/all/globalglam-20251111-batch16-fixed.yaml",
    "data/instances/brazil/brazilian_institutions_final.yaml",
    "data/instances/all/globalglam-20251111-batch16.yaml",
    "data/instances/brazil/brazilian_institutions_batch7_enriched.yaml",
    "data/instances/brazil/brazilian_institutions_batch6_enriched.yaml",
    "data/instances/all/globalglam-20251111.yaml",
    "data/instances/all/globalglam-20251111-pre-batch16-20251111-230249.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144746.yaml",
    "data/instances/norway/museums_southern_norway.yaml",
    "data/instances/norway/museums_trondelag.yaml",
    "data/instances/norway/museums_eastern_norway.yaml",
    "data/instances/brazil/brazilian_institutions_batch8_enriched.yaml",
    "data/instances/norway/museums_northern_norway.yaml",
    "data/instances/norway/museums_oslo.yaml",
    "data/instances/norway/national_aggregators.yaml",
    "data/instances/norway/museums_western_norway.yaml",
    "data/instances/tunisia/tunisian_institutions.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.yaml",
    "data/instances/tunisia/tunisian_institutions_enhanced.backup.yaml",
    "data/instances/georgia_glam_institutions_enriched.yaml",
    "data/instances/georgia_glam_institutions_enriched.pre_enrichment_backfill_20251111_100230.yaml",
    "data/instances/all/globalglam-20251111_backup_20251111_144624.yaml",
]
def fix_duplicate_temporal_coverage(content: str) -> tuple[str, int]:
    """
    Collapse a duplicated 'temporal_coverage:' key prefix on each line.

    A previous batch-fix script produced lines of the form
        temporal_coverage: temporal_coverage: 1500-01-01/1899-12-31
    which this rewrites to
        temporal_coverage: 1500-01-01/1899-12-31
    preserving the line's original indentation.

    Args:
        content: Full text of a YAML file.

    Returns:
        (fixed_content, num_fixes): the rewritten text and the number of
        lines that were changed.
    """
    # \s* (not \s+) for the indent so duplicated keys at top level
    # (zero indentation) are fixed as well.  Compiled once, reused per line.
    pattern = re.compile(r'^(\s*)temporal_coverage:\s+temporal_coverage:\s+(.+)$')
    fixed_lines = []
    fix_count = 0
    # Process line by line (rather than one multi-line re.sub) so that
    # whitespace can never match across a newline boundary.
    for line in content.split('\n'):
        match = pattern.match(line)
        if match:
            indent, value = match.groups()
            fixed_lines.append(f"{indent}temporal_coverage: {value}")
            fix_count += 1
        else:
            fixed_lines.append(line)
    return '\n'.join(fixed_lines), fix_count
def main():
    """Fix duplicate temporal_coverage keys in all affected files."""
    # The script lives in glam/scripts/, so the repo root is two levels up.
    repo_root = Path(__file__).parent.parent
    total_fixed = 0
    fixed_files = []
    for rel_path in AFFECTED_FILES:
        target = repo_root / rel_path
        if not target.exists():
            print(f"⚠️ File not found: {rel_path}")
            continue
        # Rewrite the file in place only when something actually changed.
        updated, count = fix_duplicate_temporal_coverage(target.read_text(encoding='utf-8'))
        if count > 0:
            target.write_text(updated, encoding='utf-8')
            print(f"✅ Fixed {count} duplicate(s) in {rel_path}")
            total_fixed += count
            fixed_files.append(rel_path)
        else:
            print(f" No duplicates found in {rel_path}")
    # Summary banner.
    print(f"\n{'='*70}")
    print(f"✅ Total fixes: {total_fixed} duplicates across {len(fixed_files)} files")
    print(f"{'='*70}")


if __name__ == "__main__":
    main()