glam/scripts/fix_sbm_duplicate.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

165 lines
5.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Fix Sistema Brasileiro de Museus duplicate in globalglam-20251111-batch16.yaml.
Two records with same ID but slightly different names:
1. "Sistema Brasileiro de Museus (SBM)" - original
2. "Sistema Brasileiro de Museus" - batch16 (enriched)
Strategy: Merge into single record, keep enriched metadata.
"""
import yaml
import shutil
from datetime import datetime
from pathlib import Path
# Paths
# Repository root: this script lives in glam/scripts/, so two parents up.
BASE_DIR = Path(__file__).parent.parent
# Batch file containing the duplicated SBM records.
INPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
# Deduplicated output is written alongside the input, not over it.
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16-fixed.yaml"
# Timestamped safety copy of the input, taken before any processing.
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / f"globalglam-20251111-batch16-pre-fix-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"
def load_yaml(filepath):
    """Read *filepath* (UTF-8) and return its parsed YAML contents."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return yaml.safe_load(text)
def save_yaml(data, filepath):
    """Serialize *data* to *filepath* as human-readable, key-order-preserving YAML."""
    dump_options = dict(default_flow_style=False, allow_unicode=True, sort_keys=False)
    with open(filepath, 'w', encoding='utf-8') as handle:
        yaml.dump(data, handle, **dump_options)
def fix_sbm_duplicate(institutions):
    """
    Fix Sistema Brasileiro de Museus duplicate by merging records.

    Locates the two records sharing the SBM custodian ID, keeps the one
    with more identifiers (the enriched record), restores the abbreviated
    display name, appends a provenance note, and replaces both duplicates
    with the single merged record at the first duplicate's position so the
    overall file ordering is preserved.

    Args:
        institutions: List of institution dictionaries (mutated in place)

    Returns:
        The same list with the duplicate merged, or the list unchanged
        (with an error message printed) if exactly 2 SBM records are not found.
    """
    # Find both SBM records
    target_id = "https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm"
    sbm_records = []
    sbm_indices = []
    for idx, inst in enumerate(institutions):
        if inst.get('id') == target_id:
            sbm_records.append(inst)
            sbm_indices.append(idx)
    print(f"Found {len(sbm_records)} Sistema Brasileiro de Museus records")
    if len(sbm_records) != 2:
        print(f"ERROR: Expected 2 SBM records, found {len(sbm_records)}")
        return institutions
    # Display both records
    for i, sbm in enumerate(sbm_records, 1):
        wikidata_ids = [
            ident.get('identifier_value') for ident in sbm.get('identifiers', [])
            if ident.get('identifier_scheme') == 'Wikidata'
        ]
        print(f"\nRecord {i}:")
        print(f"  Name: {sbm.get('name')}")
        print(f"  Wikidata: {wikidata_ids}")
        # `or ''` guards against an explicit `description: null` in the YAML:
        # .get(..., '') would pass that None through and crash len().
        print(f"  Description length: {len(sbm.get('description') or '')}")
        print(f"  Identifiers: {len(sbm.get('identifiers', []))}")
    # Determine which is enriched (has more identifiers); on a tie, record 2
    # counts as "original", matching the previous selection logic.
    if len(sbm_records[0].get('identifiers', [])) < len(sbm_records[1].get('identifiers', [])):
        original, enriched = sbm_records[0], sbm_records[1]
    else:
        original, enriched = sbm_records[1], sbm_records[0]
    print(f"\nOriginal record: '{original.get('name')}' ({len(original.get('identifiers', []))} identifiers)")
    print(f"Enriched record: '{enriched.get('name')}' ({len(enriched.get('identifiers', []))} identifiers)")
    # Merge: Keep enriched record, use name with abbreviation
    merged = enriched.copy()
    merged['name'] = "Sistema Brasileiro de Museus (SBM)"  # Keep abbreviation for clarity
    # merged is only a *shallow* copy: re-create the provenance dict before
    # mutating it, otherwise the note would also be written into the shared
    # nested dict of the source record. `or {}` also covers `provenance: null`.
    merged['provenance'] = dict(merged.get('provenance') or {})
    note = "Duplicate fixed 2025-11-11: Merged with original record, keeping enriched metadata."
    if merged['provenance'].get('notes'):
        merged['provenance']['notes'] += "\n\n" + note
    else:
        merged['provenance']['notes'] = note
    print(f"\nMerged record: '{merged.get('name')}'")
    desc = merged.get('description', '')
    print(f"  Description: {desc[:100] if desc else 'None'}...")
    print(f"  Identifiers: {len(merged.get('identifiers', []))}")
    # Remove both duplicates (highest index first so earlier positions stay
    # valid), then insert the merged record where the first duplicate sat.
    first_idx = sbm_indices[0]
    for idx in sorted(sbm_indices, reverse=True):
        institutions.pop(idx)
    institutions.insert(first_idx, merged)
    print(f"\nDeduplication complete:")
    # After the merge the list is exactly one shorter than before.
    print(f"  Original count: {len(institutions) + 1}")
    print(f"  New count: {len(institutions)}")
    print(f"  Reduction: -1 institution")
    return institutions
def main():
    """Run the SBM dedup fix end to end: back up, load, fix, save, report stats."""
    print("=" * 80)
    print("FIX SISTEMA BRASILEIRO DE MUSEUS DUPLICATE")
    print("=" * 80)
    # Create backup before touching anything
    print(f"\n1. Creating backup: {BACKUP_FILE.name}")
    shutil.copy(INPUT_FILE, BACKUP_FILE)
    print("   ✓ Backup created")
    # Load data
    print(f"\n2. Loading institutions from {INPUT_FILE.name}...")
    institutions = load_yaml(INPUT_FILE)
    print(f"   Loaded {len(institutions)} institutions")
    # Fix duplicate
    print(f"\n3. Fixing Sistema Brasileiro de Museus duplicate...")
    institutions = fix_sbm_duplicate(institutions)
    # Save deduplicated dataset (input file is left untouched)
    print(f"\n4. Saving deduplicated dataset to {OUTPUT_FILE.name}...")
    save_yaml(institutions, OUTPUT_FILE)
    print("   ✓ Saved successfully")
    # Statistics: Brazilian institutions (first location has country BR),
    # and the subset of those carrying a Wikidata identifier.
    brazil_institutions = [
        inst for inst in institutions
        if inst.get('locations') and inst['locations'][0].get('country') == 'BR'
    ]
    brazil_with_wikidata = [
        inst for inst in brazil_institutions
        # `or []` tolerates both a missing key and an explicit `identifiers: null`
        if any(
            ident.get('identifier_scheme') == 'Wikidata'
            for ident in inst.get('identifiers') or []
        )
    ]
    print(f"\n{'='*80}")
    print("FINAL STATISTICS")
    print(f"{'='*80}")
    print(f"Total institutions: {len(institutions)}")
    print(f"Brazilian institutions: {len(brazil_institutions)}")
    print(f"Brazilian with Wikidata: {len(brazil_with_wikidata)}")
    # Guard against ZeroDivisionError when no Brazilian institutions remain.
    if brazil_institutions:
        coverage = len(brazil_with_wikidata) / len(brazil_institutions) * 100
        print(f"Coverage: {coverage:.1f}%")
    else:
        print("Coverage: n/a (no Brazilian institutions)")
    print(f"\n✓ Duplicate fixed successfully!")
    print(f"  Input: {INPUT_FILE}")
    print(f"  Output: {OUTPUT_FILE}")
    print(f"  Backup: {BACKUP_FILE}")


if __name__ == "__main__":
    main()