- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
165 lines
5.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Fix Sistema Brasileiro de Museus duplicate in globalglam-20251111-batch16.yaml.
|
|
|
|
Two records with same ID but slightly different names:
|
|
1. "Sistema Brasileiro de Museus (SBM)" - original
|
|
2. "Sistema Brasileiro de Museus" - batch16 (enriched)
|
|
|
|
Strategy: Merge into single record, keep enriched metadata.
|
|
"""
|
|
|
|
import yaml
|
|
import shutil
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Paths
|
|
BASE_DIR = Path(__file__).parent.parent
|
|
INPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16.yaml"
|
|
OUTPUT_FILE = BASE_DIR / "data" / "instances" / "all" / "globalglam-20251111-batch16-fixed.yaml"
|
|
BACKUP_FILE = BASE_DIR / "data" / "instances" / "all" / f"globalglam-20251111-batch16-pre-fix-{datetime.now().strftime('%Y%m%d-%H%M%S')}.yaml"
|
|
|
|
|
|
def load_yaml(filepath):
|
|
"""Load YAML file."""
|
|
with open(filepath, 'r', encoding='utf-8') as f:
|
|
return yaml.safe_load(f)
|
|
|
|
|
|
def save_yaml(data, filepath):
|
|
"""Save YAML file."""
|
|
with open(filepath, 'w', encoding='utf-8') as f:
|
|
yaml.dump(data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
|
|
def fix_sbm_duplicate(institutions):
|
|
"""
|
|
Fix Sistema Brasileiro de Museus duplicate by merging records.
|
|
|
|
Args:
|
|
institutions: List of institution dictionaries
|
|
|
|
Returns:
|
|
Updated list with duplicate merged
|
|
"""
|
|
# Find both SBM records
|
|
target_id = "https://w3id.org/heritage/custodian/br/sistema-brasileiro-de-museus-sbm"
|
|
sbm_records = []
|
|
sbm_indices = []
|
|
|
|
for idx, inst in enumerate(institutions):
|
|
if inst.get('id') == target_id:
|
|
sbm_records.append(inst)
|
|
sbm_indices.append(idx)
|
|
|
|
print(f"Found {len(sbm_records)} Sistema Brasileiro de Museus records")
|
|
|
|
if len(sbm_records) != 2:
|
|
print(f"ERROR: Expected 2 SBM records, found {len(sbm_records)}")
|
|
return institutions
|
|
|
|
# Display both records
|
|
for i, sbm in enumerate(sbm_records, 1):
|
|
wikidata_ids = [
|
|
ident.get('identifier_value') for ident in sbm.get('identifiers', [])
|
|
if ident.get('identifier_scheme') == 'Wikidata'
|
|
]
|
|
print(f"\nRecord {i}:")
|
|
print(f" Name: {sbm.get('name')}")
|
|
print(f" Wikidata: {wikidata_ids}")
|
|
print(f" Description length: {len(sbm.get('description', ''))}")
|
|
print(f" Identifiers: {len(sbm.get('identifiers', []))}")
|
|
|
|
# Determine which is enriched (has more identifiers)
|
|
original = sbm_records[0] if len(sbm_records[0].get('identifiers', [])) < len(sbm_records[1].get('identifiers', [])) else sbm_records[1]
|
|
enriched = sbm_records[1] if original == sbm_records[0] else sbm_records[0]
|
|
|
|
print(f"\nOriginal record: '{original.get('name')}' ({len(original.get('identifiers', []))} identifiers)")
|
|
print(f"Enriched record: '{enriched.get('name')}' ({len(enriched.get('identifiers', []))} identifiers)")
|
|
|
|
# Merge: Keep enriched record, use name with abbreviation
|
|
merged = enriched.copy()
|
|
merged['name'] = "Sistema Brasileiro de Museus (SBM)" # Keep abbreviation for clarity
|
|
|
|
# Update provenance notes
|
|
if 'provenance' not in merged:
|
|
merged['provenance'] = {}
|
|
|
|
if merged['provenance'].get('notes'):
|
|
merged['provenance']['notes'] += "\n\nDuplicate fixed 2025-11-11: Merged with original record, keeping enriched metadata."
|
|
else:
|
|
merged['provenance']['notes'] = "Duplicate fixed 2025-11-11: Merged with original record, keeping enriched metadata."
|
|
|
|
print(f"\nMerged record: '{merged.get('name')}'")
|
|
desc = merged.get('description', '')
|
|
print(f" Description: {desc[:100] if desc else 'None'}...")
|
|
print(f" Identifiers: {len(merged.get('identifiers', []))}")
|
|
|
|
# Remove both originals
|
|
for idx in sorted(sbm_indices, reverse=True):
|
|
institutions.pop(idx)
|
|
|
|
# Add merged record
|
|
institutions.append(merged)
|
|
|
|
print(f"\nDeduplication complete:")
|
|
print(f" Original count: {len(sbm_records) + len(institutions) - 1}")
|
|
print(f" New count: {len(institutions)}")
|
|
print(f" Reduction: -1 institution")
|
|
|
|
return institutions
|
|
|
|
|
|
def main():
|
|
print("=" * 80)
|
|
print("FIX SISTEMA BRASILEIRO DE MUSEUS DUPLICATE")
|
|
print("=" * 80)
|
|
|
|
# Create backup
|
|
print(f"\n1. Creating backup: {BACKUP_FILE.name}")
|
|
shutil.copy(INPUT_FILE, BACKUP_FILE)
|
|
print(" ✓ Backup created")
|
|
|
|
# Load data
|
|
print(f"\n2. Loading institutions from {INPUT_FILE.name}...")
|
|
institutions = load_yaml(INPUT_FILE)
|
|
print(f" Loaded {len(institutions)} institutions")
|
|
|
|
# Fix duplicate
|
|
print(f"\n3. Fixing Sistema Brasileiro de Museus duplicate...")
|
|
institutions = fix_sbm_duplicate(institutions)
|
|
|
|
# Save deduplicated dataset
|
|
print(f"\n4. Saving deduplicated dataset to {OUTPUT_FILE.name}...")
|
|
save_yaml(institutions, OUTPUT_FILE)
|
|
print(" ✓ Saved successfully")
|
|
|
|
# Statistics
|
|
brazil_institutions = [
|
|
inst for inst in institutions
|
|
if inst.get('locations') and inst['locations'][0].get('country') == 'BR'
|
|
]
|
|
brazil_with_wikidata = [
|
|
inst for inst in brazil_institutions
|
|
if inst.get('identifiers') and any(
|
|
ident.get('identifier_scheme') == 'Wikidata'
|
|
for ident in inst['identifiers']
|
|
)
|
|
]
|
|
|
|
print(f"\n{'='*80}")
|
|
print("FINAL STATISTICS")
|
|
print(f"{'='*80}")
|
|
print(f"Total institutions: {len(institutions)}")
|
|
print(f"Brazilian institutions: {len(brazil_institutions)}")
|
|
print(f"Brazilian with Wikidata: {len(brazil_with_wikidata)}")
|
|
print(f"Coverage: {len(brazil_with_wikidata)/len(brazil_institutions)*100:.1f}%")
|
|
print(f"\n✓ Duplicate fixed successfully!")
|
|
print(f" Input: {INPUT_FILE}")
|
|
print(f" Output: {OUTPUT_FILE}")
|
|
print(f" Backup: {BACKUP_FILE}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|