# glam/scripts/merge_sparql_and_isil_enrichments.py
# Last commit: kempersc e5a532a8bc "Add comprehensive tests for NLP institution extraction and RDF partnership integration"
# - Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
# - Added tests for extracted entities and result handling to validate the extraction process.
# - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
# - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
# - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
# 2025-11-19 23:20:47 +01:00
# (226 lines, 8.1 KiB, Python)
#!/usr/bin/env python3
"""
Merge SPARQL fuzzy matches with ISIL-based enrichments.
Strategy:
1. Start with SPARQL enriched file (has 180 NL fuzzy matches)
2. Add ISIL-based Wikidata IDs from current file where missing
3. Preserve ALL real Q-numbers, prioritize SPARQL fuzzy matches
4. Remove synthetic Q-numbers (>Q100000000)
"""
from pathlib import Path
from datetime import datetime, timezone
import yaml
import sys
def is_real_qnumber(q_value: str) -> bool:
    """Return True when *q_value* is a genuine Wikidata Q-number.

    Synthetic placeholder IDs generated by earlier pipeline steps use
    numbers at or above 100 million, so anything below that threshold
    (and shaped like 'Q<digits>') counts as real.
    """
    if not q_value or not q_value.startswith('Q'):
        return False
    digits = q_value[1:]
    try:
        numeric = int(digits)
    except ValueError:
        # Not 'Q' followed by an integer (e.g. 'Qabc' or bare 'Q').
        return False
    return numeric < 100000000  # real Q-numbers are below 100M
def get_wikidata_id(institution: dict) -> str | None:
    """Return the institution's real Wikidata Q-number, or None.

    Scans the 'identifiers' list for entries whose scheme is 'Wikidata'
    and returns the first value that passes is_real_qnumber(); synthetic
    placeholders are ignored.
    """
    wikidata_values = (
        entry.get('identifier_value', '')
        for entry in institution.get('identifiers', [])
        if entry.get('identifier_scheme') == 'Wikidata'
    )
    return next((value for value in wikidata_values if is_real_qnumber(value)), None)
def merge_identifiers(sparql_inst: dict, isil_inst: dict) -> tuple[list, int]:
    """
    Merge identifiers from both sources, prioritizing SPARQL.

    Mutates (and returns) *sparql_inst*'s identifier list in place: an ISIL
    identifier is appended only when its scheme is not already present, so
    SPARQL-sourced values always win.  Synthetic Wikidata Q-numbers
    (>= Q100000000) are never imported.

    Returns: (updated identifiers list, count of identifiers added)
    """
    sparql_ids = sparql_inst.get('identifiers', [])
    # Schemes already covered by the SPARQL side; these take priority.
    seen_schemes = {
        id_obj.get('identifier_scheme', '')
        for id_obj in sparql_ids
        if id_obj.get('identifier_scheme', '')
    }
    added_count = 0
    for id_obj in isil_inst.get('identifiers', []):
        scheme = id_obj.get('identifier_scheme', '')
        value = id_obj.get('identifier_value', '')
        # Skip synthetic placeholder Q-numbers.
        if scheme == 'Wikidata' and not is_real_qnumber(value):
            continue
        if scheme not in seen_schemes:
            sparql_ids.append(id_obj)
            # Bug fix: record the scheme immediately so a second ISIL entry
            # with the same scheme cannot introduce a conflicting duplicate
            # (the original never updated its "seen" map while appending).
            # This also removes the original's dead condition
            # `existing_schemes[scheme] != value`, which could never lead to
            # an append because the inner check repeated the membership test.
            seen_schemes.add(scheme)
            added_count += 1
    return sparql_ids, added_count
def merge_datasets(sparql_data: list, isil_data: list) -> tuple[list, dict]:
    """
    Merge SPARQL and ISIL datasets.

    For each SPARQL institution that lacks a real Wikidata ID but whose
    ISIL-enriched counterpart (matched by 'id') has one, the ISIL
    identifiers are merged in and provenance is annotated.  Synthetic
    Wikidata Q-numbers are stripped from every institution.  Institution
    dicts are mutated in place.

    Returns: (merged_data, stats)
    """
    # Index ISIL data by ID for O(1) lookup per institution.
    isil_by_id = {inst['id']: inst for inst in isil_data}
    stats = {
        'total_institutions': len(sparql_data),
        'isil_additions': 0,
        'identifiers_added': 0,
        'synthetic_removed': 0,
        'countries_enriched': set(),
    }
    merged = []
    for sparql_inst in sparql_data:
        isil_inst = isil_by_id.get(sparql_inst['id'])
        if isil_inst:
            # Only pull ISIL identifiers when SPARQL lacks a real Wikidata ID
            # but the ISIL record provides one.
            sparql_wd = get_wikidata_id(sparql_inst)
            isil_wd = get_wikidata_id(isil_inst)
            if not sparql_wd and isil_wd:
                updated_ids, added = merge_identifiers(sparql_inst, isil_inst)
                sparql_inst['identifiers'] = updated_ids
                if added > 0:
                    stats['isil_additions'] += 1
                    stats['identifiers_added'] += added
                    # Track country of the first location.  Bug fix: guard
                    # against an empty 'locations' list, which previously
                    # raised IndexError via get('locations', [{}])[0].
                    locations = sparql_inst.get('locations') or [{}]
                    country = locations[0].get('country', '')
                    if country:
                        stats['countries_enriched'].add(country)
                    # Record the extra enrichment step in provenance (once).
                    prov = sparql_inst.get('provenance', {})
                    if isinstance(prov, dict):
                        method = prov.get('extraction_method', '')
                        if 'ISIL batch enrichment' not in method:
                            prov['extraction_method'] = f"{method} + ISIL batch enrichment"
        # Drop synthetic Wikidata Q-numbers.  Rebuild the list instead of
        # calling list.remove() in a loop (O(n^2) and equality-based, so it
        # could remove the wrong entry when duplicates exist).
        identifiers = sparql_inst.get('identifiers', [])
        kept = [
            id_obj for id_obj in identifiers
            if id_obj.get('identifier_scheme') != 'Wikidata'
            or is_real_qnumber(id_obj.get('identifier_value', ''))
        ]
        if len(kept) != len(identifiers):
            stats['synthetic_removed'] += len(identifiers) - len(kept)
            sparql_inst['identifiers'] = kept
        merged.append(sparql_inst)
    return merged, stats
def main():
    """Merge the SPARQL-enriched and ISIL-enriched institution files.

    Loads both YAML datasets, merges them via merge_datasets(), writes the
    combined YAML (with a generated provenance header) and prints a
    Netherlands Wikidata-coverage report before and after the merge.
    """
    base_dir = Path(__file__).parent.parent
    sparql_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"
    isil_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_merged.yaml"

    def _pct(part: int, whole: int) -> float:
        # Bug fix: avoid ZeroDivisionError when the dataset has no NL rows.
        return part / whole * 100 if whole else 0.0

    def _is_nl(inst: dict) -> bool:
        # An institution counts as Dutch if any location has country 'NL'.
        return any(loc.get('country') == 'NL' for loc in inst.get('locations', []))

    print("="*80)
    print("🔀 MERGE SPARQL FUZZY MATCHES + ISIL ENRICHMENTS")
    print("="*80)
    print("\n📖 Loading datasets...\n")
    # Load SPARQL enriched (has fuzzy matches)
    print(" Loading SPARQL enriched file...")
    with open(sparql_file, 'r', encoding='utf-8') as f:
        sparql_data = yaml.safe_load(f)
    print(f"{len(sparql_data):,} institutions")
    # Load ISIL enriched (has ISIL matches)
    print(" Loading ISIL enriched file...")
    with open(isil_file, 'r', encoding='utf-8') as f:
        isil_data = yaml.safe_load(f)
    print(f"{len(isil_data):,} institutions\n")
    # Count current NL coverage (institutions with a real Wikidata ID).
    nl_sparql = sum(1 for inst in sparql_data if _is_nl(inst) and get_wikidata_id(inst))
    nl_total = sum(1 for inst in sparql_data if _is_nl(inst))
    print(f"📊 Netherlands coverage before merge: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)\n")
    # Merge
    print("🔀 Merging datasets...\n")
    merged_data, stats = merge_datasets(sparql_data, isil_data)
    # Count NL coverage after merge
    nl_merged = sum(1 for inst in merged_data if _is_nl(inst) and get_wikidata_id(inst))
    # Write output: YAML document with a commented provenance header.
    print("💾 Writing merged dataset...\n")
    header = f"""---
# Global Heritage Institutions - Merged SPARQL + ISIL Enrichment
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(merged_data):,}
# ISIL additions: {stats['isil_additions']:,}
# Identifiers added: {stats['identifiers_added']:,}
# Synthetic Q-numbers removed: {stats['synthetic_removed']:,}
# Countries with additional enrichment: {', '.join(sorted(stats['countries_enriched']))}
"""
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(merged_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)
    print(f"✅ Complete! Output: {output_file}\n")
    # Final report
    print("="*80)
    print("📊 MERGE REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions: {stats['total_institutions']:,}")
    print(f" Institutions with ISIL additions: {stats['isil_additions']:,}")
    print(f" Total identifiers added: {stats['identifiers_added']:,}")
    print(f" Synthetic Q-numbers removed: {stats['synthetic_removed']:,}")
    print(f" Countries enriched: {', '.join(sorted(stats['countries_enriched'])) or 'None'}")
    print(f"\n🇳🇱 Netherlands Wikidata coverage:")
    print(f" Before: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)")
    print(f" After: {nl_merged}/{nl_total} ({_pct(nl_merged, nl_total):.1f}%)")
    print(f" Change: {nl_merged - nl_sparql:+,} matches")
    print("="*80 + "\n")


if __name__ == "__main__":
    main()