- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
226 lines
8.1 KiB
Python
226 lines
8.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Merge SPARQL fuzzy matches with ISIL-based enrichments.
|
|
|
|
Strategy:
|
|
1. Start with SPARQL enriched file (has 180 NL fuzzy matches)
|
|
2. Add ISIL-based Wikidata IDs from current file where missing
|
|
3. Preserve ALL real Q-numbers, prioritize SPARQL fuzzy matches
|
|
4. Remove synthetic Q-numbers (>Q100000000)
|
|
"""
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
import yaml
|
|
import sys
|
|
|
|
|
|
def is_real_qnumber(q_value: str) -> bool:
    """Return True when *q_value* is a genuine Wikidata Q-number.

    Synthetic placeholders in this dataset were assigned IDs of
    Q100000000 and above, so anything in that range — or anything
    that does not parse as 'Q' + integer — is treated as fake.
    """
    if not q_value or not q_value.startswith('Q'):
        return False
    try:
        numeric_part = int(q_value[1:])
    except ValueError:
        # Not 'Q' followed by an integer (e.g. 'Qabc' or bare 'Q').
        return False
    # Real Q-numbers are below 100M.
    return numeric_part < 100000000
|
|
|
|
|
|
def get_wikidata_id(institution: dict) -> str | None:
    """Return the institution's real Wikidata Q-number, or None.

    Scans the ``identifiers`` list for entries with scheme 'Wikidata'
    and returns the first value that passes ``is_real_qnumber``;
    synthetic placeholder IDs are skipped.
    """
    wikidata_values = (
        entry.get('identifier_value', '')
        for entry in institution.get('identifiers', [])
        if entry.get('identifier_scheme') == 'Wikidata'
    )
    for candidate in wikidata_values:
        if is_real_qnumber(candidate):
            return candidate
    return None
|
|
|
|
|
|
def merge_identifiers(sparql_inst: dict, isil_inst: dict) -> tuple[list, int]:
    """
    Merge identifiers from both sources, prioritizing SPARQL.

    SPARQL identifiers are kept untouched; an ISIL identifier is appended
    only when its scheme is not already present on the SPARQL side.
    Synthetic Wikidata Q-numbers (>= Q100000000) from the ISIL side are
    never imported. The SPARQL institution's identifier list is mutated
    in place and also returned.

    Returns: (updated identifiers list, count of identifiers added)
    """
    sparql_ids = sparql_inst.get('identifiers', [])

    # Schemes already covered by SPARQL take priority; only the scheme
    # matters for the "already present" test, so a set suffices.
    existing_schemes = {
        id_obj.get('identifier_scheme', '')
        for id_obj in sparql_ids
        if id_obj.get('identifier_scheme', '')
    }

    added_count = 0
    for id_obj in isil_inst.get('identifiers', []):
        scheme = id_obj.get('identifier_scheme', '')
        value = id_obj.get('identifier_value', '')

        # Skip synthetic Q-numbers — step 4 of the merge strategy.
        if scheme == 'Wikidata' and not is_real_qnumber(value):
            continue

        # Add only brand-new schemes. Record each added scheme so that a
        # scheme appearing twice in the ISIL list is not appended twice
        # (the original code never updated the scheme index, causing
        # duplicate appends); nameless ('') identifiers are never indexed
        # and therefore keep the original pass-through behavior.
        if scheme not in existing_schemes:
            sparql_ids.append(id_obj)
            if scheme:
                existing_schemes.add(scheme)
            added_count += 1

    return sparql_ids, added_count
|
|
|
|
|
|
def merge_datasets(sparql_data: list, isil_data: list) -> tuple[list, dict]:
    """
    Merge SPARQL and ISIL datasets.

    For each SPARQL institution: when the matching ISIL record (matched by
    ``id``) carries a real Wikidata ID that the SPARQL record lacks, copy
    the missing ISIL identifiers over and note the extra step in the
    provenance. Independently of the ISIL match, strip any synthetic
    Q-numbers (>= Q100000000) still present on the SPARQL record.

    NOTE: institution dicts are mutated in place; ``merged`` holds the
    same dict objects as ``sparql_data``.

    Returns: (merged_data, stats)
    """
    # Index ISIL data by ID for O(1) lookup per SPARQL institution.
    isil_by_id = {inst['id']: inst for inst in isil_data}

    stats = {
        'total_institutions': len(sparql_data),
        'isil_additions': 0,
        'identifiers_added': 0,
        'synthetic_removed': 0,
        'countries_enriched': set()
    }

    merged = []

    for sparql_inst in sparql_data:
        inst_id = sparql_inst['id']
        isil_inst = isil_by_id.get(inst_id)

        if isil_inst:
            # Check whether each side already has a real Wikidata ID.
            sparql_wd = get_wikidata_id(sparql_inst)
            isil_wd = get_wikidata_id(isil_inst)

            # If SPARQL doesn't have Wikidata but ISIL does, add ISIL identifiers.
            if not sparql_wd and isil_wd:
                updated_ids, added = merge_identifiers(sparql_inst, isil_inst)
                sparql_inst['identifiers'] = updated_ids

                if added > 0:
                    stats['isil_additions'] += 1
                    stats['identifiers_added'] += added

                    # Track which country benefited. `or [{}]` guards against
                    # an explicit empty 'locations' list — `.get(..., [{}])`
                    # only covers a *missing* key, so `[0]` would have raised
                    # IndexError on `locations: []`.
                    locations = sparql_inst.get('locations') or [{}]
                    country = locations[0].get('country', '')
                    if country:
                        stats['countries_enriched'].add(country)

                    # Record the enrichment step in provenance, exactly once.
                    prov = sparql_inst.get('provenance', {})
                    if isinstance(prov, dict):
                        method = prov.get('extraction_method', '')
                        if 'ISIL batch enrichment' not in method:
                            prov['extraction_method'] = f"{method} + ISIL batch enrichment"

        # Remove synthetic Q-numbers regardless of whether ISIL matched.
        # Iterate over a snapshot so removal doesn't disturb iteration.
        wikidata_entries = [
            id_obj for id_obj in sparql_inst.get('identifiers', [])
            if id_obj.get('identifier_scheme') == 'Wikidata'
        ]
        for wd_entry in wikidata_entries:
            if not is_real_qnumber(wd_entry.get('identifier_value', '')):
                sparql_inst['identifiers'].remove(wd_entry)
                stats['synthetic_removed'] += 1

        merged.append(sparql_inst)

    return merged, stats
|
|
|
|
|
|
def _has_nl_location(inst: dict) -> bool:
    """True when any of the institution's locations has country 'NL'."""
    return any(loc.get('country') == 'NL' for loc in inst.get('locations', []))


def _nl_coverage(institutions: list) -> tuple[int, int]:
    """Return (NL institutions with a real Wikidata ID, total NL institutions)."""
    nl_insts = [inst for inst in institutions if _has_nl_location(inst)]
    with_wikidata = sum(1 for inst in nl_insts if get_wikidata_id(inst))
    return with_wikidata, len(nl_insts)


def _pct(part: int, whole: int) -> float:
    """Percentage part/whole; 0.0 for an empty denominator.

    Guards the report against ZeroDivisionError when the dataset happens
    to contain no NL institutions.
    """
    return part / whole * 100 if whole else 0.0


def main():
    """Load both enriched datasets, merge them, write the merged YAML, and report."""
    base_dir = Path(__file__).parent.parent
    sparql_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_sparql_enriched.yaml"
    isil_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_wikidata_enriched.yaml"
    output_file = base_dir / "data" / "instances" / "global" / "global_heritage_institutions_merged.yaml"

    print("="*80)
    print("🔀 MERGE SPARQL FUZZY MATCHES + ISIL ENRICHMENTS")
    print("="*80)
    print("\n📖 Loading datasets...\n")

    # Load SPARQL enriched (has fuzzy matches)
    print(" Loading SPARQL enriched file...")
    with open(sparql_file, 'r', encoding='utf-8') as f:
        sparql_data = yaml.safe_load(f)
    print(f" ✅ {len(sparql_data):,} institutions")

    # Load ISIL enriched (has ISIL matches)
    print(" Loading ISIL enriched file...")
    with open(isil_file, 'r', encoding='utf-8') as f:
        isil_data = yaml.safe_load(f)
    print(f" ✅ {len(isil_data):,} institutions\n")

    # Netherlands coverage before the merge.
    nl_sparql, nl_total = _nl_coverage(sparql_data)
    print(f"📊 Netherlands coverage before merge: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)\n")

    # Merge the two datasets (mutates institution dicts in place).
    print("🔀 Merging datasets...\n")
    merged_data, stats = merge_datasets(sparql_data, isil_data)

    # Netherlands coverage after the merge (total is unchanged by design).
    nl_merged, _ = _nl_coverage(merged_data)

    # Write output: YAML comment header followed by the merged records.
    print("💾 Writing merged dataset...\n")

    header = f"""---
# Global Heritage Institutions - Merged SPARQL + ISIL Enrichment
# Generated: {datetime.now(timezone.utc).isoformat()}
#
# Total institutions: {len(merged_data):,}
# ISIL additions: {stats['isil_additions']:,}
# Identifiers added: {stats['identifiers_added']:,}
# Synthetic Q-numbers removed: {stats['synthetic_removed']:,}
# Countries with additional enrichment: {', '.join(sorted(stats['countries_enriched']))}

"""

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(merged_data, f, allow_unicode=True, default_flow_style=False, sort_keys=False, width=120)

    print(f"✅ Complete! Output: {output_file}\n")

    # Final report
    print("="*80)
    print("📊 MERGE REPORT")
    print("="*80)
    print(f"\n✨ Results:")
    print(f" Total institutions: {stats['total_institutions']:,}")
    print(f" Institutions with ISIL additions: {stats['isil_additions']:,}")
    print(f" Total identifiers added: {stats['identifiers_added']:,}")
    print(f" Synthetic Q-numbers removed: {stats['synthetic_removed']:,}")
    print(f" Countries enriched: {', '.join(sorted(stats['countries_enriched'])) or 'None'}")
    print(f"\n🇳🇱 Netherlands Wikidata coverage:")
    print(f" Before: {nl_sparql}/{nl_total} ({_pct(nl_sparql, nl_total):.1f}%)")
    print(f" After: {nl_merged}/{nl_total} ({_pct(nl_merged, nl_total):.1f}%)")
    print(f" Change: {nl_merged - nl_sparql:+,} matches")
    print("="*80 + "\n")
|
|
|
|
|
|
# Script entry point: run the merge only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|