glam/scripts/apply_collision_resolution_dutch_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

485 lines
18 KiB
Python

#!/usr/bin/env python3
"""
Apply Collision Resolution to Dutch Datasets
This script:
1. Parses Dutch ISIL registry (364 institutions)
2. Parses Dutch organizations CSV (1,351 institutions)
3. Deduplicates combined dataset to remove true duplicates
4. Generates GHCIDs for all institutions
5. Applies collision detection and resolution
6. Exports merged dataset with collision statistics
7. Generates detailed collision analysis report
Usage:
python scripts/apply_collision_resolution_dutch_datasets.py
Output:
- data/dutch_institutions_with_ghcids.yaml - Full dataset with resolved GHCIDs
- data/dutch_collision_report.txt - Detailed collision analysis
- data/dutch_collision_stats.json - Machine-readable statistics
- data/dutch_deduplication_report.txt - Deduplication details
"""
import json
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Set, Tuple
import yaml
from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
from glam_extractor.identifiers.ghcid import GHCIDGenerator, InstitutionType
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
from glam_extractor.models import HeritageCustodian
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.parsers.isil_registry import ISILRegistryParser
def generate_ghcid_for_institution(
    institution: HeritageCustodian, generator: GHCIDGenerator
) -> None:
    """
    Generate a GHCID for a HeritageCustodian instance.

    Updates the institution in-place with ``ghcid``, ``ghcid_uuid``,
    ``ghcid_uuid_sha256`` and ``ghcid_numeric``. Institutions without a
    usable city (required to derive the location components) are skipped
    with a warning and left unmodified.

    Args:
        institution: HeritageCustodian to update in-place.
        generator: GHCIDGenerator used to build the identifier components.
    """
    if not institution.locations or not institution.locations[0].city:
        print(f"Warning: No city for {institution.name}, skipping GHCID generation")
        return
    city = institution.locations[0].city
    # Get GHCID components for this city. Resolve the InstitutionType enum
    # once, inside the guarded block: the original code repeated the lookup
    # later outside any try/except, where a KeyError would be uncaught.
    try:
        inst_type = InstitutionType[institution.institution_type]
        component_dict = get_ghcid_components_for_dutch_city(
            city=city,
            institution_name=institution.name,
            institution_type=inst_type,
        )
    except (ValueError, KeyError) as e:
        print(
            f"Warning: Could not generate GHCID components for {institution.name} "
            f"in {city}: {e}"
        )
        return
    if not component_dict:
        print(f"Warning: No GHCID components for {institution.name} in {city}")
        return
    # Generate GHCID components
    components = generator.generate(
        institution_name=component_dict["institution_name"],
        english_name=component_dict["english_name"],
        institution_type=inst_type,
        country_code=component_dict["country_code"],
        region_code=component_dict["region_code"],
        city_locode=component_dict["city_locode"],
    )
    # Update institution with all identifier formats
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()
def analyze_collisions(
    institutions: List[HeritageCustodian],
) -> Dict[str, any]:
    """
    Compute collision statistics over a list of resolved institutions.

    Institutions sharing the same base GHCID (the identifier with any
    ``-Q`` disambiguation suffix removed) form a collision group.

    Args:
        institutions: Resolved HeritageCustodian objects.

    Returns:
        Dictionary of counters; ``collisions_by_city`` and
        ``collisions_by_type`` are defaultdict(int) breakdowns.
    """
    stats = {
        "total_institutions": len(institutions),
        "institutions_with_collisions": 0,
        "collision_groups": 0,
        "first_batch_groups": 0,
        "historical_addition_groups": 0,
        "q_numbers_added": 0,
        "wikidata_q_numbers": 0,
        "synthetic_q_numbers": 0,
        "ghcid_changes": 0,
        "collisions_by_city": defaultdict(int),
        "collisions_by_type": defaultdict(int),
        "largest_collision_group": 0,
    }

    # Bucket institutions by base GHCID (Q-number suffix stripped).
    buckets = defaultdict(list)
    for institution in institutions:
        if institution.ghcid:
            buckets[institution.ghcid.split("-Q")[0]].append(institution)

    # Walk each bucket; only buckets with 2+ members are collisions.
    for members in buckets.values():
        if len(members) < 2:
            continue
        stats["collision_groups"] += 1
        if len(members) > stats["largest_collision_group"]:
            stats["largest_collision_group"] = len(members)

        # One distinct extraction date => all members arrived in the same
        # batch; otherwise a later addition collided with existing records.
        batch_days = {m.provenance.extraction_date.date() for m in members}
        if len(batch_days) == 1:
            stats["first_batch_groups"] += 1
        else:
            stats["historical_addition_groups"] += 1

        for member in members:
            stats["institutions_with_collisions"] += 1
            # City/type breakdowns for the report.
            if member.locations and member.locations[0].city:
                stats["collisions_by_city"][member.locations[0].city] += 1
            stats["collisions_by_type"][member.institution_type] += 1
            # A "-Q" in the resolved GHCID means a Q-number was appended.
            if member.ghcid and "-Q" in member.ghcid:
                stats["q_numbers_added"] += 1
                # Wikidata-sourced Q-number vs. synthetically generated one.
                has_wikidata = any(
                    ident.identifier_scheme == "Wikidata"
                    for ident in (member.identifiers or [])
                )
                if has_wikidata:
                    stats["wikidata_q_numbers"] += 1
                else:
                    stats["synthetic_q_numbers"] += 1
            # More than one history entry => the GHCID changed at some point.
            if member.ghcid_history and len(member.ghcid_history) > 1:
                stats["ghcid_changes"] += 1

    return stats
def generate_collision_report(
    institutions: List[HeritageCustodian], stats: Dict[str, any]
) -> str:
    """
    Generate human-readable collision analysis report.

    Args:
        institutions: List of resolved HeritageCustodian objects.
        stats: Statistics dictionary from analyze_collisions().

    Returns:
        Multi-line string report.
    """
    total = stats["total_institutions"]
    colliding = stats["institutions_with_collisions"]
    # Guard the percentage against an empty dataset: the original divided
    # unconditionally and raised ZeroDivisionError when total == 0.
    collision_pct = (colliding / total * 100) if total else 0.0
    lines = [
        "=" * 80,
        "GHCID Collision Analysis Report - Dutch Heritage Institutions",
        "=" * 80,
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "OVERVIEW",
        "-" * 80,
        f"Total institutions processed: {total:,}",
        f"Institutions with collisions: {colliding:,} "
        f"({collision_pct:.1f}%)",
        f"Collision groups detected: {stats['collision_groups']:,}",
        f" - First batch collisions: {stats['first_batch_groups']:,}",
        f" - Historical additions: {stats['historical_addition_groups']:,}",
        f"Largest collision group: {stats['largest_collision_group']} institutions",
        "",
        "Q-NUMBER ASSIGNMENT",
        "-" * 80,
        f"Q-numbers added: {stats['q_numbers_added']:,}",
        f" - From Wikidata: {stats['wikidata_q_numbers']:,}",
        f" - Synthetic (generated): {stats['synthetic_q_numbers']:,}",
        f"GHCID changes tracked: {stats['ghcid_changes']:,}",
        "",
        "COLLISIONS BY CITY",
        "-" * 80,
    ]
    # Sort cities by collision count
    cities_sorted = sorted(
        stats["collisions_by_city"].items(), key=lambda x: x[1], reverse=True
    )
    for city, count in cities_sorted[:20]:  # Top 20 cities
        lines.append(f" {city:<30} {count:>4} institutions")
    lines.extend([
        "",
        "COLLISIONS BY INSTITUTION TYPE",
        "-" * 80,
    ])
    # Sort types by collision count
    types_sorted = sorted(
        stats["collisions_by_type"].items(), key=lambda x: x[1], reverse=True
    )
    for inst_type, count in types_sorted:
        lines.append(f" {inst_type:<30} {count:>4} institutions")
    lines.extend([
        "",
        "DETAILED COLLISION GROUPS",
        "-" * 80,
        "",
    ])
    # Group by base GHCID (Q-number suffix stripped) for detailed listing
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid = inst.ghcid.split("-Q")[0]
            base_ghcid_groups[base_ghcid].append(inst)
    # Show collision groups (2+ institutions per base GHCID)
    collision_groups = [
        (base_ghcid, group)
        for base_ghcid, group in base_ghcid_groups.items()
        if len(group) > 1
    ]
    collision_groups.sort(key=lambda x: len(x[1]), reverse=True)
    for i, (base_ghcid, group) in enumerate(collision_groups[:50], 1):  # Top 50 groups
        lines.append(f"{i}. Base GHCID: {base_ghcid}")
        lines.append(f" {len(group)} institutions:")
        for inst in group:
            # Show the fully-resolved GHCID when a Q-number was appended.
            # Fixed formatting: the original glued the GHCID directly onto
            # the "(city)" text with no separating space.
            q_suffix = ""
            if inst.ghcid and "-Q" in inst.ghcid:
                q_suffix = f" {inst.ghcid}"
            city = inst.locations[0].city if inst.locations else "Unknown"
            lines.append(f" - {inst.name} ({city}){q_suffix}")
        lines.append("")
    lines.append("=" * 80)
    return "\n".join(lines)
def main():
    """
    Main execution function.

    Pipeline: parse both Dutch source datasets, combine them, deduplicate,
    generate GHCIDs, resolve GHCID collisions, then export the merged
    dataset plus the collision and deduplication reports under data/.
    Exits early (with an ERROR message) if either input CSV is missing.
    """
    print("GHCID Collision Resolution - Dutch Datasets")
    print("=" * 80)
    # Initialize parsers and detector
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    detector = GHCIDCollisionDetector()
    generator = GHCIDGenerator()
    # Parse ISIL registry
    print("\n1. Parsing ISIL registry...")
    isil_csv = Path("data/ISIL-codes_2025-08-01.csv")
    if not isil_csv.exists():
        print(f"ERROR: ISIL registry not found at {isil_csv}")
        return
    isil_institutions = isil_parser.parse_and_convert(isil_csv)
    print(f" Loaded {len(isil_institutions):,} institutions from ISIL registry")
    # Parse Dutch organizations
    print("\n2. Parsing Dutch organizations CSV...")
    dutch_csv = Path(
        "data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv"
    )
    if not dutch_csv.exists():
        print(f"ERROR: Dutch organizations CSV not found at {dutch_csv}")
        return
    dutch_institutions = dutch_parser.parse_and_convert(dutch_csv)
    print(f" Loaded {len(dutch_institutions):,} institutions from Dutch orgs CSV")
    # Combine datasets
    all_institutions = isil_institutions + dutch_institutions
    print(f"\n3. Combined dataset: {len(all_institutions):,} institutions")
    # Deduplicate before GHCID generation so true duplicates never
    # register as collisions.
    print("\n4. Deduplicating institutions...")
    deduplicator = InstitutionDeduplicator()
    deduplicated_institutions = deduplicator.deduplicate(
        all_institutions,
        merge_metadata=True
    )
    duplicates_removed = len(all_institutions) - len(deduplicated_institutions)
    print(f" Removed {duplicates_removed:,} duplicates")
    print(f" {len(deduplicated_institutions):,} unique institutions remain")
    # Generate deduplication report
    dedup_report_lines = [
        "Dutch Dataset Deduplication Report",
        "=" * 80,
        f"\nGenerated: {datetime.now(timezone.utc).isoformat()}",
        f"\nTotal institutions (before deduplication): {len(all_institutions):,}",
        f"Unique institutions (after deduplication): {len(deduplicated_institutions):,}",
        f"Duplicates removed: {duplicates_removed:,}",
        f"\nDuplicate groups detected: {len(deduplicator.duplicate_groups)}",
        "\n" + "=" * 80,
        "\nDuplicate Groups:\n"
    ]
    for i, group in enumerate(deduplicator.duplicate_groups, 1):
        dedup_report_lines.append(f"\nGroup {i} ({len(group)} duplicates):")
        for inst in group:
            city = inst.locations[0].city if inst.locations else "Unknown"
            tier = inst.provenance.data_tier if inst.provenance else "Unknown"
            dedup_report_lines.append(f" - {inst.name} ({city}) [{tier}]")
    dedup_report = "\n".join(dedup_report_lines)
    # Use deduplicated set for GHCID generation
    all_institutions = deduplicated_institutions
    # Generate GHCIDs for all institutions
    print("\n5. Generating GHCIDs...")
    successful_ghcids = 0
    for inst in all_institutions:
        if not inst.ghcid:  # Only generate if not already present
            generate_ghcid_for_institution(inst, generator)
        if inst.ghcid:
            successful_ghcids += 1
    print(f" Generated GHCIDs for {successful_ghcids:,} institutions")
    # Filter institutions with GHCIDs for collision detection
    institutions_with_ghcids = [inst for inst in all_institutions if inst.ghcid]
    print(
        f" {len(institutions_with_ghcids):,} institutions have GHCIDs "
        f"({len(all_institutions) - len(institutions_with_ghcids):,} without)"
    )
    # Detect and resolve collisions
    print("\n6. Resolving collisions...")
    resolved_institutions = detector.resolve_collisions(institutions_with_ghcids)
    print(f" Resolved {len(resolved_institutions):,} institutions")
    # Analyze collisions
    print("\n7. Analyzing collision patterns...")
    stats = analyze_collisions(resolved_institutions)
    # Generate report
    print("\n8. Generating collision report...")
    report = generate_collision_report(resolved_institutions, stats)
    # Export results
    output_dir = Path("data")
    output_dir.mkdir(exist_ok=True)
    # Export merged dataset
    print("\n9. Exporting results...")
    # Convert to plain dicts for YAML serialization (pydantic/dataclass
    # objects are flattened field by field).
    institutions_dict = [
        {
            "id": inst.id,
            "name": inst.name,
            "institution_type": inst.institution_type,
            "ghcid": inst.ghcid,
            "ghcid_uuid": inst.ghcid_uuid,
            "ghcid_uuid_sha256": inst.ghcid_uuid_sha256,
            "ghcid_numeric": inst.ghcid_numeric,
            "identifiers": [
                {
                    "identifier_scheme": i.identifier_scheme,
                    "identifier_value": i.identifier_value,
                    "identifier_url": str(i.identifier_url) if i.identifier_url else None,
                }
                for i in (inst.identifiers or [])
            ],
            "locations": [
                {
                    "city": loc.city,
                    "country": loc.country,
                    "street_address": loc.street_address,
                }
                for loc in (inst.locations or [])
            ],
            "provenance": {
                "data_source": inst.provenance.data_source,
                "data_tier": inst.provenance.data_tier,
                "extraction_date": inst.provenance.extraction_date.isoformat(),
                "confidence_score": inst.provenance.confidence_score,
            },
            "ghcid_history": [
                {
                    "ghcid": entry.ghcid,
                    "ghcid_numeric": entry.ghcid_numeric,
                    "valid_from": entry.valid_from.isoformat(),
                    "valid_to": entry.valid_to.isoformat() if entry.valid_to else None,
                    "reason": entry.reason,
                }
                for entry in (inst.ghcid_history or [])
            ] if inst.ghcid_history else [],
        }
        for inst in resolved_institutions
    ]
    # All output files are written as UTF-8 explicitly: institution names
    # contain Dutch diacritics, and the platform default encoding (e.g.
    # cp1252 on Windows) could raise UnicodeEncodeError.
    yaml_path = output_dir / "dutch_institutions_with_ghcids.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(institutions_dict, f, default_flow_style=False, sort_keys=False)
    print(f" ✓ Exported dataset to {yaml_path}")
    # Export collision report
    report_path = output_dir / "dutch_collision_report.txt"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f" ✓ Exported collision report to {report_path}")
    # Export statistics
    stats_path = output_dir / "dutch_collision_stats.json"
    # Convert defaultdict to regular dict for JSON serialization
    stats_serializable = {
        k: dict(v) if isinstance(v, defaultdict) else v
        for k, v in stats.items()
    }
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats_serializable, f, indent=2, default=str)
    print(f" ✓ Exported statistics to {stats_path}")
    # Export deduplication report
    dedup_report_path = output_dir / "dutch_deduplication_report.txt"
    with open(dedup_report_path, "w", encoding="utf-8") as f:
        f.write(dedup_report)
    print(f" ✓ Exported deduplication report to {dedup_report_path}")
    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Initial institutions: {len(isil_institutions) + len(dutch_institutions):,}")
    print(f"Duplicates removed: {duplicates_removed:,}")
    print(f"Unique institutions: {stats['total_institutions']:,}")
    print(f"Collision groups: {stats['collision_groups']:,}")
    print(f"Q-numbers added: {stats['q_numbers_added']:,}")
    print(f" - Wikidata: {stats['wikidata_q_numbers']:,}")
    print(f" - Synthetic: {stats['synthetic_q_numbers']:,}")
    print("\nOutput files:")
    print(f" - {yaml_path}")
    print(f" - {report_path}")
    print(f" - {stats_path}")
    print(f" - {dedup_report_path}")
    print("=" * 80)
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()