- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
485 lines
18 KiB
Python
485 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Apply Collision Resolution to Dutch Datasets
|
|
|
|
This script:
|
|
1. Parses Dutch ISIL registry (364 institutions)
|
|
2. Parses Dutch organizations CSV (1,351 institutions)
|
|
3. Deduplicates combined dataset to remove true duplicates
|
|
4. Generates GHCIDs for all institutions
|
|
5. Applies collision detection and resolution
|
|
6. Exports merged dataset with collision statistics
|
|
7. Generates detailed collision analysis report
|
|
|
|
Usage:
|
|
python scripts/apply_collision_resolution_dutch_datasets.py
|
|
|
|
Output:
|
|
- data/dutch_institutions_with_ghcids.yaml - Full dataset with resolved GHCIDs
|
|
- data/dutch_collision_report.txt - Detailed collision analysis
|
|
- data/dutch_collision_stats.json - Machine-readable statistics
|
|
- data/dutch_deduplication_report.txt - Deduplication details
|
|
"""
|
|
|
|
import json
from collections import defaultdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Set, Tuple

import yaml

from glam_extractor.identifiers.collision_detector import GHCIDCollisionDetector
from glam_extractor.identifiers.ghcid import GHCIDGenerator, InstitutionType
from glam_extractor.identifiers.lookups import get_ghcid_components_for_dutch_city
from glam_extractor.models import HeritageCustodian
from glam_extractor.parsers.deduplicator import InstitutionDeduplicator
from glam_extractor.parsers.dutch_orgs import DutchOrgsParser
from glam_extractor.parsers.isil_registry import ISILRegistryParser
|
|
|
|
|
|
def generate_ghcid_for_institution(
    institution: HeritageCustodian, generator: GHCIDGenerator
) -> None:
    """
    Generate a GHCID for a HeritageCustodian instance.

    Mutates the institution in-place, setting ghcid, ghcid_uuid,
    ghcid_uuid_sha256 and ghcid_numeric. Institutions without a usable city,
    or whose city cannot be mapped to GHCID components, are left untouched
    (a warning is printed instead).

    Args:
        institution: HeritageCustodian to update
        generator: GHCIDGenerator instance
    """
    locations = institution.locations
    if not locations or not locations[0].city:
        print(f"Warning: No city for {institution.name}, skipping GHCID generation")
        return

    primary = locations[0]
    city = primary.city
    country = primary.country or "NL"

    # Resolve the per-city GHCID components. The InstitutionType lookup sits
    # inside the try so an unknown type string is handled like any other
    # component-resolution failure.
    try:
        inst_type = InstitutionType[institution.institution_type]
        component_dict = get_ghcid_components_for_dutch_city(
            city=city,
            institution_name=institution.name,
            institution_type=inst_type,
        )
    except (ValueError, KeyError) as e:
        print(
            f"Warning: Could not generate GHCID components for {institution.name} "
            f"in {city}: {e}"
        )
        return

    if not component_dict:
        print(f"Warning: No GHCID components for {institution.name} in {city}")
        return

    # Build the GHCID from the resolved components.
    components = generator.generate(
        institution_name=component_dict["institution_name"],
        english_name=component_dict["english_name"],
        institution_type=inst_type,
        country_code=component_dict["country_code"],
        region_code=component_dict["region_code"],
        city_locode=component_dict["city_locode"],
    )

    # Store every identifier representation derived from the components.
    institution.ghcid = components.to_string()
    institution.ghcid_uuid = str(components.to_uuid())
    institution.ghcid_uuid_sha256 = str(components.to_uuid_sha256())
    institution.ghcid_numeric = components.to_numeric()
|
|
|
|
|
|
def analyze_collisions(
    institutions: List[HeritageCustodian],
) -> Dict[str, Any]:
    """
    Analyze resolved institutions and generate collision statistics.

    Institutions sharing a base GHCID (the GHCID with any ``-Q<number>``
    disambiguation suffix stripped) form a collision group. Groups whose
    members all share one extraction date count as first-batch collisions;
    groups with multiple dates count as historical additions.

    Args:
        institutions: List of resolved HeritageCustodian objects

    Returns:
        Dictionary of statistics; the ``collisions_by_city`` and
        ``collisions_by_type`` entries are ``defaultdict(int)`` counters.
    """
    stats: Dict[str, Any] = {
        "total_institutions": len(institutions),
        "institutions_with_collisions": 0,
        "collision_groups": 0,
        "first_batch_groups": 0,
        "historical_addition_groups": 0,
        "q_numbers_added": 0,
        "wikidata_q_numbers": 0,
        "synthetic_q_numbers": 0,
        "ghcid_changes": 0,
        "collisions_by_city": defaultdict(int),
        "collisions_by_type": defaultdict(int),
        "largest_collision_group": 0,
    }

    # Group by base GHCID (identifier without the -Q suffix)
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid_groups[inst.ghcid.split("-Q")[0]].append(inst)

    # Analyze collision groups (2+ institutions per base GHCID)
    for base_ghcid, group in base_ghcid_groups.items():
        if len(group) <= 1:
            continue  # unique GHCID, no collision

        stats["collision_groups"] += 1
        stats["largest_collision_group"] = max(
            stats["largest_collision_group"], len(group)
        )

        # One shared extraction date means the whole group arrived in a
        # single batch; multiple dates mean later (historical) additions.
        unique_dates = {inst.provenance.extraction_date.date() for inst in group}
        if len(unique_dates) == 1:
            stats["first_batch_groups"] += 1
        else:
            stats["historical_addition_groups"] += 1

        for inst in group:
            stats["institutions_with_collisions"] += 1

            # Track city and type of colliding institutions
            if inst.locations and inst.locations[0].city:
                stats["collisions_by_city"][inst.locations[0].city] += 1
            stats["collisions_by_type"][inst.institution_type] += 1

            # A -Q suffix means the resolver disambiguated this GHCID
            if inst.ghcid and "-Q" in inst.ghcid:
                stats["q_numbers_added"] += 1

                # Wikidata-backed Q-number vs. a synthetic one we generated
                if any(
                    i.identifier_scheme == "Wikidata"
                    for i in (inst.identifiers or [])
                ):
                    stats["wikidata_q_numbers"] += 1
                else:
                    stats["synthetic_q_numbers"] += 1

            # More than one history entry means the GHCID was changed
            if inst.ghcid_history and len(inst.ghcid_history) > 1:
                stats["ghcid_changes"] += 1

    return stats
|
|
|
|
|
|
def generate_collision_report(
    institutions: List[HeritageCustodian], stats: Dict[str, Any]
) -> str:
    """
    Generate human-readable collision analysis report.

    Args:
        institutions: List of resolved HeritageCustodian objects
        stats: Statistics dictionary from analyze_collisions()

    Returns:
        Multi-line string report
    """
    total = stats["total_institutions"]
    # Guard against an empty dataset so the percentage line cannot divide
    # by zero.
    collision_pct = (
        stats["institutions_with_collisions"] / total * 100 if total else 0.0
    )

    lines = [
        "=" * 80,
        "GHCID Collision Analysis Report - Dutch Heritage Institutions",
        "=" * 80,
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        "",
        "OVERVIEW",
        "-" * 80,
        f"Total institutions processed: {total:,}",
        f"Institutions with collisions: {stats['institutions_with_collisions']:,} "
        f"({collision_pct:.1f}%)",
        f"Collision groups detected: {stats['collision_groups']:,}",
        f" - First batch collisions: {stats['first_batch_groups']:,}",
        f" - Historical additions: {stats['historical_addition_groups']:,}",
        f"Largest collision group: {stats['largest_collision_group']} institutions",
        "",
        "Q-NUMBER ASSIGNMENT",
        "-" * 80,
        f"Q-numbers added: {stats['q_numbers_added']:,}",
        f" - From Wikidata: {stats['wikidata_q_numbers']:,}",
        f" - Synthetic (generated): {stats['synthetic_q_numbers']:,}",
        f"GHCID changes tracked: {stats['ghcid_changes']:,}",
        "",
        "COLLISIONS BY CITY",
        "-" * 80,
    ]

    # Sort cities by collision count
    cities_sorted = sorted(
        stats["collisions_by_city"].items(), key=lambda x: x[1], reverse=True
    )
    for city, count in cities_sorted[:20]:  # Top 20 cities
        lines.append(f" {city:<30} {count:>4} institutions")

    lines.extend([
        "",
        "COLLISIONS BY INSTITUTION TYPE",
        "-" * 80,
    ])

    # Sort types by collision count
    types_sorted = sorted(
        stats["collisions_by_type"].items(), key=lambda x: x[1], reverse=True
    )
    for inst_type, count in types_sorted:
        lines.append(f" {inst_type:<30} {count:>4} institutions")

    lines.extend([
        "",
        "DETAILED COLLISION GROUPS",
        "-" * 80,
        "",
    ])

    # Group by base GHCID for detailed listing
    base_ghcid_groups = defaultdict(list)
    for inst in institutions:
        if inst.ghcid:
            base_ghcid_groups[inst.ghcid.split("-Q")[0]].append(inst)

    # Show collision groups (2+ institutions per base GHCID), largest first
    collision_groups = [
        (base_ghcid, group)
        for base_ghcid, group in base_ghcid_groups.items()
        if len(group) > 1
    ]
    collision_groups.sort(key=lambda x: len(x[1]), reverse=True)

    for i, (base_ghcid, group) in enumerate(collision_groups[:50], 1):  # Top 50 groups
        lines.append(f"{i}. Base GHCID: {base_ghcid}")
        lines.append(f" {len(group)} institutions:")

        for inst in group:
            # Show the resolved (suffixed) GHCID when a Q-number was added
            q_suffix = ""
            if inst.ghcid and "-Q" in inst.ghcid:
                q_suffix = f" → {inst.ghcid}"

            city = inst.locations[0].city if inst.locations else "Unknown"
            lines.append(f" - {inst.name} ({city}){q_suffix}")

        lines.append("")

    lines.append("=" * 80)

    return "\n".join(lines)
|
|
|
|
|
|
def main():
    """
    Run the end-to-end GHCID collision-resolution pipeline.

    Parses the ISIL registry and Dutch organizations CSVs, deduplicates the
    combined dataset, generates GHCIDs, resolves collisions, and writes the
    merged dataset plus collision/deduplication reports into ``data/``.
    Returns early with an error message if either input file is missing.
    """
    print("GHCID Collision Resolution - Dutch Datasets")
    print("=" * 80)

    # Initialize parsers and detector
    isil_parser = ISILRegistryParser()
    dutch_parser = DutchOrgsParser()
    detector = GHCIDCollisionDetector()
    generator = GHCIDGenerator()

    # Parse ISIL registry
    print("\n1. Parsing ISIL registry...")
    isil_csv = Path("data/ISIL-codes_2025-08-01.csv")
    if not isil_csv.exists():
        print(f"ERROR: ISIL registry not found at {isil_csv}")
        return

    isil_institutions = isil_parser.parse_and_convert(isil_csv)
    print(f" Loaded {len(isil_institutions):,} institutions from ISIL registry")

    # Parse Dutch organizations
    print("\n2. Parsing Dutch organizations CSV...")
    dutch_csv = Path(
        "data/voorbeeld_lijst_organisaties_en_diensten-totaallijst_nederland.csv"
    )
    if not dutch_csv.exists():
        print(f"ERROR: Dutch organizations CSV not found at {dutch_csv}")
        return

    dutch_institutions = dutch_parser.parse_and_convert(dutch_csv)
    print(f" Loaded {len(dutch_institutions):,} institutions from Dutch orgs CSV")

    # Combine datasets
    all_institutions = isil_institutions + dutch_institutions
    print(f"\n3. Combined dataset: {len(all_institutions):,} institutions")

    # Deduplicate before GHCID generation so identical records from both
    # sources don't produce duplicate identifiers.
    print("\n4. Deduplicating institutions...")
    deduplicator = InstitutionDeduplicator()
    deduplicated_institutions = deduplicator.deduplicate(
        all_institutions,
        merge_metadata=True,
    )

    duplicates_removed = len(all_institutions) - len(deduplicated_institutions)
    print(f" Removed {duplicates_removed:,} duplicates")
    print(f" {len(deduplicated_institutions):,} unique institutions remain")

    # Generate deduplication report
    dedup_report_lines = [
        "Dutch Dataset Deduplication Report",
        "=" * 80,
        f"\nGenerated: {datetime.now(timezone.utc).isoformat()}",
        f"\nTotal institutions (before deduplication): {len(all_institutions):,}",
        f"Unique institutions (after deduplication): {len(deduplicated_institutions):,}",
        f"Duplicates removed: {duplicates_removed:,}",
        f"\nDuplicate groups detected: {len(deduplicator.duplicate_groups)}",
        "\n" + "=" * 80,
        "\nDuplicate Groups:\n",
    ]

    for i, group in enumerate(deduplicator.duplicate_groups, 1):
        dedup_report_lines.append(f"\nGroup {i} ({len(group)} duplicates):")
        for inst in group:
            city = inst.locations[0].city if inst.locations else "Unknown"
            tier = inst.provenance.data_tier if inst.provenance else "Unknown"
            dedup_report_lines.append(f" - {inst.name} ({city}) [{tier}]")

    dedup_report = "\n".join(dedup_report_lines)

    # Use deduplicated set for GHCID generation
    all_institutions = deduplicated_institutions

    # Generate GHCIDs for all institutions
    print("\n5. Generating GHCIDs...")
    successful_ghcids = 0
    for inst in all_institutions:
        if not inst.ghcid:  # Only generate if not already present
            generate_ghcid_for_institution(inst, generator)
        if inst.ghcid:
            successful_ghcids += 1

    print(f" Generated GHCIDs for {successful_ghcids:,} institutions")

    # Filter institutions with GHCIDs for collision detection
    institutions_with_ghcids = [inst for inst in all_institutions if inst.ghcid]
    print(
        f" {len(institutions_with_ghcids):,} institutions have GHCIDs "
        f"({len(all_institutions) - len(institutions_with_ghcids):,} without)"
    )

    # Detect and resolve collisions
    print("\n6. Resolving collisions...")
    resolved_institutions = detector.resolve_collisions(institutions_with_ghcids)
    print(f" Resolved {len(resolved_institutions):,} institutions")

    # Analyze collisions
    print("\n7. Analyzing collision patterns...")
    stats = analyze_collisions(resolved_institutions)

    # Generate report
    print("\n8. Generating collision report...")
    report = generate_collision_report(resolved_institutions, stats)

    # Export results
    output_dir = Path("data")
    output_dir.mkdir(exist_ok=True)

    print("\n9. Exporting results...")

    # Convert to plain dicts for YAML serialization
    institutions_dict = [
        {
            "id": inst.id,
            "name": inst.name,
            "institution_type": inst.institution_type,
            "ghcid": inst.ghcid,
            "ghcid_uuid": inst.ghcid_uuid,
            "ghcid_uuid_sha256": inst.ghcid_uuid_sha256,
            "ghcid_numeric": inst.ghcid_numeric,
            "identifiers": [
                {
                    "identifier_scheme": i.identifier_scheme,
                    "identifier_value": i.identifier_value,
                    "identifier_url": str(i.identifier_url) if i.identifier_url else None,
                }
                for i in (inst.identifiers or [])
            ],
            "locations": [
                {
                    "city": loc.city,
                    "country": loc.country,
                    "street_address": loc.street_address,
                }
                for loc in (inst.locations or [])
            ],
            "provenance": {
                "data_source": inst.provenance.data_source,
                "data_tier": inst.provenance.data_tier,
                "extraction_date": inst.provenance.extraction_date.isoformat(),
                "confidence_score": inst.provenance.confidence_score,
            },
            "ghcid_history": [
                {
                    "ghcid": entry.ghcid,
                    "ghcid_numeric": entry.ghcid_numeric,
                    "valid_from": entry.valid_from.isoformat(),
                    "valid_to": entry.valid_to.isoformat() if entry.valid_to else None,
                    "reason": entry.reason,
                }
                for entry in (inst.ghcid_history or [])
            ] if inst.ghcid_history else [],
        }
        for inst in resolved_institutions
    ]

    # Export merged dataset. utf-8 + allow_unicode keep Dutch diacritics
    # readable and avoid UnicodeEncodeError on platforms whose default
    # codec is not UTF-8 (e.g. cp1252 on Windows).
    yaml_path = output_dir / "dutch_institutions_with_ghcids.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(
            institutions_dict,
            f,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
    print(f" ✓ Exported dataset to {yaml_path}")

    # Export collision report (contains "→", which requires UTF-8)
    report_path = output_dir / "dutch_collision_report.txt"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)
    print(f" ✓ Exported collision report to {report_path}")

    # Export statistics
    stats_path = output_dir / "dutch_collision_stats.json"
    # Convert defaultdict to regular dict for JSON serialization
    stats_serializable = {
        k: dict(v) if isinstance(v, defaultdict) else v
        for k, v in stats.items()
    }
    with open(stats_path, "w", encoding="utf-8") as f:
        json.dump(stats_serializable, f, indent=2, default=str)
    print(f" ✓ Exported statistics to {stats_path}")

    # Export deduplication report
    dedup_report_path = output_dir / "dutch_deduplication_report.txt"
    with open(dedup_report_path, "w", encoding="utf-8") as f:
        f.write(dedup_report)
    print(f" ✓ Exported deduplication report to {dedup_report_path}")

    # Print summary
    print("\n" + "=" * 80)
    print("SUMMARY")
    print("=" * 80)
    print(f"Initial institutions: {len(isil_institutions) + len(dutch_institutions):,}")
    print(f"Duplicates removed: {duplicates_removed:,}")
    print(f"Unique institutions: {stats['total_institutions']:,}")
    print(f"Collision groups: {stats['collision_groups']:,}")
    print(f"Q-numbers added: {stats['q_numbers_added']:,}")
    print(f" - Wikidata: {stats['wikidata_q_numbers']:,}")
    print(f" - Synthetic: {stats['synthetic_q_numbers']:,}")
    print("\nOutput files:")
    print(f" - {yaml_path}")
    print(f" - {report_path}")
    print(f" - {stats_path}")
    print(f" - {dedup_report_path}")
    print("=" * 80)
|
|
|
|
|
|
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|