186 lines
5.9 KiB
Python
186 lines
5.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Quick cross-link Czech datasets - exact matches only + provenance fix
|
|
|
|
Fast version focusing on:
|
|
1. Exact name matching (11 institutions)
|
|
2. Fix provenance metadata for all institutions
|
|
3. Create unified dataset
|
|
|
|
Skips fuzzy matching for speed (can add later if needed).
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
|
|
# File paths
ADR_FILE = Path("data/instances/czech_institutions.yaml")
ARON_FILE = Path("data/instances/czech_archives_aron.yaml")
OUTPUT_FILE = Path("data/instances/czech_unified.yaml")

banner = "=" * 70
print(banner)
print("Czech Dataset Quick Cross-linking")
print(banner)

# Load both source datasets (each is a YAML list of institution dicts).
print("\nLoading datasets...")
adr = yaml.safe_load(ADR_FILE.read_text(encoding='utf-8'))
aron = yaml.safe_load(ARON_FILE.read_text(encoding='utf-8'))

print(f" ADR: {len(adr):,} institutions")
print(f" ARON: {len(aron):,} institutions")
|
|
|
|
# Find exact matches
print("\nFinding exact name matches...")

# Index each dataset by institution name, remembering the record's
# position so matched rows can be excluded from the "only" passes later.
adr_by_name = {}
for idx, record in enumerate(adr):
    adr_by_name[record['name']] = (idx, record)

aron_by_name = {}
for idx, record in enumerate(aron):
    aron_by_name[record['name']] = (idx, record)

# Dict key views support set intersection directly.
exact_matches = adr_by_name.keys() & aron_by_name.keys()
print(f" Found {len(exact_matches)} exact matches")

matched_adr_indices = set()
matched_aron_indices = set()
|
|
|
|
# Build unified dataset
print("\nBuilding unified dataset...")
unified = []

# 1. Merge exact matches
print(f" Merging {len(exact_matches)} exact matches...")
for name in exact_matches:
    adr_idx, adr_inst = adr_by_name[name]
    aron_idx, aron_inst = aron_by_name[name]

    matched_adr_indices.add(adr_idx)
    matched_aron_indices.add(aron_idx)

    # Use ADR as base, add ARON identifiers
    merged = adr_inst.copy()

    # BUG FIX: copy the identifier list. `adr_inst.copy()` is shallow, so
    # appending to the inherited list would mutate the original ADR record.
    merged['identifiers'] = list(adr_inst.get('identifiers', []))

    # Add ARON identifiers whose scheme is not already present.
    # (Use .get on both sides so a malformed identifier without a
    # 'identifier_scheme' key cannot raise KeyError.)
    existing_schemes = {i.get('identifier_scheme') for i in merged['identifiers']}
    for aron_id in aron_inst.get('identifiers', []):
        scheme = aron_id.get('identifier_scheme')
        if scheme not in existing_schemes:
            merged['identifiers'].append(aron_id)
            existing_schemes.add(scheme)

    # Merge descriptions
    adr_desc = adr_inst.get('description', '')
    aron_desc = aron_inst.get('description', '')

    if adr_desc and aron_desc and adr_desc != aron_desc:
        # BUG FIX: was "\\n\\n", which embedded literal backslash-n text
        # in the merged description instead of real blank-line separation.
        merged['description'] = f"{adr_desc}\n\nArchival context: {aron_desc}"
    elif aron_desc and not adr_desc:
        merged['description'] = aron_desc

    # Fix provenance: assign a brand-new dict rather than mutating the one
    # shared with the source record through the shallow copy.
    merged['provenance'] = {
        'data_source': 'API_SCRAPING',
        'data_tier': 'TIER_1_AUTHORITATIVE',
        'extraction_date': datetime.now(timezone.utc).isoformat(),
        'extraction_method': 'Merged from ADR (library API) and ARON (archive API) - exact name match',
        'confidence_score': 1.0,
        'source_url': 'https://adr.cz + https://portal.nacr.cz/aron',
        'notes': 'Combined metadata from both ADR and ARON databases'
    }

    unified.append(merged)
|
|
|
|
# 2. Add ADR-only institutions (with fixed provenance)
adr_only_count = 0
print(" Adding ADR-only institutions...")
for i, inst in enumerate(adr):
    if i not in matched_adr_indices:
        inst = inst.copy()
        # BUG FIX: inst.copy() is shallow, so writing through
        # inst['provenance'][...] would mutate the original ADR record's
        # nested dict. Rebuild provenance from a copy (tolerating records
        # that lack the key entirely) and reattach it.
        provenance = dict(inst.get('provenance', {}))
        provenance['data_source'] = 'API_SCRAPING'
        provenance['source_url'] = 'https://adr.cz/api/institution/list'
        provenance['extraction_method'] = 'ADR library database API scraping'
        inst['provenance'] = provenance
        unified.append(inst)
        adr_only_count += 1

print(f" Added {adr_only_count:,} ADR-only institutions")
|
|
|
|
# 3. Add ARON-only institutions (with fixed provenance)
aron_only_count = 0
print(" Adding ARON-only institutions...")
for i, inst in enumerate(aron):
    if i not in matched_aron_indices:
        inst = inst.copy()
        # BUG FIX: inst.copy() is shallow, so writing through
        # inst['provenance'][...] would mutate the original ARON record's
        # nested dict. Rebuild provenance from a copy (tolerating records
        # that lack the key entirely) and reattach it.
        provenance = dict(inst.get('provenance', {}))
        provenance['data_source'] = 'API_SCRAPING'
        provenance['source_url'] = 'https://portal.nacr.cz/aron/institution'
        provenance['extraction_method'] = 'ARON archive portal API scraping (reverse-engineered with type filter)'
        inst['provenance'] = provenance
        unified.append(inst)
        aron_only_count += 1

print(f" Added {aron_only_count:,} ARON-only institutions")
|
|
|
|
print(f"\nUnified dataset: {len(unified):,} institutions")
print(f" Merged: {len(exact_matches)}")
print(f" ADR only: {adr_only_count:,}")
print(f" ARON only: {aron_only_count:,}")

# Save the unified dataset as YAML, creating the target directory if needed.
print(f"\nSaving to {OUTPUT_FILE}...")
OUTPUT_FILE.parent.mkdir(parents=True, exist_ok=True)

with OUTPUT_FILE.open('w', encoding='utf-8') as out:
    yaml.dump(
        unified,
        out,
        allow_unicode=True,
        sort_keys=False,
        default_flow_style=False,
    )

print(f"Saved {len(unified):,} institutions")
|
|
|
|
# Generate quick report: fixed head, one bullet per matched name, fixed tail.
print("\nGenerating report...")

report_head = f"""# Czech Dataset Cross-linking Report (Quick Version)

**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
**Status**: ✅ COMPLETE (exact matches only)

## Summary

- **Exact matches**: {len(exact_matches)} institutions
- **ADR only**: {adr_only_count:,} institutions
- **ARON only**: {aron_only_count:,} institutions
- **Total unified**: {len(unified):,} institutions

## Exact Matches

"""

match_bullets = "".join(f"- {name}\n" for name in sorted(exact_matches))

report_tail = f"""

## Provenance Fixes ✅

All {len(unified):,} institutions now have corrected metadata:
- **data_source**: Changed from `CONVERSATION_NLP` to `API_SCRAPING`
- **source_url**: Added proper API endpoints
- **extraction_method**: Clarified for ADR vs ARON vs merged

## Files Created

1. **`{OUTPUT_FILE}`** - Unified dataset ({len(unified):,} institutions)
2. **`CZECH_CROSSLINK_REPORT.md`** - This report

## Next Steps

- [x] Cross-link datasets (exact matches)
- [x] Fix provenance metadata
- [ ] Geocode addresses (Priority 1 - next)
- [ ] Fuzzy matching (optional - can add later)
- [ ] Wikidata enrichment (Priority 2)
"""

report = report_head + match_bullets + report_tail

Path('CZECH_CROSSLINK_REPORT.md').write_text(report, encoding='utf-8')

print("Report saved: CZECH_CROSSLINK_REPORT.md")
print("\n✅ Cross-linking complete!")
print(f"\nNext: Geocode {adr_only_count + len(exact_matches):,} ADR addresses")
|