glam/scripts/unify_all_datasets.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

475 lines
19 KiB
Python

#!/usr/bin/env python3
"""
Unify All GLAM Datasets - Comprehensive Global Integration
This script unifies all heritage institution datasets from individual countries
into a single comprehensive global dataset at data/instances/all/
Features:
- Merges all country-specific YAML files
- Deduplicates by ID and coordinates
- Tracks data provenance by country
- Generates comprehensive statistics
- Identifies records needing enrichment (missing Q-numbers, coordinates, etc.)
Country Sources:
- Brazil: brazilian_institutions_batch6_enriched.yaml (115 institutions)
- Chile: chilean_institutions_batch19_enriched.yaml (90 institutions, 78.9% Wikidata)
- Mexico: mexican_institutions_geocoded.yaml (117 institutions)
- Japan: jp_institutions_resolved.yaml (12,065 institutions)
- Libya: libyan_institutions.yaml (54 institutions)
- Tunisia: tunisian_institutions.yaml (42 institutions)
- Algeria: algerian_institutions.yaml (20 institutions)
- Vietnam: vietnamese_glam_institutions.yaml (21 institutions)
- Georgia: georgia_glam_institutions.yaml (14 institutions)
- Global: global_heritage_institutions_merged.yaml (13,396 institutions)
Output:
- data/instances/all/globalglam-20251111.yaml
- data/instances/all/UNIFICATION_REPORT.md
- data/instances/all/ENRICHMENT_CANDIDATES.yaml (records needing enrichment)
"""
import yaml
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any
from collections import defaultdict
def load_yaml_safe(filepath: Path) -> List[Dict]:
    """Load a YAML dataset file and return its top-level list of institutions.

    Prints progress/diagnostic messages as it goes. Returns an empty list
    when the file is unreadable, malformed YAML, empty, or does not contain
    a top-level list — callers can therefore always iterate the result.

    Args:
        filepath: Path to the YAML file to load.

    Returns:
        The parsed list of institution dicts, or [] on any failure.
    """
    print(f"Loading: {filepath.name}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    # Catch only the expected failure modes (I/O errors and YAML syntax
    # errors) instead of a blanket Exception that would hide real bugs.
    except (OSError, yaml.YAMLError) as e:
        print(f" ❌ Error loading {filepath.name}: {e}")
        return []
    if not data:
        print(f" ⚠️ Empty file: {filepath.name}")
        return []
    if isinstance(data, list):
        print(f" ✅ Loaded {len(data)} institutions")
        return data
    print(f" ⚠️ Unexpected format (not a list): {type(data)}")
    return []
def get_country_code(inst: Dict) -> str:
    """Best-effort lookup of an institution's country code.

    Checks each location record for an explicit ``country`` value first,
    then falls back to parsing a two-letter segment out of the institution
    ID path (e.g. ``.../br/museum-x`` -> ``'BR'``). Returns ``'UNKNOWN'``
    when neither source yields a code.
    """
    for loc in inst.get('locations') or []:
        if 'country' in loc and loc['country']:
            return loc['country']
    # Fall back to the ID path: the second-to-last segment is the
    # country slug when it is exactly two characters long.
    if 'id' in inst:
        segments = inst['id'].split('/')
        if len(segments) >= 2:
            candidate = segments[-2]
            if len(candidate) == 2:
                return candidate.upper()
    return 'UNKNOWN'
def has_wikidata(inst: Dict) -> bool:
    """Return True if the institution carries a Wikidata identifier.

    Looks for any entry in ``inst['identifiers']`` whose
    ``identifier_scheme`` is the literal string ``'Wikidata'``.
    Missing or empty identifier lists yield False.
    """
    identifiers = inst.get('identifiers')
    if not identifiers:
        return False
    # 'ident' rather than 'id' so we don't shadow the builtin id().
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in identifiers
    )
def has_coordinates(inst: Dict) -> bool:
    """Report whether any of the institution's locations is geocoded.

    A location counts as geocoded when both ``latitude`` and
    ``longitude`` are present and not None (zero is a valid coordinate).
    """
    locations = inst.get('locations')
    if not locations:
        return False
    for loc in locations:
        if loc.get('latitude') is not None and loc.get('longitude') is not None:
            return True
    return False
def needs_enrichment(inst: Dict) -> Dict[str, bool]:
    """Identify what enrichment an institution needs.

    Returns a dict with one boolean flag per enrichment type:
    - 'wikidata': no Wikidata identifier present
    - 'coordinates': no geocoded location present
    - 'website': no identifier with scheme 'Website'
    - 'description': description missing, empty, or shorter than 50 chars
    """
    identifiers = inst.get('identifiers') or []
    description = inst.get('description') or ''
    return {
        'wikidata': not has_wikidata(inst),
        'coordinates': not has_coordinates(inst),
        # any() over an empty list is already False, so the previous
        # "... if inst.get('identifiers') else True" ternary was redundant.
        # 'ident' rather than 'id' avoids shadowing the builtin.
        'website': not any(
            ident.get('identifier_scheme') == 'Website'
            for ident in identifiers
        ),
        'description': len(description) < 50,
    }
def main():
    """Main unification workflow.

    Loads every country-specific YAML dataset, tags each record's
    provenance with its unification source, deduplicates by institution
    ID (preferring Wikidata-enriched records), computes coverage and
    enrichment statistics, and writes four outputs to data/instances/all/:
    the unified dataset, an enrichment-candidates file, a Markdown
    report, and a statistics YAML file.
    """
    # NOTE(review): hard-coded user-specific absolute path — consider
    # deriving from the repo root or an environment variable. TODO confirm.
    base_dir = Path('/Users/kempersc/apps/glam/data/instances')
    output_dir = base_dir / 'all'
    output_dir.mkdir(exist_ok=True)
    print("\n" + "="*80)
    print("GLAM Dataset Unification - Global Integration")
    print("="*80 + "\n")
    # Define data sources (most recent files for each country)
    sources = {
        'chile': base_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        'brazil': base_dir / 'brazil' / 'brazilian_institutions_batch6_enriched.yaml',
        'mexico': base_dir / 'mexico' / 'mexican_institutions_geocoded.yaml',
        'japan': base_dir / 'japan' / 'jp_institutions_resolved.yaml',
        'libya': base_dir / 'libya' / 'libyan_institutions.yaml',
        'tunisia': base_dir / 'tunisia' / 'tunisian_institutions.yaml',
        'algeria': base_dir / 'algeria' / 'algerian_institutions.yaml',
        # NOTE(review): vietnam/georgia/historical live directly under
        # base_dir rather than a country subfolder — presumably intentional.
        'vietnam': base_dir / 'vietnamese_glam_institutions.yaml',
        'georgia': base_dir / 'georgia_glam_institutions.yaml',
        'historical': base_dir / 'historical_institutions_validation.yaml',
        'global': base_dir / 'global' / 'global_heritage_institutions_merged.yaml',
    }
    # Load all datasets; missing files are skipped rather than fatal.
    all_institutions = []
    source_stats = {}
    for source_name, filepath in sources.items():
        if not filepath.exists():
            print(f"⚠️ Skipping {source_name}: file not found")
            continue
        institutions = load_yaml_safe(filepath)
        # Add source tracking to provenance (mutates loaded records in place)
        for inst in institutions:
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['unification_source'] = source_name
            inst['provenance']['unification_date'] = datetime.now(timezone.utc).isoformat()
        all_institutions.extend(institutions)
        # Calculate per-source statistics
        source_stats[source_name] = {
            'total': len(institutions),
            'with_wikidata': sum(1 for i in institutions if has_wikidata(i)),
            'with_coordinates': sum(1 for i in institutions if has_coordinates(i)),
        }
    print(f"\n📊 Total institutions loaded: {len(all_institutions)}")
    # Deduplicate by ID
    print("\n🔍 Deduplicating by ID...")
    seen_ids = {}  # Maps ID -> (institution, source_name)
    duplicates = []
    unique_institutions = []
    for inst in all_institutions:
        inst_id = inst.get('id')
        if not inst_id:
            # Records without an ID cannot be deduplicated; keep them all.
            unique_institutions.append(inst)
            continue
        source = inst['provenance'].get('unification_source', 'unknown')
        if inst_id in seen_ids:
            existing_inst, existing_source = seen_ids[inst_id]
            duplicates.append({
                'id': inst_id,
                'sources': [existing_source, source]
            })
            # Keep the one with more data (prioritize those with Wikidata)
            if has_wikidata(inst) and not has_wikidata(existing_inst):
                # Replace with more enriched version.
                # NOTE(review): this list rebuild is O(n) per replacement —
                # fine at current scale, quadratic in the worst case.
                unique_institutions = [i for i in unique_institutions if i.get('id') != inst_id]
                unique_institutions.append(inst)
                seen_ids[inst_id] = (inst, source)
        else:
            seen_ids[inst_id] = (inst, source)
            unique_institutions.append(inst)
    print(f" ✅ Unique institutions: {len(unique_institutions)}")
    print(f" ⚠️ Duplicates removed: {len(duplicates)}")
    # Calculate enrichment statistics over the deduplicated set
    print("\n📈 Calculating enrichment statistics...")
    enrichment_stats = {
        'total': len(unique_institutions),
        'with_wikidata': sum(1 for i in unique_institutions if has_wikidata(i)),
        'with_coordinates': sum(1 for i in unique_institutions if has_coordinates(i)),
        'needs_wikidata': sum(1 for i in unique_institutions if needs_enrichment(i)['wikidata']),
        'needs_coordinates': sum(1 for i in unique_institutions if needs_enrichment(i)['coordinates']),
        'needs_website': sum(1 for i in unique_institutions if needs_enrichment(i)['website']),
        'needs_description': sum(1 for i in unique_institutions if needs_enrichment(i)['description']),
    }
    # Group by country
    by_country = defaultdict(list)
    for inst in unique_institutions:
        country = get_country_code(inst)
        by_country[country].append(inst)
    print(f"\n🌍 Countries covered: {len(by_country)}")
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        print(f" {country}: {len(insts)} institutions ({wikidata_count}/{len(insts)} = {wikidata_pct:.1f}% Wikidata)")
    # Identify enrichment candidates (any record missing at least one field)
    print("\n🎯 Identifying enrichment candidates...")
    enrichment_candidates = []
    for inst in unique_institutions:
        needs = needs_enrichment(inst)
        if any(needs.values()):
            enrichment_candidates.append({
                'id': inst.get('id'),
                'name': inst.get('name'),
                'country': get_country_code(inst),
                'institution_type': inst.get('institution_type'),
                'needs': needs,
                'priority_score': sum(needs.values())  # Higher = more needs
            })
    # Sort by priority (most missing fields first)
    enrichment_candidates.sort(key=lambda x: x['priority_score'], reverse=True)
    print(f" 🔍 Found {len(enrichment_candidates)} institutions needing enrichment")
    print(f" - Need Wikidata: {enrichment_stats['needs_wikidata']}")
    print(f" - Need coordinates: {enrichment_stats['needs_coordinates']}")
    print(f" - Need website: {enrichment_stats['needs_website']}")
    print(f" - Need description: {enrichment_stats['needs_description']}")
    # Save unified dataset
    output_file = output_dir / 'globalglam-20251111.yaml'
    print(f"\n💾 Saving unified dataset to: {output_file.name}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(unique_institutions, f, allow_unicode=True, sort_keys=False, width=120)
    print(f" ✅ Saved {len(unique_institutions)} institutions")
    # Save enrichment candidates
    candidates_file = output_dir / 'ENRICHMENT_CANDIDATES.yaml'
    print(f"\n💾 Saving enrichment candidates to: {candidates_file.name}")
    with open(candidates_file, 'w', encoding='utf-8') as f:
        yaml.dump(enrichment_candidates, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Saved {len(enrichment_candidates)} candidates")
    # Generate unification report (Markdown, built incrementally)
    report_file = output_dir / 'UNIFICATION_REPORT.md'
    print(f"\n📄 Generating unification report: {report_file.name}")
    report = f"""# GLAM Dataset Unification Report
**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}
## Executive Summary
- **Total Institutions**: {len(unique_institutions):,}
- **Countries Covered**: {len(by_country)}
- **Wikidata Coverage**: {enrichment_stats['with_wikidata']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Geocoding Coverage**: {enrichment_stats['with_coordinates']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Duplicates Removed**: {len(duplicates)}
## Data Sources
"""
    # One subsection per source, alphabetical
    for source_name, stats in sorted(source_stats.items()):
        wikidata_pct = (stats['with_wikidata'] / stats['total'] * 100) if stats['total'] > 0 else 0
        geocode_pct = (stats['with_coordinates'] / stats['total'] * 100) if stats['total'] > 0 else 0
        report += f"""### {source_name.title()}
- Total: {stats['total']:,} institutions
- Wikidata: {stats['with_wikidata']:,} ({wikidata_pct:.1f}%)
- Geocoded: {stats['with_coordinates']:,} ({geocode_pct:.1f}%)
"""
    report += f"""## Coverage by Country
| Country | Total | Wikidata | Wikidata % | Geocoded | Geocoded % |
|---------|-------|----------|------------|----------|------------|
"""
    # Country table, largest first
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        geocode_pct = (geocode_count / len(insts) * 100) if insts else 0
        report += f"| {country} | {len(insts):,} | {wikidata_count:,} | {wikidata_pct:.1f}% | {geocode_count:,} | {geocode_pct:.1f}% |\n"
    report += f"""
## Enrichment Needs
Total institutions requiring enrichment: **{len(enrichment_candidates):,}** ({len(enrichment_candidates)/len(unique_institutions)*100:.1f}% of dataset)
### By Enrichment Type
- **Need Wikidata**: {enrichment_stats['needs_wikidata']:,} ({enrichment_stats['needs_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Need Coordinates**: {enrichment_stats['needs_coordinates']:,} ({enrichment_stats['needs_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Need Website**: {enrichment_stats['needs_website']:,} ({enrichment_stats['needs_website']/enrichment_stats['total']*100:.1f}%)
- **Need Description**: {enrichment_stats['needs_description']:,} ({enrichment_stats['needs_description']/enrichment_stats['total']*100:.1f}%)
### Priority Distribution (by number of missing fields)
"""
    # Histogram of how many records miss 1, 2, 3, 4 fields
    priority_dist = defaultdict(int)
    for candidate in enrichment_candidates:
        priority_dist[candidate['priority_score']] += 1
    for priority in sorted(priority_dist.keys(), reverse=True):
        count = priority_dist[priority]
        report += f"- **Priority {priority}** ({priority} missing fields): {count:,} institutions\n"
    report += f"""
## Top 50 Enrichment Candidates (Highest Priority)
| Name | Country | Type | Missing Fields |
|------|---------|------|----------------|
"""
    for candidate in enrichment_candidates[:50]:
        missing = ', '.join([k for k, v in candidate['needs'].items() if v])
        # Truncate long names so the table stays readable
        name_short = candidate['name'][:60] + '...' if len(candidate['name']) > 60 else candidate['name']
        report += f"| {name_short} | {candidate['country']} | {candidate['institution_type']} | {missing} |\n"
    report += f"""
## Deduplication Details
### Duplicates Found
Total duplicate IDs: {len(duplicates)}
"""
    if duplicates:
        report += "| ID | Sources |\n|----|---------|\n"
        for dup in duplicates[:20]:  # Show first 20
            sources_str = ', '.join(dup['sources'])
            # Keep only the trailing 50 chars of long IDs
            id_short = dup['id'][-50:] if len(dup['id']) > 50 else dup['id']
            report += f"| ...{id_short} | {sources_str} |\n"
        if len(duplicates) > 20:
            report += f"\n*...and {len(duplicates) - 20} more duplicates*\n"
    report += f"""
## Next Steps
### Immediate Actions
1. **Review Enrichment Candidates**: Check `ENRICHMENT_CANDIDATES.yaml` for institutions needing data
2. **Prioritize Countries**: Focus on countries with low Wikidata coverage:
"""
    # Find countries with lowest Wikidata coverage
    country_coverage = []
    for country, insts in by_country.items():
        if country == 'UNKNOWN':
            continue
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        country_coverage.append((country, wikidata_pct, len(insts)))
    country_coverage.sort(key=lambda x: x[1])  # Sort by coverage ascending
    for country, pct, count in country_coverage[:10]:
        report += f" - {country}: {pct:.1f}% coverage ({count} institutions)\n"
    report += f"""
3. **Batch Enrichment Workflow**:
- Run Wikidata enrichment for high-priority candidates
- Run geocoding for missing coordinates
- Crawl institutional websites for missing data
### Tools Available
- **Wikidata Enrichment**: `scripts/enrich_global_batch.py`
- **Geocoding**: `scripts/geocode_institutions.py`
- **Website Crawling**: `scripts/crawl_institution_websites.py` (to be created)
## Files Generated
1. **globalglam-20251111.yaml** - Complete unified dataset ({len(unique_institutions):,} institutions)
2. **ENRICHMENT_CANDIDATES.yaml** - Institutions needing enrichment ({len(enrichment_candidates):,} candidates)
3. **UNIFICATION_REPORT.md** - This report
---
**Generated by**: `scripts/unify_all_datasets.py`
**Dataset Version**: 1.0
**Schema Version**: LinkML v0.2.1
"""
    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f" ✅ Report saved")
    # Update DATASET_STATISTICS.yaml (machine-readable summary of the run)
    stats_file = output_dir / 'DATASET_STATISTICS.yaml'
    print(f"\n📊 Updating statistics file: {stats_file.name}")
    stats_data = {
        'generated': datetime.now(timezone.utc).isoformat(),
        'project': 'GLAM Data Extraction',
        'schema_version': 'v0.2.1',
        'unified_dataset': {
            'total_institutions': len(unique_institutions),
            'countries_covered': len(by_country),
            'wikidata_coverage': {
                'count': enrichment_stats['with_wikidata'],
                'percentage': round(enrichment_stats['with_wikidata']/enrichment_stats['total']*100, 2)
            },
            'geocoding_coverage': {
                'count': enrichment_stats['with_coordinates'],
                'percentage': round(enrichment_stats['with_coordinates']/enrichment_stats['total']*100, 2)
            },
            'enrichment_needs': {
                'total_candidates': len(enrichment_candidates),
                'needs_wikidata': enrichment_stats['needs_wikidata'],
                'needs_coordinates': enrichment_stats['needs_coordinates'],
                'needs_website': enrichment_stats['needs_website'],
                'needs_description': enrichment_stats['needs_description'],
            }
        },
        'by_country': {}
    }
    # Per-country coverage, alphabetical for stable YAML diffs
    for country, insts in sorted(by_country.items()):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        stats_data['by_country'][country] = {
            'total': len(insts),
            'wikidata_coverage': {
                'count': wikidata_count,
                'percentage': round(wikidata_count/len(insts)*100, 2) if insts else 0
            },
            'geocoding_coverage': {
                'count': geocode_count,
                'percentage': round(geocode_count/len(insts)*100, 2) if insts else 0
            }
        }
    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats_data, f, allow_unicode=True, sort_keys=False)
    print(f" ✅ Statistics updated")
    # Final console summary
    print("\n" + "="*80)
    print("✅ UNIFICATION COMPLETE!")
    print("="*80)
    print(f"\n📁 Output files in: {output_dir}/")
    print(f" - globalglam-20251111.yaml ({len(unique_institutions):,} institutions)")
    print(f" - ENRICHMENT_CANDIDATES.yaml ({len(enrichment_candidates):,} candidates)")
    print(f" - UNIFICATION_REPORT.md")
    print(f" - DATASET_STATISTICS.yaml")
    print(f"\n🎯 Ready for global enrichment workflow!")
    print(f" Next: Run enrichment on {enrichment_stats['needs_wikidata']:,} institutions without Wikidata")
# Script entry point: run the full unification workflow.
if __name__ == '__main__':
    main()