- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
475 lines
19 KiB
Python
475 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Unify All GLAM Datasets - Comprehensive Global Integration
|
|
|
|
This script unifies all heritage institution datasets from individual countries
|
|
into a single comprehensive global dataset at data/instances/all/
|
|
|
|
Features:
|
|
- Merges all country-specific YAML files
|
|
- Deduplicates by ID and coordinates
|
|
- Tracks data provenance by country
|
|
- Generates comprehensive statistics
|
|
- Identifies records needing enrichment (missing Q-numbers, coordinates, etc.)
|
|
|
|
Country Sources:
|
|
- Brazil: brazilian_institutions_batch6_enriched.yaml (115 institutions)
|
|
- Chile: chilean_institutions_batch19_enriched.yaml (90 institutions, 78.9% Wikidata)
|
|
- Mexico: mexican_institutions_geocoded.yaml (117 institutions)
|
|
- Japan: jp_institutions_resolved.yaml (12,065 institutions)
|
|
- Libya: libyan_institutions.yaml (54 institutions)
|
|
- Tunisia: tunisian_institutions.yaml (42 institutions)
|
|
- Algeria: algerian_institutions.yaml (20 institutions)
|
|
- Vietnam: vietnamese_glam_institutions.yaml (21 institutions)
|
|
- Georgia: georgia_glam_institutions.yaml (14 institutions)
|
|
- Global: global_heritage_institutions_merged.yaml (13,396 institutions)
|
|
|
|
Output:
|
|
- data/instances/all/globalglam-20251111.yaml
|
|
- data/instances/all/UNIFICATION_REPORT.md
|
|
- data/instances/all/ENRICHMENT_CANDIDATES.yaml (records needing enrichment)
|
|
"""
|
|
|
|
import yaml
|
|
import os
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any
|
|
from collections import defaultdict
|
|
|
|
|
|
def load_yaml_safe(filepath: Path) -> List[Dict]:
    """Load a YAML source file and return its contents as a list of dicts.

    Best-effort loader: I/O and parse failures are reported to stdout and an
    empty list is returned, so one bad source file cannot abort the whole
    unification run.

    Args:
        filepath: Path to the YAML file to load.

    Returns:
        The parsed list of institution dicts, or [] when the file is empty,
        unreadable, malformed, or does not contain a top-level list.
    """
    print(f"Loading: {filepath.name}")
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
    # Narrowed from a bare `except Exception`: only I/O and YAML parse
    # failures are expected here; anything else should surface as a bug.
    except (OSError, yaml.YAMLError) as e:
        print(f"  ❌ Error loading {filepath.name}: {e}")
        return []
    if not data:
        print(f"  ⚠️ Empty file: {filepath.name}")
        return []
    if isinstance(data, list):
        print(f"  ✅ Loaded {len(data)} institutions")
        return data
    # e.g. a top-level mapping instead of the expected list of institutions
    print(f"  ⚠️ Unexpected format (not a list): {type(data)}")
    return []
|
|
|
|
|
|
def get_country_code(inst: Dict) -> str:
    """Return the institution's country code, or 'UNKNOWN'.

    Prefers an explicit ``country`` value on any of the record's locations;
    otherwise falls back to the second-to-last path segment of the record's
    ``id`` when that segment looks like a two-letter code
    (e.g. ".../br/some-museum" -> "BR").
    """
    for location in inst.get('locations') or []:
        country = location.get('country')
        if country:
            return country
    # No usable location data — try to infer the code from the ID path.
    if 'id' in inst:
        segments = inst['id'].split('/')
        if len(segments) >= 2 and len(segments[-2]) == 2:
            return segments[-2].upper()
    return 'UNKNOWN'
|
|
|
|
|
|
def has_wikidata(inst: Dict) -> bool:
    """Return True if the institution carries a Wikidata identifier.

    Looks for any entry in ``inst['identifiers']`` whose
    ``identifier_scheme`` equals 'Wikidata'. Records with a missing or
    empty identifier list report False.
    """
    identifiers = inst.get('identifiers')
    if not identifiers:
        return False
    # `ident`, not `id`, to avoid shadowing the builtin.
    return any(
        ident.get('identifier_scheme') == 'Wikidata'
        for ident in identifiers
    )
|
|
|
|
|
|
def has_coordinates(inst: Dict) -> bool:
    """Return True when at least one location carries both a latitude
    and a longitude (i.e. the record has been geocoded)."""
    locations = inst.get('locations')
    if not locations:
        return False
    for location in locations:
        if location.get('latitude') is not None and location.get('longitude') is not None:
            return True
    return False
|
|
|
|
|
|
def needs_enrichment(inst: Dict) -> Dict[str, bool]:
    """Report which enrichment steps an institution still needs.

    Returns:
        Mapping of enrichment type -> True when that datum is missing:
          - 'wikidata':    no Wikidata identifier
          - 'coordinates': no geocoded location
          - 'website':     no 'Website' identifier entry
          - 'description': description absent or shorter than 50 characters
    """
    # `or []` also covers an explicit None value under the key, which the
    # previous ternary guard handled; any() over [] is False, so a record
    # with no identifiers correctly reports that it needs a website.
    identifiers = inst.get('identifiers') or []
    description = inst.get('description') or ''
    return {
        'wikidata': not has_wikidata(inst),
        'coordinates': not has_coordinates(inst),
        'website': not any(
            ident.get('identifier_scheme') == 'Website'
            for ident in identifiers
        ),
        # Descriptions under 50 characters are treated as effectively missing.
        'description': len(description) < 50,
    }
|
|
|
|
|
|
def main() -> None:
    """Run the full dataset-unification workflow.

    Steps:
      1. Load every country-specific YAML source listed in ``sources``.
      2. Stamp each record's provenance with its source name and load time.
      3. Deduplicate records by ID, preferring Wikidata-enriched duplicates.
      4. Compute coverage and enrichment statistics per source and country.
      5. Write four artifacts into data/instances/all/: the unified dataset,
         the enrichment-candidate list, a markdown report, and a
         statistics YAML.

    NOTE(review): ``base_dir`` is hard-coded to a local checkout path —
    confirm before running elsewhere.
    """
    base_dir = Path('/Users/kempersc/apps/glam/data/instances')
    output_dir = base_dir / 'all'
    output_dir.mkdir(exist_ok=True)

    print("\n" + "="*80)
    print("GLAM Dataset Unification - Global Integration")
    print("="*80 + "\n")

    # Define data sources (most recent files for each country)
    sources = {
        'chile': base_dir / 'chile' / 'chilean_institutions_batch19_enriched.yaml',
        'brazil': base_dir / 'brazil' / 'brazilian_institutions_batch6_enriched.yaml',
        'mexico': base_dir / 'mexico' / 'mexican_institutions_geocoded.yaml',
        'japan': base_dir / 'japan' / 'jp_institutions_resolved.yaml',
        'libya': base_dir / 'libya' / 'libyan_institutions.yaml',
        'tunisia': base_dir / 'tunisia' / 'tunisian_institutions.yaml',
        'algeria': base_dir / 'algeria' / 'algerian_institutions.yaml',
        'vietnam': base_dir / 'vietnamese_glam_institutions.yaml',
        'georgia': base_dir / 'georgia_glam_institutions.yaml',
        'historical': base_dir / 'historical_institutions_validation.yaml',
        'global': base_dir / 'global' / 'global_heritage_institutions_merged.yaml',
    }

    # Load all datasets
    all_institutions = []
    source_stats = {}

    for source_name, filepath in sources.items():
        # Missing source files are skipped, not fatal.
        if not filepath.exists():
            print(f"⚠️ Skipping {source_name}: file not found")
            continue

        institutions = load_yaml_safe(filepath)

        # Add source tracking to provenance
        for inst in institutions:
            if 'provenance' not in inst:
                inst['provenance'] = {}
            inst['provenance']['unification_source'] = source_name
            inst['provenance']['unification_date'] = datetime.now(timezone.utc).isoformat()

        all_institutions.extend(institutions)

        # Calculate statistics
        source_stats[source_name] = {
            'total': len(institutions),
            'with_wikidata': sum(1 for i in institutions if has_wikidata(i)),
            'with_coordinates': sum(1 for i in institutions if has_coordinates(i)),
        }

    print(f"\n📊 Total institutions loaded: {len(all_institutions)}")

    # Deduplicate by ID
    print("\n🔍 Deduplicating by ID...")
    seen_ids = {}  # Maps ID -> (institution, source_name)
    duplicates = []
    unique_institutions = []

    for inst in all_institutions:
        inst_id = inst.get('id')
        # Records without an ID cannot be deduplicated; keep them all.
        if not inst_id:
            unique_institutions.append(inst)
            continue

        # Provenance was stamped during loading above, so this key exists.
        source = inst['provenance'].get('unification_source', 'unknown')

        if inst_id in seen_ids:
            existing_inst, existing_source = seen_ids[inst_id]
            duplicates.append({
                'id': inst_id,
                'sources': [existing_source, source]
            })
            # Keep the one with more data (prioritize those with Wikidata)
            if has_wikidata(inst) and not has_wikidata(existing_inst):
                # Replace with more enriched version
                unique_institutions = [i for i in unique_institutions if i.get('id') != inst_id]
                unique_institutions.append(inst)
                seen_ids[inst_id] = (inst, source)
        else:
            # First time this ID is seen: register it and keep the record.
            seen_ids[inst_id] = (inst, source)
            unique_institutions.append(inst)

    print(f"  ✅ Unique institutions: {len(unique_institutions)}")
    print(f"  ⚠️ Duplicates removed: {len(duplicates)}")

    # Calculate enrichment statistics
    print("\n📈 Calculating enrichment statistics...")
    enrichment_stats = {
        'total': len(unique_institutions),
        'with_wikidata': sum(1 for i in unique_institutions if has_wikidata(i)),
        'with_coordinates': sum(1 for i in unique_institutions if has_coordinates(i)),
        'needs_wikidata': sum(1 for i in unique_institutions if needs_enrichment(i)['wikidata']),
        'needs_coordinates': sum(1 for i in unique_institutions if needs_enrichment(i)['coordinates']),
        'needs_website': sum(1 for i in unique_institutions if needs_enrichment(i)['website']),
        'needs_description': sum(1 for i in unique_institutions if needs_enrichment(i)['description']),
    }

    # Group by country
    by_country = defaultdict(list)
    for inst in unique_institutions:
        country = get_country_code(inst)
        by_country[country].append(inst)

    print(f"\n🌍 Countries covered: {len(by_country)}")
    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        print(f"  {country}: {len(insts)} institutions ({wikidata_count}/{len(insts)} = {wikidata_pct:.1f}% Wikidata)")

    # Identify enrichment candidates
    print("\n🎯 Identifying enrichment candidates...")
    enrichment_candidates = []
    for inst in unique_institutions:
        needs = needs_enrichment(inst)
        if any(needs.values()):
            enrichment_candidates.append({
                'id': inst.get('id'),
                'name': inst.get('name'),
                'country': get_country_code(inst),
                'institution_type': inst.get('institution_type'),
                'needs': needs,
                'priority_score': sum(needs.values())  # Higher = more needs
            })

    # Sort by priority
    enrichment_candidates.sort(key=lambda x: x['priority_score'], reverse=True)

    print(f"  🔍 Found {len(enrichment_candidates)} institutions needing enrichment")
    print(f"    - Need Wikidata: {enrichment_stats['needs_wikidata']}")
    print(f"    - Need coordinates: {enrichment_stats['needs_coordinates']}")
    print(f"    - Need website: {enrichment_stats['needs_website']}")
    print(f"    - Need description: {enrichment_stats['needs_description']}")

    # Save unified dataset
    output_file = output_dir / 'globalglam-20251111.yaml'
    print(f"\n💾 Saving unified dataset to: {output_file.name}")
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(unique_institutions, f, allow_unicode=True, sort_keys=False, width=120)
    print(f"  ✅ Saved {len(unique_institutions)} institutions")

    # Save enrichment candidates
    candidates_file = output_dir / 'ENRICHMENT_CANDIDATES.yaml'
    print(f"\n💾 Saving enrichment candidates to: {candidates_file.name}")
    with open(candidates_file, 'w', encoding='utf-8') as f:
        yaml.dump(enrichment_candidates, f, allow_unicode=True, sort_keys=False)
    print(f"  ✅ Saved {len(enrichment_candidates)} candidates")

    # Generate unification report
    report_file = output_dir / 'UNIFICATION_REPORT.md'
    print(f"\n📄 Generating unification report: {report_file.name}")

    # The report is assembled incrementally as a markdown string below.
    report = f"""# GLAM Dataset Unification Report

**Generated**: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}

## Executive Summary

- **Total Institutions**: {len(unique_institutions):,}
- **Countries Covered**: {len(by_country)}
- **Wikidata Coverage**: {enrichment_stats['with_wikidata']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Geocoding Coverage**: {enrichment_stats['with_coordinates']:,}/{enrichment_stats['total']:,} ({enrichment_stats['with_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Duplicates Removed**: {len(duplicates)}

## Data Sources

"""

    for source_name, stats in sorted(source_stats.items()):
        wikidata_pct = (stats['with_wikidata'] / stats['total'] * 100) if stats['total'] > 0 else 0
        geocode_pct = (stats['with_coordinates'] / stats['total'] * 100) if stats['total'] > 0 else 0
        report += f"""### {source_name.title()}
- Total: {stats['total']:,} institutions
- Wikidata: {stats['with_wikidata']:,} ({wikidata_pct:.1f}%)
- Geocoded: {stats['with_coordinates']:,} ({geocode_pct:.1f}%)

"""

    report += f"""## Coverage by Country

| Country | Total | Wikidata | Wikidata % | Geocoded | Geocoded % |
|---------|-------|----------|------------|----------|------------|
"""

    for country, insts in sorted(by_country.items(), key=lambda x: len(x[1]), reverse=True):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        geocode_pct = (geocode_count / len(insts) * 100) if insts else 0

        report += f"| {country} | {len(insts):,} | {wikidata_count:,} | {wikidata_pct:.1f}% | {geocode_count:,} | {geocode_pct:.1f}% |\n"

    report += f"""
## Enrichment Needs

Total institutions requiring enrichment: **{len(enrichment_candidates):,}** ({len(enrichment_candidates)/len(unique_institutions)*100:.1f}% of dataset)

### By Enrichment Type

- **Need Wikidata**: {enrichment_stats['needs_wikidata']:,} ({enrichment_stats['needs_wikidata']/enrichment_stats['total']*100:.1f}%)
- **Need Coordinates**: {enrichment_stats['needs_coordinates']:,} ({enrichment_stats['needs_coordinates']/enrichment_stats['total']*100:.1f}%)
- **Need Website**: {enrichment_stats['needs_website']:,} ({enrichment_stats['needs_website']/enrichment_stats['total']*100:.1f}%)
- **Need Description**: {enrichment_stats['needs_description']:,} ({enrichment_stats['needs_description']/enrichment_stats['total']*100:.1f}%)

### Priority Distribution (by number of missing fields)

"""

    # Histogram: priority score (count of missing fields) -> institution count.
    priority_dist = defaultdict(int)
    for candidate in enrichment_candidates:
        priority_dist[candidate['priority_score']] += 1

    for priority in sorted(priority_dist.keys(), reverse=True):
        count = priority_dist[priority]
        report += f"- **Priority {priority}** ({priority} missing fields): {count:,} institutions\n"

    report += f"""
## Top 50 Enrichment Candidates (Highest Priority)

| Name | Country | Type | Missing Fields |
|------|---------|------|----------------|
"""

    for candidate in enrichment_candidates[:50]:
        missing = ', '.join([k for k, v in candidate['needs'].items() if v])
        # Truncate long names so the markdown table stays readable.
        name_short = candidate['name'][:60] + '...' if len(candidate['name']) > 60 else candidate['name']
        report += f"| {name_short} | {candidate['country']} | {candidate['institution_type']} | {missing} |\n"

    report += f"""
## Deduplication Details

### Duplicates Found

Total duplicate IDs: {len(duplicates)}

"""

    if duplicates:
        report += "| ID | Sources |\n|----|---------|\n"
        for dup in duplicates[:20]:  # Show first 20
            sources_str = ', '.join(dup['sources'])
            # Keep only the tail of long IDs so the table stays narrow.
            id_short = dup['id'][-50:] if len(dup['id']) > 50 else dup['id']
            report += f"| ...{id_short} | {sources_str} |\n"

        if len(duplicates) > 20:
            report += f"\n*...and {len(duplicates) - 20} more duplicates*\n"

    report += f"""
## Next Steps

### Immediate Actions

1. **Review Enrichment Candidates**: Check `ENRICHMENT_CANDIDATES.yaml` for institutions needing data
2. **Prioritize Countries**: Focus on countries with low Wikidata coverage:
"""

    # Find countries with lowest Wikidata coverage
    country_coverage = []
    for country, insts in by_country.items():
        if country == 'UNKNOWN':
            continue
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        wikidata_pct = (wikidata_count / len(insts) * 100) if insts else 0
        country_coverage.append((country, wikidata_pct, len(insts)))

    country_coverage.sort(key=lambda x: x[1])  # Sort by coverage ascending

    for country, pct, count in country_coverage[:10]:
        report += f"   - {country}: {pct:.1f}% coverage ({count} institutions)\n"

    report += f"""
3. **Batch Enrichment Workflow**:
   - Run Wikidata enrichment for high-priority candidates
   - Run geocoding for missing coordinates
   - Crawl institutional websites for missing data

### Tools Available

- **Wikidata Enrichment**: `scripts/enrich_global_batch.py`
- **Geocoding**: `scripts/geocode_institutions.py`
- **Website Crawling**: `scripts/crawl_institution_websites.py` (to be created)

## Files Generated

1. **globalglam-20251111.yaml** - Complete unified dataset ({len(unique_institutions):,} institutions)
2. **ENRICHMENT_CANDIDATES.yaml** - Institutions needing enrichment ({len(enrichment_candidates):,} candidates)
3. **UNIFICATION_REPORT.md** - This report

---

**Generated by**: `scripts/unify_all_datasets.py`
**Dataset Version**: 1.0
**Schema Version**: LinkML v0.2.1
"""

    with open(report_file, 'w', encoding='utf-8') as f:
        f.write(report)

    print(f"  ✅ Report saved")

    # Update DATASET_STATISTICS.yaml
    stats_file = output_dir / 'DATASET_STATISTICS.yaml'
    print(f"\n📊 Updating statistics file: {stats_file.name}")

    stats_data = {
        'generated': datetime.now(timezone.utc).isoformat(),
        'project': 'GLAM Data Extraction',
        'schema_version': 'v0.2.1',
        'unified_dataset': {
            'total_institutions': len(unique_institutions),
            'countries_covered': len(by_country),
            'wikidata_coverage': {
                'count': enrichment_stats['with_wikidata'],
                'percentage': round(enrichment_stats['with_wikidata']/enrichment_stats['total']*100, 2)
            },
            'geocoding_coverage': {
                'count': enrichment_stats['with_coordinates'],
                'percentage': round(enrichment_stats['with_coordinates']/enrichment_stats['total']*100, 2)
            },
            'enrichment_needs': {
                'total_candidates': len(enrichment_candidates),
                'needs_wikidata': enrichment_stats['needs_wikidata'],
                'needs_coordinates': enrichment_stats['needs_coordinates'],
                'needs_website': enrichment_stats['needs_website'],
                'needs_description': enrichment_stats['needs_description'],
            }
        },
        'by_country': {}
    }

    # Per-country coverage breakdown for the statistics file.
    for country, insts in sorted(by_country.items()):
        wikidata_count = sum(1 for i in insts if has_wikidata(i))
        geocode_count = sum(1 for i in insts if has_coordinates(i))
        stats_data['by_country'][country] = {
            'total': len(insts),
            'wikidata_coverage': {
                'count': wikidata_count,
                'percentage': round(wikidata_count/len(insts)*100, 2) if insts else 0
            },
            'geocoding_coverage': {
                'count': geocode_count,
                'percentage': round(geocode_count/len(insts)*100, 2) if insts else 0
            }
        }

    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats_data, f, allow_unicode=True, sort_keys=False)

    print(f"  ✅ Statistics updated")

    print("\n" + "="*80)
    print("✅ UNIFICATION COMPLETE!")
    print("="*80)
    print(f"\n📁 Output files in: {output_dir}/")
    print(f"  - globalglam-20251111.yaml ({len(unique_institutions):,} institutions)")
    print(f"  - ENRICHMENT_CANDIDATES.yaml ({len(enrichment_candidates):,} candidates)")
    print(f"  - UNIFICATION_REPORT.md")
    print(f"  - DATASET_STATISTICS.yaml")
    print(f"\n🎯 Ready for global enrichment workflow!")
    print(f"  Next: Run enrichment on {enrichment_stats['needs_wikidata']:,} institutions without Wikidata")
|
|
|
|
|
|
# Script entry point: run the unification only when executed directly,
# so importing this module has no side effects.
if __name__ == '__main__':
    main()
|