- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
393 lines · 14 KiB · Python
#!/usr/bin/env python3
|
|
"""
|
|
Combine Brazilian, Chilean, and Mexican GLAM institution datasets into a unified collection.
|
|
|
|
This script:
|
|
1. Loads the three master YAML files
|
|
2. Validates and merges them into a single collection
|
|
3. Generates statistics about the combined dataset
|
|
4. Outputs a unified YAML file
|
|
|
|
Input files:
|
|
- data/instances/brazilian_institutions.yaml (97 institutions)
|
|
- data/instances/chilean_institutions.yaml (90 institutions)
|
|
- data/instances/mexican_institutions.yaml (117 institutions)
|
|
|
|
Output:
|
|
- data/instances/latin_american_institutions.yaml (304 institutions combined)
|
|
- data/instances/latin_american_combination_report.md
|
|
"""
|
|
|
|
import yaml
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from collections import Counter
|
|
from typing import List, Dict, Any
|
|
|
|
# File paths
# Base directory: two levels up from this script file (the script is
# expected to live one directory below the project root).
BASE_DIR = Path(__file__).parent.parent
INSTANCES_DIR = BASE_DIR / "data" / "instances"

# The three national source datasets to merge, keyed by country label
# (keys are also used to look up per-country stats when reporting).
INPUT_FILES = {
    "brazilian": INSTANCES_DIR / "brazilian_institutions.yaml",
    "chilean": INSTANCES_DIR / "chilean_institutions.yaml",
    "mexican": INSTANCES_DIR / "mexican_institutions.yaml"
}

# Outputs: the combined YAML dataset and its markdown combination report.
OUTPUT_FILE = INSTANCES_DIR / "latin_american_institutions.yaml"
REPORT_FILE = INSTANCES_DIR / "latin_american_combination_report.md"
|
|
|
|
|
|
def load_yaml_file(file_path: Path) -> List[Dict[str, Any]]:
    """Read one institutions YAML file and return its list of records.

    Accepts either a bare top-level list, or a mapping that wraps the
    list under an ``institutions`` key; any other document shape is
    rejected with a ValueError.
    """
    print(f"Loading {file_path.name}...")
    with open(file_path, 'r', encoding='utf-8') as handle:
        parsed = yaml.safe_load(handle)

    # A bare list is already in the shape callers expect.
    if isinstance(parsed, list):
        return parsed

    # Some documents carry a metadata header and wrap the records.
    if isinstance(parsed, dict):
        if 'institutions' in parsed:
            return parsed['institutions']
        raise ValueError(f"Unexpected YAML structure in {file_path}")

    raise ValueError(f"Unexpected YAML type in {file_path}: {type(parsed)}")
|
|
|
|
|
|
def get_country_from_id(institution_id: str) -> str:
    """Derive the upper-cased country code from a custodian URI.

    IDs look like ``https://w3id.org/heritage/custodian/br/...``; when
    split on '/' the country segment sits at index 5. Returns
    "UNKNOWN" if the URI is too short to contain that segment.
    """
    segments = institution_id.split('/')
    if len(segments) < 6:
        return "UNKNOWN"
    return segments[5].upper()
|
|
|
|
|
|
def has_coordinates(institution: Dict[str, Any]) -> bool:
    """Return True if any of the institution's locations is geocoded.

    A location counts as geocoded only when BOTH 'latitude' and
    'longitude' are present and not None. A missing, empty, or None
    'locations' field yields False.
    """
    # `or []` keeps the original guard's tolerance of locations=None;
    # any() replaces the manual loop-and-flag idiom.
    return any(
        loc.get('latitude') is not None and loc.get('longitude') is not None
        for loc in (institution.get('locations') or [])
    )
|
|
|
|
|
|
def analyze_institutions(institutions: List[Dict[str, Any]], country_name: str) -> Dict[str, Any]:
    """Compute summary statistics for a list of institution records.

    Returns a dict containing totals, per-field coverage counts,
    Counters for institution types / regions / cities, and derived
    metrics (geocoding rate, unique region/city counts).
    """
    type_counts: Counter = Counter()
    region_counts: Counter = Counter()
    city_counts: Counter = Counter()

    # Tally institution types plus region/city occurrences per location.
    for record in institutions:
        type_counts[record.get('institution_type', 'UNKNOWN')] += 1
        for place in record.get('locations', []):
            region = place.get('region')
            if region:
                region_counts[region] += 1
            city = place.get('city')
            if city:
                city_counts[city] += 1

    total = len(institutions)
    stats: Dict[str, Any] = {
        'country': country_name,
        'total': total,
        'geocoded': sum(1 for record in institutions if has_coordinates(record)),
        'with_identifiers': sum(1 for record in institutions if record.get('identifiers')),
        'with_platforms': sum(1 for record in institutions if record.get('digital_platforms')),
        'with_collections': sum(1 for record in institutions if record.get('collections')),
        'institution_types': type_counts,
        'regions': region_counts,
        'cities': city_counts,
    }

    # Derived metrics; guard against division by zero on an empty input.
    stats['geocoding_rate'] = (stats['geocoded'] / total * 100) if total > 0 else 0
    stats['unique_regions'] = len(region_counts)
    stats['unique_cities'] = len(city_counts)

    return stats
|
|
|
|
|
|
def _print_banner(title: str, leading_newline: bool = False) -> None:
    """Print a '=' framed section header in the script's console style."""
    print(("\n" if leading_newline else "") + "=" * 60)
    print(title)
    print("=" * 60)


def _load_all_inputs():
    """Load every input dataset; return ({country_key: records}, flat list).

    Raises:
        SystemExit: if any input file is missing, so the script exits
            non-zero instead of silently reporting success.
    """
    datasets: Dict[str, List[Dict[str, Any]]] = {}
    all_institutions: List[Dict[str, Any]] = []
    for country_key, file_path in INPUT_FILES.items():
        if not file_path.exists():
            # Fail the pipeline loudly: the original silent `return`
            # produced exit code 0 on a missing input.
            raise SystemExit(f"ERROR: {file_path} not found!")
        institutions = load_yaml_file(file_path)
        datasets[country_key] = institutions
        all_institutions.extend(institutions)
        print(f" Loaded {len(institutions)} institutions from {file_path.name}")
    return datasets, all_institutions


def _print_country_summary(country_name: str, stats: Dict[str, Any]) -> None:
    """Echo one country's statistics to the console."""
    print(f"\n{country_name}:")
    print(f" Total institutions: {stats['total']}")
    print(f" Geocoded: {stats['geocoded']} ({stats['geocoding_rate']:.1f}%)")
    print(f" Unique regions: {stats['unique_regions']}")
    print(f" Unique cities: {stats['unique_cities']}")
    print(f" With identifiers: {stats['with_identifiers']}")
    print(f" With platforms: {stats['with_platforms']}")
    print(" Top 3 institution types:")
    for inst_type, count in stats['institution_types'].most_common(3):
        print(f" - {inst_type}: {count}")


def _print_combined_summary(combined_stats: Dict[str, Any]) -> None:
    """Echo the combined-dataset statistics to the console."""
    print(f"\nTotal institutions: {combined_stats['total']}")
    print(f"Geocoded: {combined_stats['geocoded']} ({combined_stats['geocoding_rate']:.1f}%)")
    print(f"Unique regions: {combined_stats['unique_regions']}")
    print(f"Unique cities: {combined_stats['unique_cities']}")
    print(f"With identifiers: {combined_stats['with_identifiers']}")
    print(f"With digital platforms: {combined_stats['with_platforms']}")
    print(f"With collections: {combined_stats['with_collections']}")
    print("\nInstitution types (all countries):")
    for inst_type, count in combined_stats['institution_types'].most_common():
        print(f" - {inst_type}: {count}")


def _report_duplicate_ids(all_institutions: List[Dict[str, Any]]) -> None:
    """Warn about repeated institution IDs (IDs are expected unique)."""
    id_counter = Counter(inst['id'] for inst in all_institutions if 'id' in inst)
    duplicates = {id_val: count for id_val, count in id_counter.items() if count > 1}
    if duplicates:
        print(f"\nWARNING: Found {len(duplicates)} duplicate IDs:")
        for id_val, count in duplicates.items():
            print(f" - {id_val} (appears {count} times)")
    else:
        print("\n✓ No duplicate IDs found - all institution IDs are unique!")


def _write_combined_yaml(all_institutions, country_stats, combined_stats, timestamp: str) -> None:
    """Write the merged dataset to OUTPUT_FILE with a provenance header."""
    # YAML comment header carrying generation metadata; kept as a raw
    # f-string so the emitted file documents its own provenance.
    header = f"""---
# Latin American GLAM Institutions - Combined Dataset
# Generated: {timestamp}
#
# This file combines institutions from three countries:
# - Brazil: {country_stats['brazilian']['total']} institutions
# - Chile: {country_stats['chilean']['total']} institutions
# - Mexico: {country_stats['mexican']['total']} institutions
#
# Total: {len(all_institutions)} institutions
# Geocoded: {combined_stats['geocoded']} ({combined_stats['geocoding_rate']:.1f}%)
# Unique regions: {combined_stats['unique_regions']}
# Unique cities: {combined_stats['unique_cities']}
#
# Schema: LinkML v0.2.0 (modular)
# Data tier: TIER_4_INFERRED (conversation-extracted)
#
# Source files:
# - brazilian_institutions.yaml
# - chilean_institutions.yaml
# - mexican_institutions.yaml

"""
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write(header)
        yaml.dump(all_institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  indent=2)

    print(f"\n✓ Combined dataset written to: {OUTPUT_FILE}")
    print(f" File size: {OUTPUT_FILE.stat().st_size / 1024:.1f} KB")


def combine_datasets() -> None:
    """Combine the Brazilian, Chilean, and Mexican GLAM datasets.

    Loads the three input YAML files, prints per-country and combined
    statistics, checks for duplicate institution IDs, writes the
    unified YAML dataset, and generates a markdown report.

    Raises:
        SystemExit: if any input file is missing (non-zero exit code).
    """
    _print_banner("Latin American GLAM Institutions Dataset Combination")
    print()

    # Load all three datasets.
    datasets, all_institutions = _load_all_inputs()
    print(f"\nTotal institutions loaded: {len(all_institutions)}")

    # Analyze each dataset individually.
    _print_banner("Individual Dataset Statistics", leading_newline=True)
    country_stats = {}
    for country_key, institutions in datasets.items():
        country_name = country_key.capitalize()
        stats = analyze_institutions(institutions, country_name)
        country_stats[country_key] = stats
        _print_country_summary(country_name, stats)

    # Analyze the combined dataset.
    _print_banner("Combined Dataset Statistics", leading_newline=True)
    combined_stats = analyze_institutions(all_institutions, "Latin America")
    _print_combined_summary(combined_stats)

    # Check for duplicate IDs across all three sources.
    _print_banner("Duplicate Detection", leading_newline=True)
    _report_duplicate_ids(all_institutions)

    # Write the combined YAML file.
    _print_banner("Writing Combined Dataset", leading_newline=True)
    timestamp = datetime.now(timezone.utc).isoformat()
    _write_combined_yaml(all_institutions, country_stats, combined_stats, timestamp)

    # Generate the markdown report.
    _print_banner("Generating Report", leading_newline=True)
    report = generate_report(country_stats, combined_stats, timestamp)
    with open(REPORT_FILE, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"\n✓ Report written to: {REPORT_FILE}")

    _print_banner("Combination Complete!", leading_newline=True)
    print("\nNext steps:")
    print(" 1. Validate combined dataset with LinkML")
    print(" 2. Export to JSON-LD, RDF/Turtle, CSV formats")
    print(" 3. Create geographic visualization (GeoJSON)")
    print()
|
|
|
|
|
def generate_report(country_stats: Dict[str, Dict], combined_stats: Dict, timestamp: str) -> str:
    """Generate a markdown report of the combination process.

    Args:
        country_stats: per-country statistics keyed 'brazilian',
            'chilean', 'mexican', as produced by analyze_institutions.
        combined_stats: statistics for the merged dataset.
        timestamp: ISO-8601 generation timestamp embedded in the report.

    Returns:
        The complete report as a markdown string.

    NOTE: the multiline f-string templates below ARE the report layout;
    blank lines and pipe-table alignment inside them are significant.
    """

    # Header, combined-overview table, and section lead-in.
    report = f"""# Latin American GLAM Institutions - Dataset Combination Report

**Generated**: {timestamp}

## Summary

This report documents the combination of three geocoded GLAM institution datasets from Latin America into a unified collection.

### Combined Dataset Overview

| Metric | Value |
|--------|-------|
| **Total Institutions** | {combined_stats['total']} |
| **Geocoded Institutions** | {combined_stats['geocoded']} ({combined_stats['geocoding_rate']:.1f}%) |
| **Unique Regions** | {combined_stats['unique_regions']} |
| **Unique Cities** | {combined_stats['unique_cities']} |
| **With Identifiers** | {combined_stats['with_identifiers']} |
| **With Digital Platforms** | {combined_stats['with_platforms']} |
| **With Collections** | {combined_stats['with_collections']} |

## Individual Country Contributions

"""

    # Country-by-country breakdown (fixed order matching INPUT_FILES keys)
    for country_key in ['brazilian', 'chilean', 'mexican']:
        stats = country_stats[country_key]
        country_name = stats['country']

        report += f"""### {country_name}

| Metric | Value |
|--------|-------|
| Total Institutions | {stats['total']} |
| Geocoded | {stats['geocoded']} ({stats['geocoding_rate']:.1f}%) |
| Unique Regions | {stats['unique_regions']} |
| Unique Cities | {stats['unique_cities']} |
| With Identifiers | {stats['with_identifiers']} |
| With Digital Platforms | {stats['with_platforms']} |
| With Collections | {stats['with_collections']} |

**Top Institution Types**:
"""
        for inst_type, count in stats['institution_types'].most_common(5):
            report += f"- {inst_type}: {count}\n"

        report += "\n"

    # Combined institution types
    report += f"""## Institution Type Distribution (Combined)

| Type | Count | Percentage |
|------|-------|------------|
"""

    for inst_type, count in combined_stats['institution_types'].most_common():
        percentage = (count / combined_stats['total'] * 100)
        report += f"| {inst_type} | {count} | {percentage:.1f}% |\n"

    # Geographic coverage
    report += f"""
## Geographic Coverage

The combined dataset covers:
- **{combined_stats['unique_regions']} regions** across 3 countries
- **{combined_stats['unique_cities']} unique cities**

### Top 10 Cities by Institution Count

| City | Count |
|------|-------|
"""

    for city, count in combined_stats['cities'].most_common(10):
        report += f"| {city} | {count} |\n"

    # Data quality, provenance, file lists, next steps, and footer.
    report += f"""
## Data Quality Metrics

### Completeness

| Field | Count | Coverage |
|-------|-------|----------|
| Locations | {combined_stats['total']} | 100% |
| Coordinates (lat/lon) | {combined_stats['geocoded']} | {combined_stats['geocoding_rate']:.1f}% |
| Identifiers | {combined_stats['with_identifiers']} | {(combined_stats['with_identifiers']/combined_stats['total']*100):.1f}% |
| Digital Platforms | {combined_stats['with_platforms']} | {(combined_stats['with_platforms']/combined_stats['total']*100):.1f}% |
| Collections | {combined_stats['with_collections']} | {(combined_stats['with_collections']/combined_stats['total']*100):.1f}% |

### Data Provenance

All institutions in this combined dataset have:
- **Data Source**: CONVERSATION_NLP (extracted from Claude conversation JSON files)
- **Data Tier**: TIER_4_INFERRED (AI-extracted, requires verification)
- **Extraction Method**: Multi-stage AI extraction with Nominatim geocoding
- **Schema**: LinkML v0.2.0 (modular)

## Source Files

1. `brazilian_institutions.yaml` - {country_stats['brazilian']['total']} institutions
2. `chilean_institutions.yaml` - {country_stats['chilean']['total']} institutions
3. `mexican_institutions.yaml` - {country_stats['mexican']['total']} institutions

## Output Files

- **Combined Dataset**: `latin_american_institutions.yaml`
- **Report**: `latin_american_combination_report.md`

## Next Steps

1. **Validation**: Run LinkML schema validation on combined dataset
2. **Export**: Generate JSON-LD, RDF/Turtle, and CSV formats
3. **Visualization**: Create GeoJSON for geographic mapping
4. **Manual Review**: Verify high-priority institutions with low confidence scores
5. **Expansion**: Extract institutions from remaining 60+ country conversation files

## Notes

- All institution IDs are unique (no duplicates detected)
- Geographic coverage spans 3 countries, {combined_stats['unique_regions']} regions, {combined_stats['unique_cities']} cities
- Geocoding rate of {combined_stats['geocoding_rate']:.1f}% meets project quality targets
- Ready for integration with authoritative CSV sources (ISIL, Wikidata)

---

*Generated by `combine_latin_american_datasets.py`*
"""

    return report
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: run the full combination pipeline.
    combine_datasets()
|