178 lines
6.5 KiB
Python
Executable file
178 lines
6.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Merge Sachsen-Anhalt Enriched Museums + Archives
|
|
Creates complete Sachsen-Anhalt dataset with full metadata
|
|
"""
|
|
|
|
import json
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any
|
|
from collections import Counter
|
|
|
|
def convert_archive_to_linkml(archive: Dict[str, Any]) -> Dict[str, Any]:
    """Reshape a flat archive record into the nested LinkML-style layout.

    Args:
        archive: Flat record. The keys read are ``name``,
            ``institution_type``, ``address_text``, ``city``, ``region``,
            ``country``, ``provenance``, ``url``, ``email`` and ``phone``;
            all of them are optional.

    Returns:
        A dict with the keys ``name``, ``institution_type``, ``description``
        (taken from ``address_text``), ``locations`` (a single-entry list),
        ``identifiers`` and ``provenance``. One identifier is emitted for
        each of ``url``, ``email`` and ``phone`` that has a truthy value,
        in that order.
    """
    # (source key, identifier scheme, URL prefix); a prefix of None means
    # the raw value is reused unchanged as the identifier URL.
    contact_specs = (
        ("url", "Website", None),
        ("email", "Email", "mailto:"),
        ("phone", "Phone", "tel:"),
    )

    identifiers = []
    for source_key, scheme, url_prefix in contact_specs:
        value = archive.get(source_key)
        if not value:
            continue
        identifiers.append({
            "identifier_scheme": scheme,
            "identifier_value": value,
            "identifier_url": value if url_prefix is None else f"{url_prefix}{value}",
        })

    location = {
        "city": archive.get("city", ""),
        "region": archive.get("region", "Sachsen-Anhalt"),
        "country": archive.get("country", "DE"),
    }

    return {
        "name": archive.get("name", ""),
        "institution_type": archive.get("institution_type", "ARCHIVE"),
        "description": archive.get("address_text", ""),
        "locations": [location],
        "identifiers": identifiers,
        "provenance": archive.get("provenance", {}),
    }
|
|
|
|
def _has_location_field(inst: Dict[str, Any], field: str) -> bool:
    """Return True if any location of *inst* has a truthy value for *field*."""
    return any(loc.get(field) for loc in inst.get('locations') or [])


def _has_identifier(inst: Dict[str, Any], scheme: str) -> bool:
    """Return True if *inst* has an identifier whose scheme equals *scheme*."""
    # .get() so an identifier without 'identifier_scheme' is skipped
    # instead of raising KeyError.
    return any(
        ident.get('identifier_scheme') == scheme
        for ident in inst.get('identifiers') or []
    )


def _percent(count: int, total: int) -> float:
    """Return *count* as a percentage of *total*; 0.0 when *total* is zero."""
    return count / total * 100 if total else 0.0


def main():
    """Merge the latest museum and archive files into one dataset.

    Steps:
      1. Pick the lexicographically last file matching
         ``sachsen_anhalt_museums_enriched_*.json`` and
         ``sachsen_anhalt_archives_*.json`` under ``data/isil/germany``
         (relative to the current working directory). If either is missing,
         print an error and return.
      2. Convert every archive record with ``convert_archive_to_linkml``.
         The archive file may be a list, or a dict with an ``archives`` key.
      3. Print completeness, institution-type and city statistics.
      4. Write ``museums + archives`` to a timestamped
         ``sachsen_anhalt_complete_*.json`` file in the same directory.

    An empty merged dataset is reported with 0.0% completeness instead of
    raising ZeroDivisionError.
    """
    print("=" * 80)
    print("Merge Sachsen-Anhalt Complete Dataset")
    print("=" * 80)
    print()

    data_dir = Path('data/isil/germany')

    # Load enriched museums. Reverse name sort puts the last file name first;
    # presumably that is the newest one, given timestamped file names.
    museum_files = sorted(data_dir.glob('sachsen_anhalt_museums_enriched_*.json'), reverse=True)
    if not museum_files:
        print("❌ No enriched museum files found")
        return

    museum_file = museum_files[0]
    print(f"Loading museums: {museum_file.name}")
    with open(museum_file, 'r', encoding='utf-8') as f:
        museums = json.load(f)

    # Load archives (same selection rule as for museums)
    archive_files = sorted(data_dir.glob('sachsen_anhalt_archives_*.json'), reverse=True)
    if not archive_files:
        print("❌ No archive files found")
        return

    archive_file = archive_files[0]
    print(f"Loading archives: {archive_file.name}")
    with open(archive_file, 'r', encoding='utf-8') as f:
        archive_data = json.load(f)

    # Convert archives to LinkML format
    if isinstance(archive_data, dict) and 'archives' in archive_data:
        raw_archives = archive_data['archives']
    elif isinstance(archive_data, list):
        raw_archives = archive_data
    else:
        # Make the "zero archives" outcome visible instead of silent.
        print("⚠️ Unrecognized archive data structure; no archives loaded")
        raw_archives = []
    archives = [convert_archive_to_linkml(archive) for archive in raw_archives]

    print()
    print("Loaded:")
    print(f" Museums: {len(museums)}")
    print(f" Archives: {len(archives)}")
    print()

    # Merge
    merged = museums + archives
    total = len(merged)

    print(f"Total institutions: {total}")
    print()

    # Calculate completeness: (label, number of records that have the field)
    completeness = [
        ("Name", sum(1 for inst in merged if inst.get('name'))),
        ("Type", sum(1 for inst in merged if inst.get('institution_type'))),
        ("Description", sum(1 for inst in merged if inst.get('description'))),
        ("City", sum(1 for inst in merged if _has_location_field(inst, 'city'))),
        ("Street Address", sum(1 for inst in merged if _has_location_field(inst, 'street_address'))),
        ("Postal Code", sum(1 for inst in merged if _has_location_field(inst, 'postal_code'))),
        ("Website", sum(1 for inst in merged if _has_identifier(inst, 'Website'))),
        ("Phone", sum(1 for inst in merged if _has_identifier(inst, 'Phone'))),
        ("Email", sum(1 for inst in merged if _has_identifier(inst, 'Email'))),
    ]

    print("Data Completeness:")
    for label, count in completeness:
        print(f" {label}: {count:3d}/{total} ({_percent(count, total):5.1f}%)")
    print()

    # Institution types. A missing or falsy type is counted as UNKNOWN so
    # the ':20s' format below never receives None.
    type_counts = Counter(
        inst.get('institution_type') or 'UNKNOWN' for inst in merged
    )

    print("Institution Types:")
    for itype, count in type_counts.most_common():
        print(f" {itype:20s}: {count:3d}")
    print()

    # Geographic coverage: one count per location entry with a non-empty city
    city_counts = Counter(
        loc['city']
        for inst in merged
        for loc in inst.get('locations') or []
        if loc.get('city')
    )

    print(f"Geographic Coverage: {len(city_counts)} cities")
    print()
    print("Top 20 Cities:")
    for city, count in city_counts.most_common(20):
        print(f" {city:35s}: {count:2d}")
    print()

    # Save complete dataset
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = data_dir / f'sachsen_anhalt_complete_{timestamp}.json'

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total institutions: {total}")
    print()
    print("=" * 80)
    print("Sachsen-Anhalt Complete Dataset Ready!")
    print("=" * 80)
|
|
|
|
# Run the merge only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|