glam/scripts/export_algeria_to_rdf.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

227 lines
8.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Export Algerian heritage institutions to RDF (Turtle, RDF/XML, JSON-LD).
Based on Libya export script with multi-ontology support.
"""
import hashlib
from datetime import datetime
from pathlib import Path

import yaml
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, XSD, FOAF
# Define namespaces
# Project base namespace used to mint institution/location/collection URIs.
BASE = Namespace("https://w3id.org/heritage/custodian/")
# schema.org vocabulary (http variant).
SCHEMA = Namespace("http://schema.org/")
# W3C Organization Ontology.
ORG = Namespace("http://www.w3.org/ns/org#")
# W3C PROV-O provenance ontology.
PROV = Namespace("http://www.w3.org/ns/prov#")
# ICA Records in Contexts (RiC) ontology.
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
# CIDOC Conceptual Reference Model.
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
# Wikidata entity namespace (used for schema:sameAs links).
WD = Namespace("http://www.wikidata.org/entity/")
def load_institutions(yaml_path="data/instances/algeria/algerian_institutions.yaml"):
    """Load Algerian heritage institutions from a YAML file.

    Args:
        yaml_path: Path to the institutions YAML file. Defaults to the
            canonical Algeria dataset so existing callers are unaffected.

    Returns:
        The parsed YAML content (expected: a list of institution dicts).
    """
    with open(Path(yaml_path), 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def create_rdf_graph(institutions):
    """Create an RDF graph describing institutions with multi-ontology support.

    Each institution is typed as org:Organization, prov:Organization,
    rico:CorporateBody and crm:E74_Group, plus a schema.org class derived
    from its ``institution_type``. Labels, identifiers, the primary
    location, collections and provenance are mapped when present.

    Args:
        institutions: List of institution dicts as loaded from YAML.

    Returns:
        An rdflib.Graph containing all generated triples, with prefixes bound.
    """
    g = Graph()
    # Bind namespace prefixes so serializations use readable CURIEs.
    g.bind("custodian", BASE)
    g.bind("schema", SCHEMA)
    g.bind("org", ORG)
    g.bind("prov", PROV)
    g.bind("rico", RICO)
    g.bind("crm", CRM)
    g.bind("skos", SKOS)
    g.bind("dcterms", DCTERMS)
    g.bind("foaf", FOAF)
    g.bind("wd", WD)
    # schema.org class per YAML institution_type (loop-invariant; hoisted).
    type_map = {
        'MUSEUM': SCHEMA.Museum,
        'LIBRARY': SCHEMA.Library,
        'ARCHIVE': SCHEMA.ArchiveOrganization,
        'EDUCATION_PROVIDER': SCHEMA.EducationalOrganization,
        'RESEARCH_CENTER': SCHEMA.ResearchOrganization,
        'OFFICIAL_INSTITUTION': SCHEMA.GovernmentOrganization,
        'PERSONAL_COLLECTION': SCHEMA.ArchiveOrganization
    }
    for inst in institutions:
        # Mint the institution URI from the last path segment of its id.
        inst_id = inst['id'].split('/')[-1]
        inst_uri = BASE[inst_id]
        # Multi-ontology typing
        g.add((inst_uri, RDF.type, ORG.Organization))
        g.add((inst_uri, RDF.type, PROV.Organization))
        g.add((inst_uri, RDF.type, RICO.CorporateBody))
        g.add((inst_uri, RDF.type, CRM.E74_Group))
        inst_type = inst.get('institution_type')
        if inst_type in type_map:
            g.add((inst_uri, RDF.type, type_map[inst_type]))
        # Basic properties (primary names are French per the source data).
        g.add((inst_uri, SKOS.prefLabel, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, SCHEMA.name, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, RDFS.label, Literal(inst['name'], lang='fr')))
        # Alternative names
        if inst.get('alternative_names'):
            for alt_name in inst['alternative_names']:
                # Heuristic language detection: any char in the Arabic
                # Unicode block (U+0600..U+06FF) means Arabic, else English.
                # FIX: bounds are now inclusive — the original `> 1536 and
                # < 1791` excluded both block endpoints.
                lang = 'ar' if any(0x0600 <= ord(c) <= 0x06FF for c in alt_name) else 'en'
                g.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang=lang)))
                g.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang=lang)))
        # Description
        if inst.get('description'):
            g.add((inst_uri, DCTERMS.description, Literal(inst['description'], lang='en')))
            g.add((inst_uri, SCHEMA.description, Literal(inst['description'], lang='en')))
        # Identifiers: Wikidata and Website get dedicated properties;
        # everything else becomes a "scheme:value" dcterms:identifier.
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                scheme = ident['identifier_scheme']
                value = ident['identifier_value']
                if scheme == 'Wikidata':
                    g.add((inst_uri, SCHEMA.sameAs, WD[value]))
                    g.add((inst_uri, DCTERMS.identifier, Literal(value)))
                elif scheme == 'Website':
                    g.add((inst_uri, FOAF.homepage, URIRef(value)))
                    g.add((inst_uri, SCHEMA.url, URIRef(value)))
                else:
                    g.add((inst_uri, DCTERMS.identifier, Literal(f"{scheme}:{value}")))
        # Location: only the first (primary) location is exported.
        if inst.get('locations'):
            loc = inst['locations'][0]
            loc_uri = BASE[f"{inst_id}_location"]
            g.add((loc_uri, RDF.type, SCHEMA.Place))
            g.add((inst_uri, SCHEMA.location, loc_uri))
            g.add((inst_uri, ORG.hasSite, loc_uri))
            if loc.get('city'):
                g.add((loc_uri, SCHEMA.addressLocality, Literal(loc['city'])))
            if loc.get('country'):
                g.add((loc_uri, SCHEMA.addressCountry, Literal(loc['country'])))
            if loc.get('street_address'):
                g.add((loc_uri, SCHEMA.streetAddress, Literal(loc['street_address'])))
            if loc.get('postal_code'):
                g.add((loc_uri, SCHEMA.postalCode, Literal(loc['postal_code'])))
        # Collections
        if inst.get('collections'):
            for coll in inst['collections']:
                # FIX: built-in hash() of a str is salted per process
                # (PYTHONHASHSEED), so the original collection URIs changed
                # between runs. Use a deterministic digest instead.
                digest = int(hashlib.sha1(
                    coll['collection_name'].encode('utf-8')).hexdigest(), 16) % 10000
                coll_uri = BASE[f"{inst_id}_collection_{digest}"]
                g.add((coll_uri, RDF.type, RICO.RecordSet))
                g.add((coll_uri, RDF.type, SCHEMA.Collection))
                g.add((inst_uri, RICO.hasOrHadConstituent, coll_uri))
                g.add((inst_uri, SCHEMA.hasOfferCatalog, coll_uri))
                g.add((coll_uri, SCHEMA.name, Literal(coll['collection_name'])))
                if coll.get('subject_areas'):
                    for subject in coll['subject_areas']:
                        g.add((coll_uri, SCHEMA.about, Literal(subject)))
        # Provenance (PROV-O): link each institution to its extraction record.
        if inst.get('provenance'):
            prov = inst['provenance']
            prov_uri = BASE[f"{inst_id}_provenance"]
            g.add((prov_uri, RDF.type, PROV.Entity))
            g.add((inst_uri, PROV.wasDerivedFrom, prov_uri))
            if prov.get('extraction_date'):
                g.add((prov_uri, PROV.generatedAtTime, Literal(prov['extraction_date'], datatype=XSD.dateTime)))
            if prov.get('data_source'):
                g.add((prov_uri, DCTERMS.source, Literal(prov['data_source'])))
    return g
def export_formats(g, base_filename):
    """Serialize graph ``g`` to Turtle, RDF/XML and JSON-LD under data/exports.

    Args:
        g: The graph to export (anything exposing
           ``serialize(destination=..., format=...)``).
        base_filename: Stem used for the three output files.
    """
    output_dir = Path("data/exports")
    # FIX: parents=True so the export also works from a fresh checkout
    # where "data/" itself does not exist yet (exist_ok alone would raise
    # FileNotFoundError in that case).
    output_dir.mkdir(parents=True, exist_ok=True)
    # rdflib serialization format name -> output file extension.
    formats = {
        'turtle': 'ttl',
        'xml': 'rdf',
        'json-ld': 'jsonld'
    }
    for fmt, ext in formats.items():
        output_file = output_dir / f"{base_filename}.{ext}"
        g.serialize(destination=str(output_file), format=fmt)
        size = output_file.stat().st_size / 1024
        print(f"✅ Exported {fmt.upper():10}{output_file.name} ({size:.1f} KB)")
def generate_statistics(institutions):
    """Generate coverage statistics for the institution dataset.

    Args:
        institutions: List of institution dicts as loaded from YAML.

    Returns:
        dict with keys: total_institutions, wikidata_coverage (count of
        institutions holding a Wikidata identifier), wikidata_coverage_pct,
        institution_types (count per type) and cities (number of distinct
        cities across primary locations).
    """
    stats = {
        'total_institutions': len(institutions),
        'wikidata_coverage': 0,
        'institution_types': {},
        'cities': set()
    }
    for inst in institutions:
        # An institution counts once toward coverage no matter how many
        # Wikidata identifiers it carries (hence the break).
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                if ident['identifier_scheme'] == 'Wikidata':
                    stats['wikidata_coverage'] += 1
                    break
        # Count institution types; missing type buckets under UNKNOWN.
        inst_type = inst.get('institution_type', 'UNKNOWN')
        stats['institution_types'][inst_type] = stats['institution_types'].get(inst_type, 0) + 1
        # Collect the city of the primary (first) location, if recorded.
        if inst.get('locations'):
            # FIX: use .get — a location without 'city' raised KeyError
            # (the rest of the pipeline treats 'city' as optional).
            city = inst['locations'][0].get('city')
            if city:
                stats['cities'].add(city)
    # Collapse the set to a count for YAML-friendly output.
    stats['cities'] = len(stats['cities'])
    # FIX: guard against ZeroDivisionError on an empty dataset.
    if stats['total_institutions']:
        stats['wikidata_coverage_pct'] = (stats['wikidata_coverage'] / stats['total_institutions']) * 100
    else:
        stats['wikidata_coverage_pct'] = 0.0
    return stats
def main():
    """Run the full Algeria export pipeline: load, build RDF, export, summarize."""
    separator = "=" * 60
    print("Algeria Heritage Institutions RDF Export")
    print(separator)
    # Load the source dataset.
    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")
    # Build the multi-ontology graph.
    print("\nCreating RDF graph with multi-ontology support...")
    graph = create_rdf_graph(institutions)
    print(f"✅ Graph created: {len(graph)} triples")
    # Serialize to all supported formats.
    print("\nExporting to multiple formats...")
    export_formats(graph, "algeria_institutions")
    # Compute and persist coverage statistics.
    print("\nGenerating statistics...")
    summary = generate_statistics(institutions)
    stats_file = Path("data/exports/algeria_statistics.yaml")
    with open(stats_file, 'w') as out:
        yaml.dump(summary, out, default_flow_style=False, allow_unicode=True)
    print(f"✅ Statistics → {stats_file.name}")
    # Final report.
    print("\n" + separator)
    print("ALGERIA RDF EXPORT COMPLETE")
    print(separator)
    print(f"Total institutions: {summary['total_institutions']}")
    print(f"Wikidata coverage: {summary['wikidata_coverage_pct']:.1f}% ({summary['wikidata_coverage']}/{summary['total_institutions']})")
    print(f"Institution types: {len(summary['institution_types'])} categories")
    print(f"Cities covered: {summary['cities']}")
    print(separator)
if __name__ == "__main__":
    main()