#!/usr/bin/env python3
"""
Export Algerian heritage institutions to RDF (Turtle, RDF/XML, JSON-LD).
Based on Libya export script with multi-ontology support.
"""
import hashlib

import yaml
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, XSD, FOAF
from pathlib import Path
from datetime import datetime

# Define namespaces
BASE = Namespace("https://w3id.org/heritage/custodian/")
SCHEMA = Namespace("http://schema.org/")
ORG = Namespace("http://www.w3.org/ns/org#")
PROV = Namespace("http://www.w3.org/ns/prov#")
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
WD = Namespace("http://www.wikidata.org/entity/")

# Map YAML institution_type codes to schema.org classes.
# Loop-invariant, so defined once at module level rather than per institution.
_SCHEMA_TYPE_MAP = {
    'MUSEUM': SCHEMA.Museum,
    'LIBRARY': SCHEMA.Library,
    'ARCHIVE': SCHEMA.ArchiveOrganization,
    'EDUCATION_PROVIDER': SCHEMA.EducationalOrganization,
    'RESEARCH_CENTER': SCHEMA.ResearchOrganization,
    'OFFICIAL_INSTITUTION': SCHEMA.GovernmentOrganization,
    'PERSONAL_COLLECTION': SCHEMA.ArchiveOrganization,
}


def load_institutions():
    """Load Algerian institutions from YAML.

    Returns:
        The parsed YAML document (expected to be a list of institution dicts).
    """
    yaml_path = Path("data/instances/algeria/algerian_institutions.yaml")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)


def _stable_suffix(text):
    """Return a deterministic 4-digit suffix for *text*.

    Replaces the original ``hash(text) % 10000``: builtin ``hash()`` on
    strings is salted per interpreter run (PYTHONHASHSEED), which made the
    generated collection URIs change on every export. An MD5-based value is
    stable across runs and platforms (MD5 is used here only for bucketing,
    not security).
    """
    digest = hashlib.md5(text.encode('utf-8')).hexdigest()
    return int(digest, 16) % 10000


def _is_arabic(text):
    """Heuristic: True if *text* contains any char in the Arabic block.

    Uses inclusive bounds U+0600..U+06FF; the original check
    (``ord(c) > 1536 and ord(c) < 1791``) wrongly excluded both ends of
    the block.
    """
    return any('\u0600' <= c <= '\u06ff' for c in text)


def create_rdf_graph(institutions):
    """Create RDF graph with multi-ontology support.

    Every institution is typed simultaneously as org:Organization,
    prov:Organization, rico:CorporateBody and crm:E74_Group, plus a
    schema.org class derived from its ``institution_type``. Labels,
    identifiers, location, collections and provenance are attached when
    present in the source record.

    Args:
        institutions: List of institution dicts loaded from YAML.

    Returns:
        An ``rdflib.Graph`` with all triples and readable prefix bindings.
    """
    g = Graph()

    # Bind namespaces so serializations use readable prefixes
    g.bind("custodian", BASE)
    g.bind("schema", SCHEMA)
    g.bind("org", ORG)
    g.bind("prov", PROV)
    g.bind("rico", RICO)
    g.bind("crm", CRM)
    g.bind("skos", SKOS)
    g.bind("dcterms", DCTERMS)
    g.bind("foaf", FOAF)
    g.bind("wd", WD)

    for inst in institutions:
        # Institution URI: last path segment of the YAML id
        inst_id = inst['id'].split('/')[-1]
        inst_uri = BASE[inst_id]

        # Multi-ontology typing
        g.add((inst_uri, RDF.type, ORG.Organization))
        g.add((inst_uri, RDF.type, PROV.Organization))
        g.add((inst_uri, RDF.type, RICO.CorporateBody))
        g.add((inst_uri, RDF.type, CRM.E74_Group))

        # Schema.org type based on institution type (skipped when unmapped)
        inst_type = inst.get('institution_type')
        if inst_type in _SCHEMA_TYPE_MAP:
            g.add((inst_uri, RDF.type, _SCHEMA_TYPE_MAP[inst_type]))

        # Basic properties — primary names are tagged French
        g.add((inst_uri, SKOS.prefLabel, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, SCHEMA.name, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, RDFS.label, Literal(inst['name'], lang='fr')))

        # Alternative names, language-tagged by a simple script heuristic
        if inst.get('alternative_names'):
            for alt_name in inst['alternative_names']:
                lang = 'ar' if _is_arabic(alt_name) else 'en'
                g.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang=lang)))
                g.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang=lang)))

        # Description (assumed English in the source data)
        if inst.get('description'):
            g.add((inst_uri, DCTERMS.description, Literal(inst['description'], lang='en')))
            g.add((inst_uri, SCHEMA.description, Literal(inst['description'], lang='en')))

        # Identifiers: Wikidata → sameAs link; Website → homepage/url;
        # anything else → a "scheme:value" dcterms:identifier literal
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                scheme = ident['identifier_scheme']
                value = ident['identifier_value']
                if scheme == 'Wikidata':
                    g.add((inst_uri, SCHEMA.sameAs, WD[value]))
                    g.add((inst_uri, DCTERMS.identifier, Literal(value)))
                elif scheme == 'Website':
                    g.add((inst_uri, FOAF.homepage, URIRef(value)))
                    g.add((inst_uri, SCHEMA.url, URIRef(value)))
                else:
                    g.add((inst_uri, DCTERMS.identifier, Literal(f"{scheme}:{value}")))

        # Location — only the first entry is modelled, as in the source data
        if inst.get('locations'):
            loc = inst['locations'][0]
            loc_uri = BASE[f"{inst_id}_location"]
            g.add((loc_uri, RDF.type, SCHEMA.Place))
            g.add((inst_uri, SCHEMA.location, loc_uri))
            g.add((inst_uri, ORG.hasSite, loc_uri))
            if loc.get('city'):
                g.add((loc_uri, SCHEMA.addressLocality, Literal(loc['city'])))
            if loc.get('country'):
                g.add((loc_uri, SCHEMA.addressCountry, Literal(loc['country'])))
            if loc.get('street_address'):
                g.add((loc_uri, SCHEMA.streetAddress, Literal(loc['street_address'])))
            if loc.get('postal_code'):
                g.add((loc_uri, SCHEMA.postalCode, Literal(loc['postal_code'])))

        # Collections — URI suffix is now deterministic across runs
        if inst.get('collections'):
            for coll in inst['collections']:
                suffix = _stable_suffix(coll['collection_name'])
                coll_uri = BASE[f"{inst_id}_collection_{suffix}"]
                g.add((coll_uri, RDF.type, RICO.RecordSet))
                g.add((coll_uri, RDF.type, SCHEMA.Collection))
                g.add((inst_uri, RICO.hasOrHadConstituent, coll_uri))
                g.add((inst_uri, SCHEMA.hasOfferCatalog, coll_uri))
                g.add((coll_uri, SCHEMA.name, Literal(coll['collection_name'])))
                if coll.get('subject_areas'):
                    for subject in coll['subject_areas']:
                        g.add((coll_uri, SCHEMA.about, Literal(subject)))

        # Provenance of the source record itself
        if inst.get('provenance'):
            prov = inst['provenance']
            prov_uri = BASE[f"{inst_id}_provenance"]
            g.add((prov_uri, RDF.type, PROV.Entity))
            g.add((inst_uri, PROV.wasDerivedFrom, prov_uri))
            if prov.get('extraction_date'):
                g.add((prov_uri, PROV.generatedAtTime,
                       Literal(prov['extraction_date'], datatype=XSD.dateTime)))
            if prov.get('data_source'):
                g.add((prov_uri, DCTERMS.source, Literal(prov['data_source'])))

    return g


def export_formats(g, base_filename):
    """Export to Turtle, RDF/XML, and JSON-LD.

    Args:
        g: The populated ``rdflib.Graph``.
        base_filename: Output filename stem (extension added per format).
    """
    output_dir = Path("data/exports")
    # parents=True: don't fail when the data/ directory is missing too
    output_dir.mkdir(parents=True, exist_ok=True)

    formats = {
        'turtle': 'ttl',
        'xml': 'rdf',
        'json-ld': 'jsonld',
    }

    for fmt, ext in formats.items():
        output_file = output_dir / f"{base_filename}.{ext}"
        g.serialize(destination=str(output_file), format=fmt)
        size = output_file.stat().st_size / 1024
        print(f"✅ Exported {fmt.upper():10} → {output_file.name} ({size:.1f} KB)")


def generate_statistics(institutions):
    """Generate coverage statistics.

    Args:
        institutions: List of institution dicts loaded from YAML.

    Returns:
        Dict with total count, Wikidata coverage (count and percent),
        per-type counts, and the number of distinct cities. Note that
        ``cities`` is collected as a set but returned as an int count.
    """
    stats = {
        'total_institutions': len(institutions),
        'wikidata_coverage': 0,
        'institution_types': {},
        'cities': set(),
    }

    for inst in institutions:
        # Count Wikidata coverage (at most once per institution)
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                if ident['identifier_scheme'] == 'Wikidata':
                    stats['wikidata_coverage'] += 1
                    break

        # Count types
        inst_type = inst.get('institution_type', 'UNKNOWN')
        stats['institution_types'][inst_type] = stats['institution_types'].get(inst_type, 0) + 1

        # Collect cities; .get guards records whose location lacks a city
        if inst.get('locations'):
            city = inst['locations'][0].get('city')
            if city:
                stats['cities'].add(city)

    stats['cities'] = len(stats['cities'])
    # Guard against an empty dataset instead of raising ZeroDivisionError
    total = stats['total_institutions']
    stats['wikidata_coverage_pct'] = (stats['wikidata_coverage'] / total * 100) if total else 0.0
    return stats


def main():
    """Run the full export: load YAML, build graph, serialize, report stats."""
    print("Algeria Heritage Institutions RDF Export")
    print("=" * 60)

    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")

    print("\nCreating RDF graph with multi-ontology support...")
    g = create_rdf_graph(institutions)
    print(f"✅ Graph created: {len(g)} triples")

    print("\nExporting to multiple formats...")
    export_formats(g, "algeria_institutions")

    print("\nGenerating statistics...")
    stats = generate_statistics(institutions)

    stats_file = Path("data/exports/algeria_statistics.yaml")
    # utf-8 explicitly: allow_unicode=True emits non-ASCII characters
    with open(stats_file, 'w', encoding='utf-8') as f:
        yaml.dump(stats, f, default_flow_style=False, allow_unicode=True)
    print(f"✅ Statistics → {stats_file.name}")

    print("\n" + "=" * 60)
    print("ALGERIA RDF EXPORT COMPLETE")
    print("=" * 60)
    print(f"Total institutions: {stats['total_institutions']}")
    print(f"Wikidata coverage: {stats['wikidata_coverage_pct']:.1f}% ({stats['wikidata_coverage']}/{stats['total_institutions']})")
    print(f"Institution types: {len(stats['institution_types'])} categories")
    print(f"Cities covered: {stats['cities']}")
    print("=" * 60)


if __name__ == "__main__":
    main()