glam/scripts/export_algeria_to_rdf.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

227 lines
8.8 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Export Algerian heritage institutions to RDF (Turtle, RDF/XML, JSON-LD).
Based on Libya export script with multi-ontology support.
"""
import hashlib
from datetime import datetime
from pathlib import Path

import yaml
from rdflib import Graph, Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, XSD, FOAF
# Define namespaces
# Project base namespace used to mint institution/location/collection URIs.
BASE = Namespace("https://w3id.org/heritage/custodian/")
# schema.org vocabulary (http variant).
SCHEMA = Namespace("http://schema.org/")
# W3C Organization Ontology.
ORG = Namespace("http://www.w3.org/ns/org#")
# W3C PROV-O provenance ontology.
PROV = Namespace("http://www.w3.org/ns/prov#")
# ICA Records in Contexts (RiC) ontology.
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
# CIDOC Conceptual Reference Model.
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
# Wikidata entity namespace (used for schema:sameAs links).
WD = Namespace("http://www.wikidata.org/entity/")
def load_institutions(yaml_path="data/instances/algeria/algerian_institutions.yaml"):
    """Load Algerian heritage institutions from a YAML file.

    Args:
        yaml_path: Path to the institutions YAML file. Defaults to the
            canonical Algeria dataset so existing callers are unaffected.

    Returns:
        The parsed YAML content (expected: a list of institution dicts).
    """
    with open(Path(yaml_path), 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
def create_rdf_graph(institutions):
    """Create an RDF graph describing institutions with multi-ontology support.

    Each institution is typed as org:Organization, prov:Organization,
    rico:CorporateBody and crm:E74_Group, plus a schema.org class derived
    from its ``institution_type``. Labels, identifiers, the primary
    location, collections and provenance are mapped when present.

    Args:
        institutions: List of institution dicts as loaded from YAML.

    Returns:
        An rdflib.Graph containing all generated triples, with prefixes bound.
    """
    g = Graph()
    # Bind namespace prefixes so serializations use readable CURIEs.
    g.bind("custodian", BASE)
    g.bind("schema", SCHEMA)
    g.bind("org", ORG)
    g.bind("prov", PROV)
    g.bind("rico", RICO)
    g.bind("crm", CRM)
    g.bind("skos", SKOS)
    g.bind("dcterms", DCTERMS)
    g.bind("foaf", FOAF)
    g.bind("wd", WD)
    # schema.org class per YAML institution_type (loop-invariant; hoisted).
    type_map = {
        'MUSEUM': SCHEMA.Museum,
        'LIBRARY': SCHEMA.Library,
        'ARCHIVE': SCHEMA.ArchiveOrganization,
        'EDUCATION_PROVIDER': SCHEMA.EducationalOrganization,
        'RESEARCH_CENTER': SCHEMA.ResearchOrganization,
        'OFFICIAL_INSTITUTION': SCHEMA.GovernmentOrganization,
        'PERSONAL_COLLECTION': SCHEMA.ArchiveOrganization
    }
    for inst in institutions:
        # Mint the institution URI from the last path segment of its id.
        inst_id = inst['id'].split('/')[-1]
        inst_uri = BASE[inst_id]
        # Multi-ontology typing
        g.add((inst_uri, RDF.type, ORG.Organization))
        g.add((inst_uri, RDF.type, PROV.Organization))
        g.add((inst_uri, RDF.type, RICO.CorporateBody))
        g.add((inst_uri, RDF.type, CRM.E74_Group))
        inst_type = inst.get('institution_type')
        if inst_type in type_map:
            g.add((inst_uri, RDF.type, type_map[inst_type]))
        # Basic properties (primary names are French per the source data).
        g.add((inst_uri, SKOS.prefLabel, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, SCHEMA.name, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, RDFS.label, Literal(inst['name'], lang='fr')))
        # Alternative names
        if inst.get('alternative_names'):
            for alt_name in inst['alternative_names']:
                # Heuristic language detection: any char in the Arabic
                # Unicode block (U+0600..U+06FF) means Arabic, else English.
                # FIX: bounds are now inclusive — the original `> 1536 and
                # < 1791` excluded both block endpoints.
                lang = 'ar' if any(0x0600 <= ord(c) <= 0x06FF for c in alt_name) else 'en'
                g.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang=lang)))
                g.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang=lang)))
        # Description
        if inst.get('description'):
            g.add((inst_uri, DCTERMS.description, Literal(inst['description'], lang='en')))
            g.add((inst_uri, SCHEMA.description, Literal(inst['description'], lang='en')))
        # Identifiers: Wikidata and Website get dedicated properties;
        # everything else becomes a "scheme:value" dcterms:identifier.
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                scheme = ident['identifier_scheme']
                value = ident['identifier_value']
                if scheme == 'Wikidata':
                    g.add((inst_uri, SCHEMA.sameAs, WD[value]))
                    g.add((inst_uri, DCTERMS.identifier, Literal(value)))
                elif scheme == 'Website':
                    g.add((inst_uri, FOAF.homepage, URIRef(value)))
                    g.add((inst_uri, SCHEMA.url, URIRef(value)))
                else:
                    g.add((inst_uri, DCTERMS.identifier, Literal(f"{scheme}:{value}")))
        # Location: only the first (primary) location is exported.
        if inst.get('locations'):
            loc = inst['locations'][0]
            loc_uri = BASE[f"{inst_id}_location"]
            g.add((loc_uri, RDF.type, SCHEMA.Place))
            g.add((inst_uri, SCHEMA.location, loc_uri))
            g.add((inst_uri, ORG.hasSite, loc_uri))
            if loc.get('city'):
                g.add((loc_uri, SCHEMA.addressLocality, Literal(loc['city'])))
            if loc.get('country'):
                g.add((loc_uri, SCHEMA.addressCountry, Literal(loc['country'])))
            if loc.get('street_address'):
                g.add((loc_uri, SCHEMA.streetAddress, Literal(loc['street_address'])))
            if loc.get('postal_code'):
                g.add((loc_uri, SCHEMA.postalCode, Literal(loc['postal_code'])))
        # Collections
        if inst.get('collections'):
            for coll in inst['collections']:
                # FIX: built-in hash() of a str is salted per process
                # (PYTHONHASHSEED), so the original collection URIs changed
                # between runs. Use a deterministic digest instead.
                digest = int(hashlib.sha1(
                    coll['collection_name'].encode('utf-8')).hexdigest(), 16) % 10000
                coll_uri = BASE[f"{inst_id}_collection_{digest}"]
                g.add((coll_uri, RDF.type, RICO.RecordSet))
                g.add((coll_uri, RDF.type, SCHEMA.Collection))
                g.add((inst_uri, RICO.hasOrHadConstituent, coll_uri))
                g.add((inst_uri, SCHEMA.hasOfferCatalog, coll_uri))
                g.add((coll_uri, SCHEMA.name, Literal(coll['collection_name'])))
                if coll.get('subject_areas'):
                    for subject in coll['subject_areas']:
                        g.add((coll_uri, SCHEMA.about, Literal(subject)))
        # Provenance (PROV-O): link each institution to its extraction record.
        if inst.get('provenance'):
            prov = inst['provenance']
            prov_uri = BASE[f"{inst_id}_provenance"]
            g.add((prov_uri, RDF.type, PROV.Entity))
            g.add((inst_uri, PROV.wasDerivedFrom, prov_uri))
            if prov.get('extraction_date'):
                g.add((prov_uri, PROV.generatedAtTime, Literal(prov['extraction_date'], datatype=XSD.dateTime)))
            if prov.get('data_source'):
                g.add((prov_uri, DCTERMS.source, Literal(prov['data_source'])))
    return g
def export_formats(g, base_filename):
    """Serialize graph ``g`` to Turtle, RDF/XML and JSON-LD under data/exports.

    Args:
        g: The graph to export (anything exposing
           ``serialize(destination=..., format=...)``).
        base_filename: Stem used for the three output files.
    """
    output_dir = Path("data/exports")
    # FIX: parents=True so the export also works from a fresh checkout
    # where "data/" itself does not exist yet (exist_ok alone would raise
    # FileNotFoundError in that case).
    output_dir.mkdir(parents=True, exist_ok=True)
    # rdflib serialization format name -> output file extension.
    formats = {
        'turtle': 'ttl',
        'xml': 'rdf',
        'json-ld': 'jsonld'
    }
    for fmt, ext in formats.items():
        output_file = output_dir / f"{base_filename}.{ext}"
        g.serialize(destination=str(output_file), format=fmt)
        size = output_file.stat().st_size / 1024
        print(f"✅ Exported {fmt.upper():10}{output_file.name} ({size:.1f} KB)")
def generate_statistics(institutions):
    """Generate coverage statistics for the institution dataset.

    Args:
        institutions: List of institution dicts as loaded from YAML.

    Returns:
        dict with keys: total_institutions, wikidata_coverage (count of
        institutions holding a Wikidata identifier), wikidata_coverage_pct,
        institution_types (count per type) and cities (number of distinct
        cities across primary locations).
    """
    stats = {
        'total_institutions': len(institutions),
        'wikidata_coverage': 0,
        'institution_types': {},
        'cities': set()
    }
    for inst in institutions:
        # An institution counts once toward coverage no matter how many
        # Wikidata identifiers it carries (hence the break).
        if inst.get('identifiers'):
            for ident in inst['identifiers']:
                if ident['identifier_scheme'] == 'Wikidata':
                    stats['wikidata_coverage'] += 1
                    break
        # Count institution types; missing type buckets under UNKNOWN.
        inst_type = inst.get('institution_type', 'UNKNOWN')
        stats['institution_types'][inst_type] = stats['institution_types'].get(inst_type, 0) + 1
        # Collect the city of the primary (first) location, if recorded.
        if inst.get('locations'):
            # FIX: use .get — a location without 'city' raised KeyError
            # (the rest of the pipeline treats 'city' as optional).
            city = inst['locations'][0].get('city')
            if city:
                stats['cities'].add(city)
    # Collapse the set to a count for YAML-friendly output.
    stats['cities'] = len(stats['cities'])
    # FIX: guard against ZeroDivisionError on an empty dataset.
    if stats['total_institutions']:
        stats['wikidata_coverage_pct'] = (stats['wikidata_coverage'] / stats['total_institutions']) * 100
    else:
        stats['wikidata_coverage_pct'] = 0.0
    return stats
def main():
    """Run the full Algeria export pipeline: load, build RDF, export, summarize."""
    separator = "=" * 60
    print("Algeria Heritage Institutions RDF Export")
    print(separator)
    # Load the source dataset.
    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")
    # Build the multi-ontology graph.
    print("\nCreating RDF graph with multi-ontology support...")
    graph = create_rdf_graph(institutions)
    print(f"✅ Graph created: {len(graph)} triples")
    # Serialize to all supported formats.
    print("\nExporting to multiple formats...")
    export_formats(graph, "algeria_institutions")
    # Compute and persist coverage statistics.
    print("\nGenerating statistics...")
    summary = generate_statistics(institutions)
    stats_file = Path("data/exports/algeria_statistics.yaml")
    with open(stats_file, 'w') as out:
        yaml.dump(summary, out, default_flow_style=False, allow_unicode=True)
    print(f"✅ Statistics → {stats_file.name}")
    # Final report.
    print("\n" + separator)
    print("ALGERIA RDF EXPORT COMPLETE")
    print(separator)
    print(f"Total institutions: {summary['total_institutions']}")
    print(f"Wikidata coverage: {summary['wikidata_coverage_pct']:.1f}% ({summary['wikidata_coverage']}/{summary['total_institutions']})")
    print(f"Institution types: {len(summary['institution_types'])} categories")
    print(f"Cities covered: {summary['cities']}")
    print(separator)
if __name__ == "__main__":
    main()