- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
227 lines
8.8 KiB
Python
Executable file
227 lines
8.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Export Algerian heritage institutions to RDF (Turtle, RDF/XML, JSON-LD).
|
|
Based on Libya export script with multi-ontology support.
|
|
"""
|
|
|
|
import yaml
|
|
from rdflib import Graph, Namespace, URIRef, Literal
|
|
from rdflib.namespace import RDF, RDFS, SKOS, DCTERMS, XSD, FOAF
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
# Define namespaces
|
|
BASE = Namespace("https://w3id.org/heritage/custodian/")
|
|
SCHEMA = Namespace("http://schema.org/")
|
|
ORG = Namespace("http://www.w3.org/ns/org#")
|
|
PROV = Namespace("http://www.w3.org/ns/prov#")
|
|
RICO = Namespace("https://www.ica.org/standards/RiC/ontology#")
|
|
CRM = Namespace("http://www.cidoc-crm.org/cidoc-crm/")
|
|
WD = Namespace("http://www.wikidata.org/entity/")
|
|
|
|
def load_institutions():
|
|
"""Load Algerian institutions from YAML."""
|
|
yaml_path = Path("data/instances/algeria/algerian_institutions.yaml")
|
|
with open(yaml_path, 'r', encoding='utf-8') as f:
|
|
return yaml.safe_load(f)
|
|
|
|
def create_rdf_graph(institutions):
    """Build an RDF graph describing *institutions* with multi-ontology typing.

    Each institution is typed simultaneously as org:Organization,
    prov:Organization, rico:CorporateBody and crm:E74_Group, plus the
    matching schema.org class when the YAML declares an institution_type.

    Args:
        institutions: list of institution dicts as loaded from the
            Algeria YAML file (see load_institutions()).

    Returns:
        rdflib.Graph containing all triples, with readable prefixes bound.
    """
    import hashlib  # local import: only needed for stable collection URIs

    g = Graph()

    # Bind prefixes so serializations stay human-readable.
    for prefix, ns in (
        ("custodian", BASE), ("schema", SCHEMA), ("org", ORG),
        ("prov", PROV), ("rico", RICO), ("crm", CRM),
        ("skos", SKOS), ("dcterms", DCTERMS), ("foaf", FOAF), ("wd", WD),
    ):
        g.bind(prefix, ns)

    # schema.org class per YAML institution_type.
    # Loop-invariant: built once instead of once per institution.
    type_map = {
        'MUSEUM': SCHEMA.Museum,
        'LIBRARY': SCHEMA.Library,
        'ARCHIVE': SCHEMA.ArchiveOrganization,
        'EDUCATION_PROVIDER': SCHEMA.EducationalOrganization,
        'RESEARCH_CENTER': SCHEMA.ResearchOrganization,
        'OFFICIAL_INSTITUTION': SCHEMA.GovernmentOrganization,
        'PERSONAL_COLLECTION': SCHEMA.ArchiveOrganization,
    }

    for inst in institutions:
        # Institution URI: last path segment of the YAML id.
        inst_id = inst['id'].split('/')[-1]
        inst_uri = BASE[inst_id]

        # Multi-ontology typing.
        g.add((inst_uri, RDF.type, ORG.Organization))
        g.add((inst_uri, RDF.type, PROV.Organization))
        g.add((inst_uri, RDF.type, RICO.CorporateBody))
        g.add((inst_uri, RDF.type, CRM.E74_Group))

        inst_type = inst.get('institution_type')
        if inst_type in type_map:
            g.add((inst_uri, RDF.type, type_map[inst_type]))

        # Basic labels (primary names are French in the source data).
        g.add((inst_uri, SKOS.prefLabel, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, SCHEMA.name, Literal(inst['name'], lang='fr')))
        g.add((inst_uri, RDFS.label, Literal(inst['name'], lang='fr')))

        # Alternative names. Simple language heuristic: any character in the
        # Arabic Unicode block (U+0600..U+06FF, bounds inclusive) => 'ar'.
        for alt_name in inst.get('alternative_names') or []:
            lang = 'ar' if any(0x0600 <= ord(c) <= 0x06FF for c in alt_name) else 'en'
            g.add((inst_uri, SKOS.altLabel, Literal(alt_name, lang=lang)))
            g.add((inst_uri, SCHEMA.alternateName, Literal(alt_name, lang=lang)))

        # Description.
        if inst.get('description'):
            g.add((inst_uri, DCTERMS.description, Literal(inst['description'], lang='en')))
            g.add((inst_uri, SCHEMA.description, Literal(inst['description'], lang='en')))

        # Identifiers: Wikidata becomes a sameAs link, websites become
        # homepage/url, everything else is a scheme-prefixed literal.
        for ident in inst.get('identifiers') or []:
            scheme = ident['identifier_scheme']
            value = ident['identifier_value']

            if scheme == 'Wikidata':
                g.add((inst_uri, SCHEMA.sameAs, WD[value]))
                g.add((inst_uri, DCTERMS.identifier, Literal(value)))
            elif scheme == 'Website':
                g.add((inst_uri, FOAF.homepage, URIRef(value)))
                g.add((inst_uri, SCHEMA.url, URIRef(value)))
            else:
                g.add((inst_uri, DCTERMS.identifier, Literal(f"{scheme}:{value}")))

        # Location: first entry only, matching the original export scope.
        if inst.get('locations'):
            loc = inst['locations'][0]

            loc_uri = BASE[f"{inst_id}_location"]
            g.add((loc_uri, RDF.type, SCHEMA.Place))
            g.add((inst_uri, SCHEMA.location, loc_uri))
            g.add((inst_uri, ORG.hasSite, loc_uri))

            if loc.get('city'):
                g.add((loc_uri, SCHEMA.addressLocality, Literal(loc['city'])))
            if loc.get('country'):
                g.add((loc_uri, SCHEMA.addressCountry, Literal(loc['country'])))
            if loc.get('street_address'):
                g.add((loc_uri, SCHEMA.streetAddress, Literal(loc['street_address'])))
            if loc.get('postal_code'):
                g.add((loc_uri, SCHEMA.postalCode, Literal(loc['postal_code'])))

        # Collections. URI suffix is a content-stable digest of the name:
        # built-in hash() is salted per process (PYTHONHASHSEED), which made
        # collection URIs change from one run to the next.
        for coll in inst.get('collections') or []:
            digest = hashlib.md5(coll['collection_name'].encode('utf-8')).hexdigest()[:8]
            coll_uri = BASE[f"{inst_id}_collection_{digest}"]
            g.add((coll_uri, RDF.type, RICO.RecordSet))
            g.add((coll_uri, RDF.type, SCHEMA.Collection))
            g.add((inst_uri, RICO.hasOrHadConstituent, coll_uri))
            g.add((inst_uri, SCHEMA.hasOfferCatalog, coll_uri))

            g.add((coll_uri, SCHEMA.name, Literal(coll['collection_name'])))
            for subject in coll.get('subject_areas') or []:
                g.add((coll_uri, SCHEMA.about, Literal(subject)))

        # Provenance (PROV-O derivation metadata).
        if inst.get('provenance'):
            prov = inst['provenance']
            prov_uri = BASE[f"{inst_id}_provenance"]
            g.add((prov_uri, RDF.type, PROV.Entity))
            g.add((inst_uri, PROV.wasDerivedFrom, prov_uri))

            if prov.get('extraction_date'):
                g.add((prov_uri, PROV.generatedAtTime,
                       Literal(prov['extraction_date'], datatype=XSD.dateTime)))
            if prov.get('data_source'):
                g.add((prov_uri, DCTERMS.source, Literal(prov['data_source'])))

    return g
|
|
|
|
def export_formats(g, base_filename):
    """Serialize graph *g* to Turtle, RDF/XML and JSON-LD under data/exports/.

    Args:
        g: rdflib.Graph (anything exposing .serialize(destination, format)).
        base_filename: output file name without extension.
    """
    output_dir = Path("data/exports")
    # parents=True: also create "data/" when run from a fresh checkout,
    # otherwise mkdir raises FileNotFoundError on the missing parent.
    output_dir.mkdir(parents=True, exist_ok=True)

    # rdflib serialization format name -> file extension.
    formats = {
        'turtle': 'ttl',
        'xml': 'rdf',
        'json-ld': 'jsonld',
    }

    for fmt, ext in formats.items():
        output_file = output_dir / f"{base_filename}.{ext}"
        g.serialize(destination=str(output_file), format=fmt)
        size = output_file.stat().st_size / 1024
        print(f"✅ Exported {fmt.upper():10} → {output_file.name} ({size:.1f} KB)")
|
|
|
|
def generate_statistics(institutions):
    """Compute coverage statistics over the loaded institutions.

    Args:
        institutions: list of institution dicts from the YAML source.

    Returns:
        dict with keys:
            total_institutions: int
            wikidata_coverage: institutions having a Wikidata identifier
            wikidata_coverage_pct: coverage percentage (0.0 for empty input)
            institution_types: {type_name: count}
            cities: number of distinct first-location cities
    """
    stats = {
        'total_institutions': len(institutions),
        'wikidata_coverage': 0,
        'institution_types': {},
        'cities': set(),
    }

    for inst in institutions:
        # Wikidata coverage: count each institution at most once.
        if any(ident['identifier_scheme'] == 'Wikidata'
               for ident in inst.get('identifiers') or []):
            stats['wikidata_coverage'] += 1

        # Tally institution types.
        inst_type = inst.get('institution_type', 'UNKNOWN')
        stats['institution_types'][inst_type] = stats['institution_types'].get(inst_type, 0) + 1

        # Distinct cities (first location only). Guard the missing-'city'
        # case, as create_rdf_graph's data handling shows it can be absent.
        if inst.get('locations'):
            city = inst['locations'][0].get('city')
            if city:
                stats['cities'].add(city)

    stats['cities'] = len(stats['cities'])
    # Guard division by zero for an empty institutions list.
    total = stats['total_institutions']
    stats['wikidata_coverage_pct'] = (stats['wikidata_coverage'] / total * 100) if total else 0.0

    return stats
|
|
|
|
def main():
    """Run the full export pipeline: load, convert, serialize, summarize."""
    banner = "=" * 60
    print("Algeria Heritage Institutions RDF Export")
    print(banner)

    institutions = load_institutions()
    print(f"Loaded {len(institutions)} institutions")

    print("\nCreating RDF graph with multi-ontology support...")
    graph = create_rdf_graph(institutions)
    print(f"✅ Graph created: {len(graph)} triples")

    print("\nExporting to multiple formats...")
    export_formats(graph, "algeria_institutions")

    print("\nGenerating statistics...")
    stats = generate_statistics(institutions)
    stats_file = Path("data/exports/algeria_statistics.yaml")
    with stats_file.open('w') as handle:
        yaml.dump(stats, handle, default_flow_style=False, allow_unicode=True)
    print(f"✅ Statistics → {stats_file.name}")

    # Final summary banner.
    print("\n" + banner)
    print("ALGERIA RDF EXPORT COMPLETE")
    print(banner)
    print(f"Total institutions: {stats['total_institutions']}")
    print(f"Wikidata coverage: {stats['wikidata_coverage_pct']:.1f}% ({stats['wikidata_coverage']}/{stats['total_institutions']})")
    print(f"Institution types: {len(stats['institution_types'])} categories")
    print(f"Cities covered: {stats['cities']}")
    print(banner)
|
|
|
|
if __name__ == "__main__":
|
|
main()
|