267 lines
8.6 KiB
Python
Executable file
267 lines
8.6 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Sachsen-Anhalt GLAM Institutions - DDB SPARQL Harvest
|
|
Extracts museums, libraries, and archives from Deutsche Digitale Bibliothek
|
|
Target: 50-100+ institutions with comprehensive metadata
|
|
"""
|
|
|
|
import json
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

from SPARQLWrapper import SPARQLWrapper, JSON
|
|
|
|
def query_ddb_sachsen_anhalt(
    endpoint: str = "https://labs.ddb.de/app/ddb-spl/sparql",
    cities: List[str] = None,
    limit: int = 500,
    timeout: int = 60,
) -> List[Dict[str, Any]]:
    """Query the DDB SPARQL endpoint for GLAM institutions in Sachsen-Anhalt.

    Selects museums, libraries, and archives (GND corporate-body types) whose
    place of business matches one of the given city names, together with
    optional coordinates, type label, GND id, ISIL code, and homepage.

    Args:
        endpoint: SPARQL endpoint URL. Defaults to the DDB labs endpoint.
        cities: City names used to filter ``?place`` (case-insensitive
            substring match). Defaults to the major cities of Sachsen-Anhalt.
        limit: Maximum number of rows requested (``LIMIT`` clause).
        timeout: HTTP timeout in seconds for the SPARQL request.

    Returns:
        The raw JSON result bindings (list of dicts keyed by variable name),
        or an empty list when the query fails.
    """
    if cities is None:
        # Major cities in Sachsen-Anhalt.
        # NOTE(review): the filter below uses substring CONTAINS, so short
        # names like "Burg" or "Halle" also match places such as "Hamburg",
        # "Brandenburg", or "Hallein". Substring matching is kept on purpose
        # to catch forms like "Lutherstadt Wittenberg", but downstream
        # consumers may need to post-filter results by region.
        cities = [
            "Halle", "Magdeburg", "Dessau", "Dessau-Roßlau",
            "Wittenberg", "Quedlinburg", "Wernigerode", "Halberstadt",
            "Bernburg", "Weißenfels", "Naumburg", "Zeitz",
            "Sangerhausen", "Merseburg", "Köthen", "Aschersleben",
            "Eisleben", "Stendal", "Salzwedel", "Burg"
        ]

    # Build the city filter clause for SPARQL: one case-insensitive
    # substring test per city, OR-ed together.
    city_filter = " || ".join(
        f'CONTAINS(LCASE(?place), "{city.lower()}")' for city in cities
    )

    query = f"""
    PREFIX gndo: <https://d-nb.info/standards/elementset/gnd#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
    PREFIX schema: <http://schema.org/>

    SELECT DISTINCT ?institution ?name ?place ?lat ?long ?type ?gnd ?isil ?homepage WHERE {{
      # Institution types: museums, libraries, archives
      ?institution rdf:type ?typeClass .
      FILTER(?typeClass IN (gndo:Museum, gndo:Library, gndo:Archive))

      # Institution name
      ?institution gndo:preferredNameForTheCorporateBody ?name .

      # Location (filter for Sachsen-Anhalt cities)
      ?institution gndo:placeOfBusiness ?placeNode .
      ?placeNode gndo:preferredNameForThePlaceOrGeographicName ?place .
      FILTER({city_filter})

      # Optional: Coordinates
      OPTIONAL {{
        ?placeNode wgs84:lat ?lat .
        ?placeNode wgs84:long ?long .
      }}

      # Optional: Type label
      OPTIONAL {{
        ?typeClass rdfs:label ?type .
        FILTER(LANG(?type) = "de" || LANG(?type) = "")
      }}

      # Optional: GND identifier
      OPTIONAL {{
        BIND(REPLACE(STR(?institution), "https://d-nb.info/gnd/", "") AS ?gnd)
      }}

      # Optional: ISIL code
      OPTIONAL {{
        ?institution gndo:isilCode ?isil .
      }}

      # Optional: Homepage
      OPTIONAL {{
        ?institution gndo:homepage ?homepage .
      }}
    }}
    ORDER BY ?place ?name
    LIMIT {limit}
    """

    print("Querying DDB SPARQL endpoint for Sachsen-Anhalt institutions...")
    print(f"Endpoint: {endpoint}")

    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.setTimeout(timeout)

    try:
        results = sparql.query().convert()
        bindings = results['results']['bindings']
        print(f"✅ Query returned {len(bindings)} results")
        return bindings
    except Exception as e:
        # Best-effort harvest: network/endpoint failures are reported and
        # turned into an empty result so the caller can exit gracefully.
        print(f"❌ SPARQL query failed: {e}")
        return []
|
|
|
|
def convert_to_linkml_format(sparql_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert DDB SPARQL result bindings to LinkML heritage custodian records.

    Args:
        sparql_results: Raw JSON bindings as returned by
            ``query_ddb_sachsen_anhalt`` — one dict per row, each variable
            mapped to a ``{'value': ...}`` binding dict.

    Returns:
        A list of institution records with ``name``, ``institution_type``,
        ``locations``, ``identifiers``, and ``provenance`` fields. Rows with
        unmapped type labels get ``institution_type == 'UNKNOWN'``; malformed
        coordinates are dropped silently (the record itself is kept).
    """
    def _value(row: Dict[str, Any], key: str, default: str = '') -> str:
        """Extract the 'value' field of one SPARQL binding, with a default."""
        return row.get(key, {}).get('value', default)

    # Map German GND type labels to LinkML institution types.
    # Hoisted out of the loop: the table is constant for the whole call.
    type_mapping = {
        'Museum': 'MUSEUM',
        'Bibliothek': 'LIBRARY',
        'Archiv': 'ARCHIVE',
        'Kunstmuseum': 'MUSEUM',
        'Heimatmuseum': 'MUSEUM',
        'Stadtarchiv': 'ARCHIVE',
        'Universitätsbibliothek': 'LIBRARY',
    }

    # One timestamp for the whole batch keeps provenance records consistent
    # (and avoids a per-row clock call).
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []

    for result in sparql_results:
        # Extract fields with safe defaults.
        name = _value(result, 'name', 'Unknown')
        place = _value(result, 'place')
        inst_type = _value(result, 'type', 'Unknown')
        gnd_uri = _value(result, 'institution')
        gnd_id = _value(result, 'gnd')
        isil = _value(result, 'isil')
        homepage = _value(result, 'homepage')
        lat = _value(result, 'lat')
        lon = _value(result, 'long')

        linkml_type = type_mapping.get(inst_type, 'UNKNOWN')

        # Build location object; coordinates only when both parse as floats.
        location = {
            'city': place,
            'country': 'DE',
            'region': 'Sachsen-Anhalt'
        }
        if lat and lon:
            try:
                location['latitude'] = float(lat)
                location['longitude'] = float(lon)
            except ValueError:
                pass  # malformed coordinates: keep the record, drop the point

        # Build identifiers list (GND, ISIL, homepage — whichever are present).
        identifiers = []
        if gnd_id:
            identifiers.append({
                'identifier_scheme': 'GND',
                'identifier_value': gnd_id,
                'identifier_url': gnd_uri
            })
        if isil:
            identifiers.append({
                'identifier_scheme': 'ISIL',
                'identifier_value': isil,
                'identifier_url': f'https://sigel.staatsbibliothek-berlin.de/suche/?isil={isil}'
            })
        if homepage:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': homepage,
                'identifier_url': homepage
            })

        # Assemble the final institution record.
        institutions.append({
            'name': name,
            'institution_type': linkml_type,
            'locations': [location],
            'identifiers': identifiers,
            'provenance': {
                'data_source': 'DDB_SPARQL',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': extraction_date,
                'extraction_method': 'DDB SPARQL query filtered by Sachsen-Anhalt cities',
                'confidence_score': 0.95,
                'source_url': 'https://labs.ddb.de/app/ddb-spl/'
            }
        })

    return institutions
|
|
|
|
def main():
    """Main execution: query DDB, convert, deduplicate, report, and save.

    Writes the unique institutions to
    ``data/isil/germany/sachsen_anhalt_ddb_sparql_<timestamp>.json``
    (directories are created as needed) and prints summary statistics.
    """
    print("=" * 80)
    print("Sachsen-Anhalt GLAM Institutions - DDB SPARQL Harvest")
    print("=" * 80)
    print()

    # Query DDB
    sparql_results = query_ddb_sachsen_anhalt()

    if not sparql_results:
        print("❌ No results from DDB query. Exiting.")
        return

    print()
    print(f"Converting {len(sparql_results)} results to LinkML format...")

    # Convert to LinkML format
    institutions = convert_to_linkml_format(sparql_results)

    # Deduplicate by (name, city), case-insensitive; first occurrence wins.
    seen = set()
    unique_institutions = []
    duplicates = 0

    for inst in institutions:
        key = (inst['name'].lower(), inst['locations'][0]['city'].lower())
        if key not in seen:
            seen.add(key)
            unique_institutions.append(inst)
        else:
            duplicates += 1

    print(f"✅ Converted {len(institutions)} institutions")
    print(f"   Removed {duplicates} duplicates")
    print(f"   Final count: {len(unique_institutions)} unique institutions")
    print()

    # Statistics: Counter replaces the hand-rolled frequency dicts;
    # most_common() sorts by count descending with ties in insertion order,
    # matching the previous stable sort.
    type_counts = Counter(inst['institution_type'] for inst in unique_institutions)

    print("Institution Types:")
    for inst_type, count in type_counts.most_common():
        print(f"   {inst_type}: {count}")
    print()

    city_counts = Counter(inst['locations'][0]['city'] for inst in unique_institutions)

    print("Top 10 Cities:")
    for city, count in city_counts.most_common(10):
        print(f"   {city}: {count}")
    print()

    # Save to JSON (timestamped filename so repeated harvests don't clobber).
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path('data/isil/germany') / f'sachsen_anhalt_ddb_sparql_{timestamp}.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(unique_institutions, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024

    print(f"✅ Saved to: {output_path}")
    print(f"   File size: {file_size_kb:.1f} KB")
    print(f"   Total institutions: {len(unique_institutions)}")
    print()
    print("=" * 80)
    print("DDB SPARQL harvest complete!")
    print("=" * 80)
|
|
|
|
# Script entry point: run the harvest only when executed directly.
if __name__ == '__main__':
    main()
|