glam/scripts/scrapers/harvest_sachsen_anhalt_ddb.py
2025-11-21 22:12:33 +01:00

267 lines
8.6 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Sachsen-Anhalt GLAM Institutions - DDB SPARQL Harvest
Extracts museums, libraries, and archives from Deutsche Digitale Bibliothek
Target: 50-100+ institutions with comprehensive metadata
"""
import json
import time
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

from SPARQLWrapper import SPARQLWrapper, JSON
def query_ddb_sachsen_anhalt() -> List[Dict[str, Any]]:
    """Run the DDB SPARQL query for Sachsen-Anhalt GLAM institutions.

    Returns:
        The raw SPARQL JSON result bindings (one dict per row), or an
        empty list when the query fails for any reason.
    """
    endpoint = "https://labs.ddb.de/app/ddb-spl/sparql"

    # Major cities used to restrict results to Sachsen-Anhalt.
    cities = [
        "Halle", "Magdeburg", "Dessau", "Dessau-Roßlau",
        "Wittenberg", "Quedlinburg", "Wernigerode", "Halberstadt",
        "Bernburg", "Weißenfels", "Naumburg", "Zeitz",
        "Sangerhausen", "Merseburg", "Köthen", "Aschersleben",
        "Eisleben", "Stendal", "Salzwedel", "Burg"
    ]

    # OR together one case-insensitive CONTAINS clause per city.
    place_filter = " || ".join(
        f'CONTAINS(LCASE(?place), "{c.lower()}")' for c in cities
    )

    query = f"""
PREFIX gndo: <https://d-nb.info/standards/elementset/gnd#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wgs84: <http://www.w3.org/2003/01/geo/wgs84_pos#>
PREFIX schema: <http://schema.org/>
SELECT DISTINCT ?institution ?name ?place ?lat ?long ?type ?gnd ?isil ?homepage WHERE {{
# Institution types: museums, libraries, archives
?institution rdf:type ?typeClass .
FILTER(?typeClass IN (gndo:Museum, gndo:Library, gndo:Archive))
# Institution name
?institution gndo:preferredNameForTheCorporateBody ?name .
# Location (filter for Sachsen-Anhalt cities)
?institution gndo:placeOfBusiness ?placeNode .
?placeNode gndo:preferredNameForThePlaceOrGeographicName ?place .
FILTER({place_filter})
# Optional: Coordinates
OPTIONAL {{
?placeNode wgs84:lat ?lat .
?placeNode wgs84:long ?long .
}}
# Optional: Type label
OPTIONAL {{
?typeClass rdfs:label ?type .
FILTER(LANG(?type) = "de" || LANG(?type) = "")
}}
# Optional: GND identifier
OPTIONAL {{
BIND(REPLACE(STR(?institution), "https://d-nb.info/gnd/", "") AS ?gnd)
}}
# Optional: ISIL code
OPTIONAL {{
?institution gndo:isilCode ?isil .
}}
# Optional: Homepage
OPTIONAL {{
?institution gndo:homepage ?homepage .
}}
}}
ORDER BY ?place ?name
LIMIT 500
"""

    print("Querying DDB SPARQL endpoint for Sachsen-Anhalt institutions...")
    print(f"Endpoint: {endpoint}")

    client = SPARQLWrapper(endpoint)
    client.setQuery(query)
    client.setReturnFormat(JSON)
    client.setTimeout(60)  # 60 second timeout

    # Best-effort: any failure (network, timeout, malformed response)
    # is reported and turned into an empty result list.
    try:
        response = client.query().convert()
        rows = response['results']['bindings']
        print(f"✅ Query returned {len(rows)} results")
        return rows
    except Exception as exc:
        print(f"❌ SPARQL query failed: {exc}")
        return []
def convert_to_linkml_format(sparql_results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Convert DDB SPARQL result bindings to LinkML heritage-custodian records.

    Args:
        sparql_results: Raw SPARQL JSON bindings; each binding maps a
            variable name to a ``{'value': <str>}`` dict. Missing
            variables are tolerated and fall back to defaults.

    Returns:
        One dict per binding with ``name``, ``institution_type``,
        ``locations``, ``identifiers`` and ``provenance`` keys.
    """

    def field(binding: Dict[str, Any], key: str, default: str = '') -> str:
        """Extract one variable's string value from a SPARQL binding."""
        return binding.get(key, {}).get('value', default)

    # German GND type labels -> LinkML institution types.
    # Hoisted out of the loop: the mapping is loop-invariant.
    type_mapping = {
        'Museum': 'MUSEUM',
        'Bibliothek': 'LIBRARY',
        'Archiv': 'ARCHIVE',
        'Kunstmuseum': 'MUSEUM',
        'Heimatmuseum': 'MUSEUM',
        'Stadtarchiv': 'ARCHIVE',
        'Universitätsbibliothek': 'LIBRARY',
    }

    # One timestamp per harvest run so every record carries identical
    # provenance (previously re-evaluated per record).
    extraction_date = datetime.now(timezone.utc).isoformat()

    institutions = []
    for result in sparql_results:
        # Extract fields with safe defaults.
        name = field(result, 'name', 'Unknown')
        place = field(result, 'place')
        inst_type = field(result, 'type', 'Unknown')
        gnd_uri = field(result, 'institution')
        gnd_id = field(result, 'gnd')
        isil = field(result, 'isil')
        homepage = field(result, 'homepage')
        lat = field(result, 'lat')
        lon = field(result, 'long')

        # Build location object.
        location = {
            'city': place,
            'country': 'DE',
            'region': 'Sachsen-Anhalt'
        }
        # Parse BOTH coordinates before assigning either, so a record
        # never ends up with a latitude but no longitude (the previous
        # code assigned latitude first and could leave it dangling when
        # float(lon) raised ValueError).
        if lat and lon:
            try:
                latitude, longitude = float(lat), float(lon)
            except ValueError:
                pass  # non-numeric coordinates: omit both
            else:
                location['latitude'] = latitude
                location['longitude'] = longitude

        # Build identifiers list (GND, ISIL, homepage — all optional).
        identifiers = []
        if gnd_id:
            identifiers.append({
                'identifier_scheme': 'GND',
                'identifier_value': gnd_id,
                'identifier_url': gnd_uri
            })
        if isil:
            identifiers.append({
                'identifier_scheme': 'ISIL',
                'identifier_value': isil,
                'identifier_url': f'https://sigel.staatsbibliothek-berlin.de/suche/?isil={isil}'
            })
        if homepage:
            identifiers.append({
                'identifier_scheme': 'Website',
                'identifier_value': homepage,
                'identifier_url': homepage
            })

        # Assemble the final institution record.
        institutions.append({
            'name': name,
            'institution_type': type_mapping.get(inst_type, 'UNKNOWN'),
            'locations': [location],
            'identifiers': identifiers,
            'provenance': {
                'data_source': 'DDB_SPARQL',
                'data_tier': 'TIER_2_VERIFIED',
                'extraction_date': extraction_date,
                'extraction_method': 'DDB SPARQL query filtered by Sachsen-Anhalt cities',
                'confidence_score': 0.95,
                'source_url': 'https://labs.ddb.de/app/ddb-spl/'
            }
        })
    return institutions
def main():
    """Main execution: query DDB, convert, deduplicate, report, and save."""
    print("=" * 80)
    print("Sachsen-Anhalt GLAM Institutions - DDB SPARQL Harvest")
    print("=" * 80)
    print()

    # Query DDB
    sparql_results = query_ddb_sachsen_anhalt()
    if not sparql_results:
        print("❌ No results from DDB query. Exiting.")
        return
    print()
    print(f"Converting {len(sparql_results)} results to LinkML format...")

    # Convert to LinkML format
    institutions = convert_to_linkml_format(sparql_results)

    # Deduplicate by (name, city), case-insensitive; first occurrence wins.
    seen = set()
    unique_institutions = []
    duplicates = 0
    for inst in institutions:
        key = (inst['name'].lower(), inst['locations'][0]['city'].lower())
        if key in seen:
            duplicates += 1
        else:
            seen.add(key)
            unique_institutions.append(inst)

    print(f"✅ Converted {len(institutions)} institutions")
    print(f" Removed {duplicates} duplicates")
    print(f" Final count: {len(unique_institutions)} unique institutions")
    print()

    # Statistics: Counter replaces the hand-rolled get()+1 counting;
    # most_common() yields the same descending-count, stable ordering
    # as the previous sorted(..., key=lambda x: -x[1]).
    type_counts = Counter(inst['institution_type'] for inst in unique_institutions)
    print("Institution Types:")
    for inst_type, count in type_counts.most_common():
        print(f" {inst_type}: {count}")
    print()

    city_counts = Counter(inst['locations'][0]['city'] for inst in unique_institutions)
    print("Top 10 Cities:")
    for city, count in city_counts.most_common(10):
        print(f" {city}: {count}")
    print()

    # Save to a timestamped JSON file under data/isil/germany/.
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_path = Path('data/isil/germany') / f'sachsen_anhalt_ddb_sparql_{timestamp}.json'
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(unique_institutions, f, ensure_ascii=False, indent=2)

    file_size_kb = output_path.stat().st_size / 1024
    print(f"✅ Saved to: {output_path}")
    print(f" File size: {file_size_kb:.1f} KB")
    print(f" Total institutions: {len(unique_institutions)}")
    print()
    print("=" * 80)
    print("DDB SPARQL harvest complete!")
    print("=" * 80)


if __name__ == '__main__':
    main()