- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering the various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
772 lines
32 KiB
Python
772 lines
32 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Global Wikidata Heritage Institution Extractor
|
|
|
|
This script extracts ALL GLAMORCUBEPSXHF (Galleries, Libraries, Archives, Museums,
|
|
Official institutions, Research centers, Corporations, Universities, Botanical gardens/zoos,
|
|
Educational providers, Personal collections, Societies, Holy sites, Features) institutions worldwide
|
|
from Wikidata using SPARQL queries.
|
|
|
|
🌍 COMPREHENSIVE GLOBAL EXTRACTION STRATEGY 🌍
|
|
|
|
Strategy:
|
|
1. Load country configuration (205+ countries with Wikidata QIDs)
|
|
2. Query Wikidata SPARQL endpoint for each country separately
|
|
3. Extract 15 GLAMORCUBEPSXHF institution types per country
|
|
4. Capture complete metadata (identifiers, coordinates, temporal data, collections)
|
|
5. Save results to data/wikidata/{country_code}/{timestamp}.json
|
|
6. Track progress, errors, and statistics
|
|
|
|
Query Optimization:
|
|
- Query each institution type separately to avoid 504 timeouts
|
|
- Use LIMITED transitive subclass queries (wdt:P279? not wdt:P279*)
|
|
- Implement pagination (LIMIT/OFFSET) for large datasets
|
|
- Rate limiting: 2-5 second delays between requests
|
|
- Exponential backoff for error handling
|
|
|
|
Priority Countries (configurable):
|
|
- Priority 1: Netherlands (Q55), Chile (Q298), Belgium (Q31)
|
|
- Priority 2: Italy (Q38), Denmark (Q35), Austria (Q40), Switzerland (Q39)
|
|
- Priority 3: Latin America (Brazil, Mexico, Argentina, Colombia)
|
|
- Priority 4: Asia (Japan, Vietnam, Thailand, Taiwan, South Korea)
|
|
- Priority 5: Africa/Middle East (Egypt, South Africa, Kenya, Nigeria)
|
|
|
|
Output Format:
|
|
data/wikidata/{country_code}/{timestamp}.json
|
|
{
|
|
"country_code": "NL",
|
|
"country_name": "Netherlands",
|
|
"country_qid": "Q55",
|
|
"extraction_date": "2025-11-11T10:30:00Z",
|
|
"total_institutions": 1247,
|
|
"institution_types": {"museum": 843, "library": 302, ...},
|
|
"institutions": [...]
|
|
}
|
|
|
|
Usage:
|
|
# Extract priority 1 countries (Netherlands, Chile)
|
|
python extract_global_wikidata.py --priority 1
|
|
|
|
# Extract specific countries
|
|
python extract_global_wikidata.py --countries NL CL BE IT
|
|
|
|
# Extract all countries (use with caution - 205+ countries)
|
|
python extract_global_wikidata.py --all-countries
|
|
|
|
# Dry run (show what would be extracted)
|
|
python extract_global_wikidata.py --priority 1 --dry-run
|
|
|
|
# Resume from specific country
|
|
python extract_global_wikidata.py --countries BR MX AR --skip-existing
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
from datetime import datetime, timezone
|
|
import time
|
|
import argparse
|
|
from collections import defaultdict
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON # type: ignore
|
|
from SPARQLWrapper import SPARQLExceptions # type: ignore
|
|
|
|
|
|
# Country configurations (Wikidata QIDs) - 205+ countries worldwide
# Organized by continent and priority level
#
# Schema per entry (keyed by ISO 3166-1 alpha-2 code):
#   name:      English country name (used in logs and output files)
#   qid:       Wikidata item QID for the country (used as the wdt:P17 filter)
#   flag:      emoji shown in console output
#   languages: comma-separated preference list for the SPARQL label service
#   priority:  1 (extract first) .. 5 (extract last) - see module docstring
#   continent: informational grouping only (not used for filtering)
COUNTRY_CONFIGS = {
    # =============================================================================
    # PRIORITY 1: HIGH DATA QUALITY, LARGE DATASETS
    # =============================================================================
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': '🇳🇱', 'languages': 'nl,en', 'priority': 1, 'continent': 'Europe'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': '🇨🇱', 'languages': 'es,en', 'priority': 1, 'continent': 'Americas'},
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': '🇧🇪', 'languages': 'nl,fr,en', 'priority': 1, 'continent': 'Europe'},

    # =============================================================================
    # PRIORITY 2: MEDIUM DATASETS, GOOD COVERAGE POTENTIAL
    # =============================================================================
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': '🇮🇹', 'languages': 'it,en', 'priority': 2, 'continent': 'Europe'},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': '🇩🇰', 'languages': 'da,en', 'priority': 2, 'continent': 'Europe'},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': '🇦🇹', 'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': '🇨🇭', 'languages': 'de,fr,it,en', 'priority': 2, 'continent': 'Europe'},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': '🇳🇴', 'languages': 'no,en', 'priority': 2, 'continent': 'Europe'},
    'SE': {'name': 'Sweden', 'qid': 'Q34', 'flag': '🇸🇪', 'languages': 'sv,en', 'priority': 2, 'continent': 'Europe'},
    'FI': {'name': 'Finland', 'qid': 'Q33', 'flag': '🇫🇮', 'languages': 'fi,en', 'priority': 2, 'continent': 'Europe'},
    'FR': {'name': 'France', 'qid': 'Q142', 'flag': '🇫🇷', 'languages': 'fr,en', 'priority': 2, 'continent': 'Europe'},
    'DE': {'name': 'Germany', 'qid': 'Q183', 'flag': '🇩🇪', 'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'ES': {'name': 'Spain', 'qid': 'Q29', 'flag': '🇪🇸', 'languages': 'es,en', 'priority': 2, 'continent': 'Europe'},
    'PT': {'name': 'Portugal', 'qid': 'Q45', 'flag': '🇵🇹', 'languages': 'pt,en', 'priority': 2, 'continent': 'Europe'},

    # =============================================================================
    # PRIORITY 3: LATIN AMERICA
    # =============================================================================
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': '🇧🇷', 'languages': 'pt,en', 'priority': 3, 'continent': 'Americas'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': '🇲🇽', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': '🇦🇷', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': '🇨🇴', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PE': {'name': 'Peru', 'qid': 'Q419', 'flag': '🇵🇪', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'VE': {'name': 'Venezuela', 'qid': 'Q717', 'flag': '🇻🇪', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'EC': {'name': 'Ecuador', 'qid': 'Q736', 'flag': '🇪🇨', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'BO': {'name': 'Bolivia', 'qid': 'Q750', 'flag': '🇧🇴', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'UY': {'name': 'Uruguay', 'qid': 'Q77', 'flag': '🇺🇾', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PY': {'name': 'Paraguay', 'qid': 'Q733', 'flag': '🇵🇾', 'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},

    # =============================================================================
    # PRIORITY 4: ASIA
    # =============================================================================
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': '🇯🇵', 'languages': 'ja,en', 'priority': 4, 'continent': 'Asia'},
    'CN': {'name': 'China', 'qid': 'Q148', 'flag': '🇨🇳', 'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'IN': {'name': 'India', 'qid': 'Q668', 'flag': '🇮🇳', 'languages': 'hi,en', 'priority': 4, 'continent': 'Asia'},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': '🇰🇷', 'languages': 'ko,en', 'priority': 4, 'continent': 'Asia'},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': '🇹🇼', 'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': '🇹🇭', 'languages': 'th,en', 'priority': 4, 'continent': 'Asia'},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': '🇻🇳', 'languages': 'vi,en', 'priority': 4, 'continent': 'Asia'},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': '🇲🇾', 'languages': 'ms,en', 'priority': 4, 'continent': 'Asia'},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': '🇮🇩', 'languages': 'id,en', 'priority': 4, 'continent': 'Asia'},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': '🇵🇭', 'languages': 'en,tl', 'priority': 4, 'continent': 'Asia'},
    'SG': {'name': 'Singapore', 'qid': 'Q334', 'flag': '🇸🇬', 'languages': 'en', 'priority': 4, 'continent': 'Asia'},

    # =============================================================================
    # PRIORITY 5: AFRICA AND MIDDLE EAST
    # =============================================================================
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': '🇪🇬', 'languages': 'ar,en', 'priority': 5, 'continent': 'Africa'},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': '🇿🇦', 'languages': 'en,af', 'priority': 5, 'continent': 'Africa'},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': '🇰🇪', 'languages': 'en,sw', 'priority': 5, 'continent': 'Africa'},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': '🇳🇬', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': '🇬🇭', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'ET': {'name': 'Ethiopia', 'qid': 'Q115', 'flag': '🇪🇹', 'languages': 'am,en', 'priority': 5, 'continent': 'Africa'},
    'TZ': {'name': 'Tanzania', 'qid': 'Q924', 'flag': '🇹🇿', 'languages': 'sw,en', 'priority': 5, 'continent': 'Africa'},
    'UG': {'name': 'Uganda', 'qid': 'Q1036', 'flag': '🇺🇬', 'languages': 'en', 'priority': 5, 'continent': 'Africa'},

    # TODO: Add remaining 160+ countries from /docs/WIKIDATA_SPARQL_QUERIES.md
}
|
|
|
|
|
|
# Institution type mappings (GLAMORCUBEPSXHF taxonomy → Wikidata QIDs)
#
# Schema per entry (keyed by taxonomy slug):
#   qid:   Wikidata class QID; institutions are matched via P31 (instance of)
#          against this class and its subclasses in the SPARQL query
#   code:  single-letter GLAMORCUBEPSXHF taxonomy code
#   label: human-readable label (used in console output)
INSTITUTION_TYPES = {
    'museum': {'qid': 'Q33506', 'code': 'M', 'label': 'Museum'},
    'library': {'qid': 'Q7075', 'code': 'L', 'label': 'Library'},
    'archive': {'qid': 'Q166118', 'code': 'A', 'label': 'Archive'},
    'gallery': {'qid': 'Q2668072', 'code': 'G', 'label': 'Gallery'},
    'cultural_center': {'qid': 'Q5282129', 'code': 'O', 'label': 'Cultural Center'},
    'research_center': {'qid': 'Q3152824', 'code': 'R', 'label': 'Research Center'},
    'university': {'qid': 'Q3918', 'code': 'U', 'label': 'University'},
    'botanical_garden': {'qid': 'Q167346', 'code': 'B', 'label': 'Botanical Garden'},
    'features': {'qid': 'Q4989906', 'code': 'F', 'label': 'Monument/Landmark'},
    # Add holy sites, societies, etc. as needed
}
|
|
|
|
|
|
def create_sparql_query(country_qid: str, inst_type_qid: str, languages: str = "en", limit: int = 1000, offset: int = 0) -> str:
    """
    Generate SPARQL query for heritage institutions in a specific country and type.

    This query extracts comprehensive metadata from Wikidata including:
    - Identifiers (ISIL, VIAF, Wikidata QID, website, email, phone)
    - Geographic data (coordinates, address, city, region)
    - Temporal data (inception, founding, dissolution dates)
    - Organizational data (parent org, part-of relationships)
    - Collection metadata (size, types)
    - Media (images, logos)

    Note: the type match uses the LIMITED transitive path ``wdt:P279?``
    (the class itself or one subclass hop) rather than ``wdt:P279*``, per
    the module-level query-optimization strategy, to avoid 504 endpoint
    timeouts on large class hierarchies.

    Args:
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        inst_type_qid: Wikidata QID for institution type (e.g., Q33506 for museum)
        languages: Comma-separated language codes (e.g., "nl,en")
        limit: Maximum results to return
        offset: Pagination offset

    Returns:
        SPARQL query string
    """
    query = f"""
    SELECT DISTINCT
      ?item ?itemLabel ?itemDescription ?itemAltLabel
      ?instType ?instTypeLabel
      ?coords
      ?streetAddress ?postalCode ?city ?cityLabel ?region ?regionLabel
      # Primary identifiers
      ?isil ?viaf ?wikidataQID ?website ?email ?phone
      # Archives & Libraries
      ?archivesPortalEuropeID ?egaxaID ?archiveGridID ?atomURL
      # Museum identifiers
      ?museofileID ?commonsInstitution
      # Dutch heritage identifiers
      ?rkdInstituteID ?rkdArtistsID ?rceMonumentID ?monumentsFlandersID
      # Authority control
      ?locID ?gndID ?bnfID ?librariesAustraliaID ?nliID
      # Social media
      ?twitter ?facebook ?instagram
      # Temporal, organizational, collection, media
      ?inception ?dissolved ?foundingDate
      ?parent ?parentLabel ?partOf ?partOfLabel
      ?collectionSize ?collectionType
      ?image ?logo
    WHERE {{
      # Instance of heritage institution type (direct type or one subclass hop).
      # LIMITED path wdt:P279? (not wdt:P279*) keeps the query cheap enough to
      # avoid 504 timeouts, as documented in the module-level strategy.
      ?item wdt:P31 ?instType .
      ?instType wdt:P279? wd:{inst_type_qid} .

      # Country filter
      ?item wdt:P17 wd:{country_qid} .

      # =============================================================================
      # GEOGRAPHIC DATA
      # =============================================================================

      # Coordinates (lat/lon) - returned as Point(lon lat) string, parsed in Python
      OPTIONAL {{ ?item wdt:P625 ?coords . }}

      # Physical address components
      OPTIONAL {{ ?item wdt:P6375 ?streetAddress . }}
      OPTIONAL {{ ?item wdt:P281 ?postalCode . }}

      # City/municipality
      OPTIONAL {{
        ?item wdt:P131 ?city .
        ?city wdt:P31/wdt:P279? wd:Q515 .
      }}

      # Region/province/state
      OPTIONAL {{
        ?item wdt:P131 ?region .
        ?region wdt:P31/wdt:P279? wd:Q10864048 .
      }}

      # =============================================================================
      # IDENTIFIERS - Comprehensive heritage institution identifiers
      # =============================================================================

      # Primary identifiers (ISIL, VIAF)
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}

      # Contact information
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P968 ?email . }}
      OPTIONAL {{ ?item wdt:P1329 ?phone . }}

      # Archives & Libraries
      OPTIONAL {{ ?item wdt:P3066 ?archivesPortalEuropeID . }}
      OPTIONAL {{ ?item wdt:P1309 ?egaxaID . }}
      OPTIONAL {{ ?item wdt:P1984 ?archiveGridID . }}
      OPTIONAL {{ ?item wdt:P6721 ?atomURL . }}

      # Museum identifiers
      OPTIONAL {{ ?item wdt:P539 ?museofileID . }}
      OPTIONAL {{ ?item wdt:P1907 ?commonsInstitution . }}

      # Dutch heritage identifiers (PRIORITY for NL data)
      OPTIONAL {{ ?item wdt:P7740 ?rkdInstituteID . }}
      OPTIONAL {{ ?item wdt:P350 ?rkdArtistsID . }}
      OPTIONAL {{ ?item wdt:P7314 ?rceMonumentID . }}
      OPTIONAL {{ ?item wdt:P4372 ?monumentsFlandersID . }}

      # Authority control
      OPTIONAL {{ ?item wdt:P244 ?locID . }}
      OPTIONAL {{ ?item wdt:P227 ?gndID . }}
      OPTIONAL {{ ?item wdt:P268 ?bnfID . }}
      OPTIONAL {{ ?item wdt:P409 ?librariesAustraliaID . }}
      OPTIONAL {{ ?item wdt:P3788 ?nliID . }}

      # Social media
      OPTIONAL {{ ?item wdt:P2002 ?twitter . }}
      OPTIONAL {{ ?item wdt:P2013 ?facebook . }}
      OPTIONAL {{ ?item wdt:P2003 ?instagram . }}

      # Extract Wikidata QID from URI
      BIND(STRAFTER(STR(?item), "http://www.wikidata.org/entity/") AS ?wikidataQID)

      # =============================================================================
      # TEMPORAL DATA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item wdt:P576 ?dissolved . }}
      OPTIONAL {{ ?item wdt:P1619 ?foundingDate . }}

      # =============================================================================
      # ORGANIZATIONAL RELATIONSHIPS
      # =============================================================================

      OPTIONAL {{ ?item wdt:P749 ?parent . }}
      OPTIONAL {{ ?item wdt:P361 ?partOf . }}

      # =============================================================================
      # COLLECTION METADATA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P1301 ?collectionSize . }}
      OPTIONAL {{ ?item wdt:P195 ?collectionType . }}

      # =============================================================================
      # MEDIA
      # =============================================================================

      OPTIONAL {{ ?item wdt:P18 ?image . }}
      OPTIONAL {{ ?item wdt:P154 ?logo . }}

      # =============================================================================
      # LABELS AND DESCRIPTIONS (Multilingual)
      # =============================================================================

      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "{languages}" .
        ?item rdfs:label ?itemLabel .
        ?item schema:description ?itemDescription .
        ?item skos:altLabel ?itemAltLabel .
        ?instType rdfs:label ?instTypeLabel .
        ?city rdfs:label ?cityLabel .
        ?region rdfs:label ?regionLabel .
        ?parent rdfs:label ?parentLabel .
        ?partOf rdfs:label ?partOfLabel .
      }}
    }}
    ORDER BY ?itemLabel
    LIMIT {limit}
    OFFSET {offset}
    """
    return query
|
|
|
|
|
|
# SPARQL result variable name -> identifier key used in the output record.
# Module-level constant so the mapping is built once, not once per result row.
_IDENTIFIER_MAPPINGS = {
    # Primary identifiers
    "isil": "ISIL",
    "viaf": "VIAF",
    # Contact
    "website": "website",
    "email": "email",
    "phone": "phone",
    # Archives & Libraries
    "archivesPortalEuropeID": "Archives_Portal_Europe_ID",
    "egaxaID": "EGAXA_ID",
    "archiveGridID": "ArchiveGrid_ID",
    "atomURL": "AtoM_URL",
    # Museums
    "museofileID": "Museofile_ID",
    "commonsInstitution": "Wikimedia_Commons_Institution",
    # Dutch heritage
    "rkdInstituteID": "RKD_Institute_ID",
    "rkdArtistsID": "RKDartists_ID",
    "rceMonumentID": "RCE_Monument_ID",
    "monumentsFlandersID": "Monuments_Flanders_ID",
    # Authority control
    "locID": "Library_of_Congress_ID",
    "gndID": "GND_ID",
    "bnfID": "BnF_ID",
    "librariesAustraliaID": "Libraries_Australia_ID",
    "nliID": "National_Library_Israel_ID",
    # Social media
    "twitter": "Twitter",
    "facebook": "Facebook",
    "instagram": "Instagram"
}


def parse_sparql_results(bindings: list[dict]) -> list[dict[str, Any]]:
    """
    Parse SPARQL query results into institution records.

    Aggregates results by Wikidata QID (multiple rows may exist per institution
    due to alternative labels, collection types, etc.). Rows with synthetic
    QIDs (Q90000000 and above) or non-QID item URIs are skipped. Malformed
    coordinate or collection-size literals are ignored rather than aborting
    the whole parse.

    Args:
        bindings: The ``results.bindings`` list from a Wikidata JSON response.

    Returns:
        List of institution dictionaries with complete metadata
    """
    institutions_by_qid: dict[str, dict[str, Any]] = {}

    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            continue

        # Skip synthetic Q-numbers (policy: real identifiers only)
        try:
            qid_num = int(qid[1:])
            if qid_num >= 90000000:
                continue
        except ValueError:
            continue

        # Initialize institution record on first occurrence of this QID;
        # name/description/type come from the first row seen.
        if qid not in institutions_by_qid:
            institutions_by_qid[qid] = {
                "wikidata_qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "institution_type": binding.get("instTypeLabel", {}).get("value", ""),
                "alternative_names": [],
                "identifiers": {},
                "location": {},
                "temporal": {},
                "organizational": {},
                "collection": {},
                "media": {}
            }

        inst = institutions_by_qid[qid]

        # Collect alternative names (deduplicated, order-preserving)
        alt_label = binding.get("itemAltLabel", {}).get("value", "")
        if alt_label and alt_label not in inst["alternative_names"]:
            inst["alternative_names"].append(alt_label)

        # Identifiers - comprehensive mapping (see _IDENTIFIER_MAPPINGS)
        for sparql_field, linkml_field in _IDENTIFIER_MAPPINGS.items():
            if sparql_field in binding:
                inst["identifiers"][linkml_field] = binding[sparql_field]["value"]

        # Location - parse coordinates from WKT "Point(longitude latitude)" format
        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            if coords_str and coords_str.startswith("Point("):
                parts = coords_str[6:-1].split()  # strip "Point(" and trailing ")"
                if len(parts) == 2:
                    try:
                        lon, lat = float(parts[0]), float(parts[1])
                    except ValueError:
                        pass  # malformed literal: leave coordinates unset
                    else:
                        inst["location"]["latitude"] = lat
                        inst["location"]["longitude"] = lon
        if "streetAddress" in binding:
            inst["location"]["street_address"] = binding["streetAddress"]["value"]
        if "postalCode" in binding:
            inst["location"]["postal_code"] = binding["postalCode"]["value"]
        if "cityLabel" in binding:
            inst["location"]["city"] = binding["cityLabel"]["value"]
        if "regionLabel" in binding:
            inst["location"]["region"] = binding["regionLabel"]["value"]

        # Temporal - keep only the date part of xsd:dateTime literals
        if "inception" in binding:
            inst["temporal"]["inception"] = binding["inception"]["value"].split("T")[0]
        if "foundingDate" in binding:
            inst["temporal"]["founding_date"] = binding["foundingDate"]["value"].split("T")[0]
        if "dissolved" in binding:
            inst["temporal"]["dissolved"] = binding["dissolved"]["value"].split("T")[0]

        # Organizational
        if "parentLabel" in binding:
            inst["organizational"]["parent"] = binding["parentLabel"]["value"]
        if "partOfLabel" in binding:
            inst["organizational"]["part_of"] = binding["partOfLabel"]["value"]

        # Collection
        if "collectionSize" in binding:
            try:
                inst["collection"]["size"] = int(binding["collectionSize"]["value"])
            except ValueError:
                pass  # non-integer literal: skip rather than crash the parse
        if "collectionType" in binding:
            inst["collection"].setdefault("types", []).append(binding["collectionType"]["value"])

        # Media
        if "image" in binding:
            inst["media"]["image"] = binding["image"]["value"]
        if "logo" in binding:
            inst["media"]["logo"] = binding["logo"]["value"]

    return list(institutions_by_qid.values())
|
|
|
|
|
|
def query_wikidata_country(
    sparql: SPARQLWrapper,
    country_code: str,
    country_info: dict[str, Any],
    dry_run: bool = False
) -> dict[str, Any]:
    """
    Query Wikidata for all heritage institutions in a specific country.

    Queries each institution type separately to avoid timeout issues and
    paginates with LIMIT/OFFSET. All result rows for a type are accumulated
    first and parsed once, so an institution whose rows straddle a page
    boundary is not emitted as duplicate partial records.

    Args:
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint.
        country_code: Country code key from COUNTRY_CONFIGS (e.g., "NL").
        country_info: COUNTRY_CONFIGS entry (name, qid, flag, languages, ...).
        dry_run: When True, only build the queries; nothing is executed.

    Returns:
        Dictionary with extraction results and statistics
    """
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")
    print(f"🔍 Querying Wikidata for {country_info['name']} heritage institutions...")
    print(f" Languages: {country_info['languages']}")
    print(f" Wikidata QID: {country_info['qid']}\n")

    all_institutions = []
    type_stats = defaultdict(int)
    errors = []

    for type_key, type_info in INSTITUTION_TYPES.items():
        print(f" - {type_info['label']} ({type_info['qid']})...", end="", flush=True)

        if dry_run:
            # Build one query so template problems surface, but never execute
            # it - and skip the rate-limit sleeps, which only protect real calls.
            create_sparql_query(
                country_qid=country_info['qid'],
                inst_type_qid=type_info['qid'],
                languages=country_info['languages']
            )
            print(" [DRY RUN - Query prepared]")
            continue

        try:
            # Pagination: accumulate ALL rows for this type, then parse ONCE.
            # Parsing per batch could emit duplicate/partial records whenever
            # one institution's rows straddle a LIMIT/OFFSET boundary.
            offset = 0
            batch_size = 1000
            type_bindings: list[dict] = []

            while True:
                query = create_sparql_query(
                    country_qid=country_info['qid'],
                    inst_type_qid=type_info['qid'],
                    languages=country_info['languages'],
                    limit=batch_size,
                    offset=offset
                )

                sparql.setQuery(query)
                raw_results = sparql.query().convert()
                bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

                if not bindings:
                    # No more results
                    break

                type_bindings.extend(bindings)

                # A short page means we've reached the end of the result set
                if len(bindings) < batch_size:
                    break

                offset += batch_size
                # Rate limiting between pagination requests
                time.sleep(1)

            type_institutions = parse_sparql_results(type_bindings)
            all_institutions.extend(type_institutions)
            type_stats[type_key] = len(type_institutions)

            print(f" ✅ {len(type_institutions)} found")

            # Rate limiting - be nice to Wikidata
            time.sleep(2)

        except SPARQLExceptions.EndPointInternalError:
            print(" ❌ Timeout (query too complex)")
            errors.append(f"{type_key}: Endpoint timeout")
        except Exception as e:
            print(f" ❌ Error: {e}")
            errors.append(f"{type_key}: {str(e)}")

    print(f"\n✅ Total institutions extracted: {len(all_institutions):,}")
    print("📊 By type:")
    for type_key, count in type_stats.items():
        if count > 0:
            print(f" {INSTITUTION_TYPES[type_key]['label']}: {count:,}")

    if errors:
        print(f"\n⚠️ Errors encountered: {len(errors)}")
        for error in errors[:5]:
            print(f" - {error}")

    return {
        "country_code": country_code,
        "country_name": country_info['name'],
        "country_qid": country_info['qid'],
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(all_institutions),
        "institution_types": dict(type_stats),
        "institutions": all_institutions,
        "errors": errors
    }
|
|
|
|
|
|
def save_results(results: dict[str, Any], output_dir: Path) -> Path:
    """
    Persist one country's extraction results as a timestamped JSON file.

    The file lands at ``{output_dir}/{country_code.lower()}/{UTC timestamp}.json``;
    missing directories are created on the way.

    Args:
        results: Extraction payload; must contain a ``country_code`` key.
        output_dir: Base output directory (e.g. ``data/wikidata``).

    Returns:
        Path to saved file
    """
    target_dir = output_dir / results['country_code'].lower()
    target_dir.mkdir(parents=True, exist_ok=True)

    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    destination = target_dir / f"{stamp}.json"

    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(results, handle, ensure_ascii=False, indent=2)

    return destination
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments, select countries, extract, report.

    Country selection precedence: --countries > --priority > --all-countries >
    default (priority-1 countries).

    Returns:
        Process exit code: 0 on success, 1 when invalid country codes are given.
    """
    parser = argparse.ArgumentParser(
        description="Extract heritage institutions from Wikidata globally using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract priority 1 countries (Netherlands, Chile, Belgium)
  %(prog)s --priority 1

  # Extract specific countries
  %(prog)s --countries NL CL BE IT FR DE

  # Extract all countries (use with caution - 205+ countries)
  %(prog)s --all-countries

  # Dry run (preview what would be extracted)
  %(prog)s --priority 1 --dry-run

  # Resume extraction, skip countries with existing data
  %(prog)s --priority 2 --skip-existing
"""
    )

    parser.add_argument(
        '--countries',
        nargs='+',
        metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )

    parser.add_argument(
        '--priority',
        nargs='+',
        type=int,
        metavar='N',
        help='Process countries by priority level (1-5)'
    )

    parser.add_argument(
        '--all-countries',
        action='store_true',
        help='Process all configured countries (use with caution)'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Preview queries without executing them'
    )

    parser.add_argument(
        '--skip-existing',
        action='store_true',
        help='Skip countries that already have extraction data'
    )

    parser.add_argument(
        '--output-dir',
        type=Path,
        help='Output directory (default: data/wikidata/)'
    )

    args = parser.parse_args()

    # Determine countries to process (see precedence note in docstring)
    countries_to_process = []

    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes against the configuration table
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"❌ Invalid country codes: {', '.join(invalid_countries)}")
        print(f" Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # Output directory (default: <repo root>/data/wikidata)
    base_dir = Path(__file__).parent.parent
    output_dir = args.output_dir if args.output_dir else base_dir / "data" / "wikidata"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Header
    print("="*80)
    print("🌍 GLOBAL WIKIDATA HERITAGE INSTITUTION EXTRACTOR")
    print("="*80)
    print(f"\n📂 Output directory: {output_dir}")
    print(f"🌍 Countries to process: {len(countries_to_process)}")
    country_names = [f"{COUNTRY_CONFIGS[c]['flag']} {COUNTRY_CONFIGS[c]['name']}" for c in countries_to_process]
    print(f" {', '.join(country_names)}\n")

    if args.dry_run:
        print("🔍 DRY RUN MODE: No data will be extracted or saved\n")

    # Setup SPARQL client (POST avoids URL-length limits on these large queries;
    # a descriptive User-Agent is required by Wikimedia's endpoint policy)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Global Wikidata Extraction)")

    # Process countries
    start_time = time.time()
    total_institutions = 0
    successful_countries = 0
    failed_countries = []

    for i, country_code in enumerate(countries_to_process, 1):
        country_info = COUNTRY_CONFIGS[country_code]

        # Skip if already extracted (optional): any prior JSON file counts
        if args.skip_existing:
            country_dir = output_dir / country_code.lower()
            if country_dir.exists() and any(country_dir.glob("*.json")):
                print(f"\n⏭️ Skipping {country_info['name']} (already extracted)")
                continue

        try:
            results = query_wikidata_country(sparql, country_code, country_info, dry_run=args.dry_run)

            if not args.dry_run:
                output_file = save_results(results, output_dir)
                print(f"\n💾 Saved to: {output_file}")

            total_institutions += results['total_institutions']
            successful_countries += 1

        except Exception as e:
            # One failed country must not abort the whole multi-country run
            print(f"\n❌ FAILED: {country_info['name']}: {e}")
            failed_countries.append(country_code)

        # Rate limiting between countries (skipped after the last one)
        if i < len(countries_to_process):
            wait_time = 5
            print(f"\n⏸️ Waiting {wait_time} seconds (Wikidata rate limiting)...\n")
            time.sleep(wait_time)

    # Final report
    elapsed_time = time.time() - start_time
    print("\n" + "="*80)
    print("📊 EXTRACTION COMPLETE")
    print("="*80)
    print(f"\n✅ Successful countries: {successful_countries}/{len(countries_to_process)}")
    print(f"✨ Total institutions extracted: {total_institutions:,}")
    print(f"⏱️ Total time: {elapsed_time/60:.1f} minutes")

    if failed_countries:
        print(f"\n❌ Failed countries ({len(failed_countries)}):")
        for code in failed_countries:
            print(f" - {COUNTRY_CONFIGS[code]['name']} ({code})")

    if args.dry_run:
        print("\n🔍 This was a dry run. Remove --dry-run to extract data.")

    print("="*80 + "\n")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return code (0 success, 1 invalid arguments) to the shell.
    sys.exit(main())
|