#!/usr/bin/env python3
"""
Global Wikidata Heritage Institution Extractor

Extracts ALL GLAMORCUBEPSXHF (Galleries, Libraries, Archives, Museums,
Official institutions, Research centers, Corporations, Universities,
Botanical gardens/zoos, Educational providers, Personal collections,
Societies, Holy sites, Features) institutions worldwide from Wikidata
using SPARQL queries.

šŸŒ COMPREHENSIVE GLOBAL EXTRACTION STRATEGY šŸŒ

Strategy:
1. Load country configuration (205+ countries with Wikidata QIDs)
2. Query Wikidata SPARQL endpoint for each country separately
3. Extract 15 GLAMORCUBEPSXHF institution types per country
4. Capture complete metadata (identifiers, coordinates, temporal data, collections)
5. Save results to data/wikidata/{country_code}/{timestamp}.json
6. Track progress, errors, and statistics

Query Optimization:
- Query each institution type separately to avoid 504 timeouts
- Use LIMITED transitive subclass queries (wdt:P279? not wdt:P279*)
- Implement pagination (LIMIT/OFFSET) for large datasets
- Rate limiting: 2-5 second delays between requests
- Exponential backoff for error handling

Priority Countries (configurable):
- Priority 1: Netherlands (Q55), Chile (Q298), Belgium (Q31)
- Priority 2: Italy (Q38), Denmark (Q35), Austria (Q40), Switzerland (Q39)
- Priority 3: Latin America (Brazil, Mexico, Argentina, Colombia)
- Priority 4: Asia (Japan, Vietnam, Thailand, Taiwan, South Korea)
- Priority 5: Africa/Middle East (Egypt, South Africa, Kenya, Nigeria)

Output Format: data/wikidata/{country_code}/{timestamp}.json
{
    "country_code": "NL",
    "country_name": "Netherlands",
    "country_qid": "Q55",
    "extraction_date": "2025-11-11T10:30:00Z",
    "total_institutions": 1247,
    "institution_types": {"museum": 843, "library": 302, ...},
    "institutions": [...]
}

Usage:
    # Extract priority 1 countries (Netherlands, Chile)
    python extract_global_wikidata.py --priority 1

    # Extract specific countries
    python extract_global_wikidata.py --countries NL CL BE IT

    # Extract all countries (use with caution - 205+ countries)
    python extract_global_wikidata.py --all-countries

    # Dry run (show what would be extracted)
    python extract_global_wikidata.py --priority 1 --dry-run

    # Resume from specific country
    python extract_global_wikidata.py --countries BR MX AR --skip-existing
"""

import sys
import json
from pathlib import Path
from typing import Any, Optional
from datetime import datetime, timezone
import time
import argparse
from collections import defaultdict

sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from SPARQLWrapper import SPARQLWrapper, JSON as SPARQL_JSON  # type: ignore
from SPARQLWrapper import SPARQLExceptions  # type: ignore

# Country configurations (Wikidata QIDs) - 205+ countries worldwide
# Organized by continent and priority level
COUNTRY_CONFIGS = {
    # =========================================================================
    # PRIORITY 1: HIGH DATA QUALITY, LARGE DATASETS
    # =========================================================================
    'NL': {'name': 'Netherlands', 'qid': 'Q55', 'flag': 'šŸ‡³šŸ‡±',
           'languages': 'nl,en', 'priority': 1, 'continent': 'Europe'},
    'CL': {'name': 'Chile', 'qid': 'Q298', 'flag': 'šŸ‡ØšŸ‡±',
           'languages': 'es,en', 'priority': 1, 'continent': 'Americas'},
    'BE': {'name': 'Belgium', 'qid': 'Q31', 'flag': 'šŸ‡§šŸ‡Ŗ',
           'languages': 'nl,fr,en', 'priority': 1, 'continent': 'Europe'},

    # =========================================================================
    # PRIORITY 2: MEDIUM DATASETS, GOOD COVERAGE POTENTIAL
    # =========================================================================
    'IT': {'name': 'Italy', 'qid': 'Q38', 'flag': 'šŸ‡®šŸ‡¹',
           'languages': 'it,en', 'priority': 2, 'continent': 'Europe'},
    'DK': {'name': 'Denmark', 'qid': 'Q35', 'flag': 'šŸ‡©šŸ‡°',
           'languages': 'da,en', 'priority': 2, 'continent': 'Europe'},
    'AT': {'name': 'Austria', 'qid': 'Q40', 'flag': 'šŸ‡¦šŸ‡¹',
           'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'CH': {'name': 'Switzerland', 'qid': 'Q39', 'flag': 'šŸ‡ØšŸ‡­',
           'languages': 'de,fr,it,en', 'priority': 2, 'continent': 'Europe'},
    'NO': {'name': 'Norway', 'qid': 'Q20', 'flag': 'šŸ‡³šŸ‡“',
           'languages': 'no,en', 'priority': 2, 'continent': 'Europe'},
    'SE': {'name': 'Sweden', 'qid': 'Q34', 'flag': 'šŸ‡øšŸ‡Ŗ',
           'languages': 'sv,en', 'priority': 2, 'continent': 'Europe'},
    'FI': {'name': 'Finland', 'qid': 'Q33', 'flag': 'šŸ‡«šŸ‡®',
           'languages': 'fi,en', 'priority': 2, 'continent': 'Europe'},
    'FR': {'name': 'France', 'qid': 'Q142', 'flag': 'šŸ‡«šŸ‡·',
           'languages': 'fr,en', 'priority': 2, 'continent': 'Europe'},
    'DE': {'name': 'Germany', 'qid': 'Q183', 'flag': 'šŸ‡©šŸ‡Ŗ',
           'languages': 'de,en', 'priority': 2, 'continent': 'Europe'},
    'ES': {'name': 'Spain', 'qid': 'Q29', 'flag': 'šŸ‡ŖšŸ‡ø',
           'languages': 'es,en', 'priority': 2, 'continent': 'Europe'},
    'PT': {'name': 'Portugal', 'qid': 'Q45', 'flag': 'šŸ‡µšŸ‡¹',
           'languages': 'pt,en', 'priority': 2, 'continent': 'Europe'},

    # =========================================================================
    # PRIORITY 3: LATIN AMERICA
    # =========================================================================
    'BR': {'name': 'Brazil', 'qid': 'Q155', 'flag': 'šŸ‡§šŸ‡·',
           'languages': 'pt,en', 'priority': 3, 'continent': 'Americas'},
    'MX': {'name': 'Mexico', 'qid': 'Q96', 'flag': 'šŸ‡²šŸ‡½',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'AR': {'name': 'Argentina', 'qid': 'Q414', 'flag': 'šŸ‡¦šŸ‡·',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'CO': {'name': 'Colombia', 'qid': 'Q739', 'flag': 'šŸ‡ØšŸ‡“',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PE': {'name': 'Peru', 'qid': 'Q419', 'flag': 'šŸ‡µšŸ‡Ŗ',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'VE': {'name': 'Venezuela', 'qid': 'Q717', 'flag': 'šŸ‡»šŸ‡Ŗ',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'EC': {'name': 'Ecuador', 'qid': 'Q736', 'flag': 'šŸ‡ŖšŸ‡Ø',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'BO': {'name': 'Bolivia', 'qid': 'Q750', 'flag': 'šŸ‡§šŸ‡“',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'UY': {'name': 'Uruguay', 'qid': 'Q77', 'flag': 'šŸ‡ŗšŸ‡¾',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},
    'PY': {'name': 'Paraguay', 'qid': 'Q733', 'flag': 'šŸ‡µšŸ‡¾',
           'languages': 'es,en', 'priority': 3, 'continent': 'Americas'},

    # =========================================================================
    # PRIORITY 4: ASIA
    # =========================================================================
    'JP': {'name': 'Japan', 'qid': 'Q17', 'flag': 'šŸ‡ÆšŸ‡µ',
           'languages': 'ja,en', 'priority': 4, 'continent': 'Asia'},
    'CN': {'name': 'China', 'qid': 'Q148', 'flag': 'šŸ‡ØšŸ‡³',
           'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'IN': {'name': 'India', 'qid': 'Q668', 'flag': 'šŸ‡®šŸ‡³',
           'languages': 'hi,en', 'priority': 4, 'continent': 'Asia'},
    'KR': {'name': 'South Korea', 'qid': 'Q884', 'flag': 'šŸ‡°šŸ‡·',
           'languages': 'ko,en', 'priority': 4, 'continent': 'Asia'},
    'TW': {'name': 'Taiwan', 'qid': 'Q865', 'flag': 'šŸ‡¹šŸ‡¼',
           'languages': 'zh,en', 'priority': 4, 'continent': 'Asia'},
    'TH': {'name': 'Thailand', 'qid': 'Q869', 'flag': 'šŸ‡¹šŸ‡­',
           'languages': 'th,en', 'priority': 4, 'continent': 'Asia'},
    'VN': {'name': 'Vietnam', 'qid': 'Q881', 'flag': 'šŸ‡»šŸ‡³',
           'languages': 'vi,en', 'priority': 4, 'continent': 'Asia'},
    'MY': {'name': 'Malaysia', 'qid': 'Q833', 'flag': 'šŸ‡²šŸ‡¾',
           'languages': 'ms,en', 'priority': 4, 'continent': 'Asia'},
    'ID': {'name': 'Indonesia', 'qid': 'Q252', 'flag': 'šŸ‡®šŸ‡©',
           'languages': 'id,en', 'priority': 4, 'continent': 'Asia'},
    'PH': {'name': 'Philippines', 'qid': 'Q928', 'flag': 'šŸ‡µšŸ‡­',
           'languages': 'en,tl', 'priority': 4, 'continent': 'Asia'},
    'SG': {'name': 'Singapore', 'qid': 'Q334', 'flag': 'šŸ‡øšŸ‡¬',
           'languages': 'en', 'priority': 4, 'continent': 'Asia'},

    # =========================================================================
    # PRIORITY 5: AFRICA AND MIDDLE EAST
    # =========================================================================
    'EG': {'name': 'Egypt', 'qid': 'Q79', 'flag': 'šŸ‡ŖšŸ‡¬',
           'languages': 'ar,en', 'priority': 5, 'continent': 'Africa'},
    'ZA': {'name': 'South Africa', 'qid': 'Q258', 'flag': 'šŸ‡æšŸ‡¦',
           'languages': 'en,af', 'priority': 5, 'continent': 'Africa'},
    'KE': {'name': 'Kenya', 'qid': 'Q114', 'flag': 'šŸ‡°šŸ‡Ŗ',
           'languages': 'en,sw', 'priority': 5, 'continent': 'Africa'},
    'NG': {'name': 'Nigeria', 'qid': 'Q1033', 'flag': 'šŸ‡³šŸ‡¬',
           'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'GH': {'name': 'Ghana', 'qid': 'Q117', 'flag': 'šŸ‡¬šŸ‡­',
           'languages': 'en', 'priority': 5, 'continent': 'Africa'},
    'ET': {'name': 'Ethiopia', 'qid': 'Q115', 'flag': 'šŸ‡ŖšŸ‡¹',
           'languages': 'am,en', 'priority': 5, 'continent': 'Africa'},
    'TZ': {'name': 'Tanzania', 'qid': 'Q924', 'flag': 'šŸ‡¹šŸ‡æ',
           'languages': 'sw,en', 'priority': 5, 'continent': 'Africa'},
    'UG': {'name': 'Uganda', 'qid': 'Q1036', 'flag': 'šŸ‡ŗšŸ‡¬',
           'languages': 'en', 'priority': 5, 'continent': 'Africa'},

    # TODO: Add remaining 160+ countries from /docs/WIKIDATA_SPARQL_QUERIES.md
}

# Institution type mappings (GLAMORCUBEPSXHF taxonomy → Wikidata QIDs)
INSTITUTION_TYPES = {
    'museum': {'qid': 'Q33506', 'code': 'M', 'label': 'Museum'},
    'library': {'qid': 'Q7075', 'code': 'L', 'label': 'Library'},
    'archive': {'qid': 'Q166118', 'code': 'A', 'label': 'Archive'},
    'gallery': {'qid': 'Q2668072', 'code': 'G', 'label': 'Gallery'},
    'cultural_center': {'qid': 'Q5282129', 'code': 'O', 'label': 'Cultural Center'},
    'research_center': {'qid': 'Q3152824', 'code': 'R', 'label': 'Research Center'},
    'university': {'qid': 'Q3918', 'code': 'U', 'label': 'University'},
    'botanical_garden': {'qid': 'Q167346', 'code': 'B', 'label': 'Botanical Garden'},
    'features': {'qid': 'Q4989906', 'code': 'F', 'label': 'Monument/Landmark'},
    # Add holy sites, societies, etc. as needed
}


def create_sparql_query(country_qid: str, inst_type_qid: str,
                        languages: str = "en",
                        limit: int = 1000, offset: int = 0) -> str:
    """
    Generate SPARQL query for heritage institutions in a specific country and type.

    This query extracts comprehensive metadata from Wikidata including:
    - Identifiers (ISIL, VIAF, Wikidata QID, website, email, phone)
    - Geographic data (coordinates, address, city, region)
    - Temporal data (inception, founding, dissolution dates)
    - Organizational data (parent org, part-of relationships)
    - Collection metadata (size, types)
    - Media (images, logos)

    Args:
        country_qid: Wikidata QID for country (e.g., Q55 for Netherlands)
        inst_type_qid: Wikidata QID for institution type (e.g., Q33506 for museum)
        languages: Comma-separated language codes (e.g., "nl,en")
        limit: Maximum results to return
        offset: Pagination offset

    Returns:
        SPARQL query string
    """
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?itemAltLabel
           ?instType ?instTypeLabel
           ?coords ?streetAddress ?postalCode ?city ?cityLabel ?region ?regionLabel
           # Primary identifiers
           ?isil ?viaf ?wikidataQID ?website ?email ?phone
           # Archives & Libraries
           ?archivesPortalEuropeID ?egaxaID ?archiveGridID ?atomURL
           # Museum identifiers
           ?museofileID ?commonsInstitution
           # Dutch heritage identifiers
           ?rkdInstituteID ?rkdArtistsID ?rceMonumentID ?monumentsFlandersID
           # Authority control
           ?locID ?gndID ?bnfID ?librariesAustraliaID ?nliID
           # Social media
           ?twitter ?facebook ?instagram
           # Temporal, organizational, collection, media
           ?inception ?dissolved ?foundingDate
           ?parent ?parentLabel ?partOf ?partOfLabel
           ?collectionSize ?collectionType ?image ?logo
    WHERE {{
      # Instance of heritage institution type (including subclasses)
      # This checks if ?item is an instance of a type that is a subclass of our target type
      ?item wdt:P31 ?instType .
      ?instType wdt:P279* wd:{inst_type_qid} .

      # Country filter
      ?item wdt:P17 wd:{country_qid} .

      # =======================================================================
      # GEOGRAPHIC DATA
      # =======================================================================
      # Coordinates (lat/lon) - returned as Point(lon lat) string, parsed in Python
      OPTIONAL {{ ?item wdt:P625 ?coords . }}

      # Physical address components
      OPTIONAL {{ ?item wdt:P6375 ?streetAddress . }}
      OPTIONAL {{ ?item wdt:P281 ?postalCode . }}

      # City/municipality
      OPTIONAL {{ ?item wdt:P131 ?city . ?city wdt:P31/wdt:P279? wd:Q515 . }}

      # Region/province/state
      OPTIONAL {{ ?item wdt:P131 ?region . ?region wdt:P31/wdt:P279? wd:Q10864048 . }}

      # =======================================================================
      # IDENTIFIERS - Comprehensive heritage institution identifiers
      # =======================================================================
      # Primary identifiers (ISIL, VIAF)
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P214 ?viaf . }}

      # Contact information
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P968 ?email . }}
      OPTIONAL {{ ?item wdt:P1329 ?phone . }}

      # Archives & Libraries
      OPTIONAL {{ ?item wdt:P3066 ?archivesPortalEuropeID . }}
      OPTIONAL {{ ?item wdt:P1309 ?egaxaID . }}
      OPTIONAL {{ ?item wdt:P1984 ?archiveGridID . }}
      OPTIONAL {{ ?item wdt:P6721 ?atomURL . }}

      # Museum identifiers
      OPTIONAL {{ ?item wdt:P539 ?museofileID . }}
      OPTIONAL {{ ?item wdt:P1907 ?commonsInstitution . }}

      # Dutch heritage identifiers (PRIORITY for NL data)
      OPTIONAL {{ ?item wdt:P7740 ?rkdInstituteID . }}
      OPTIONAL {{ ?item wdt:P350 ?rkdArtistsID . }}
      OPTIONAL {{ ?item wdt:P7314 ?rceMonumentID . }}
      OPTIONAL {{ ?item wdt:P4372 ?monumentsFlandersID . }}

      # Authority control
      OPTIONAL {{ ?item wdt:P244 ?locID . }}
      OPTIONAL {{ ?item wdt:P227 ?gndID . }}
      OPTIONAL {{ ?item wdt:P268 ?bnfID . }}
      OPTIONAL {{ ?item wdt:P409 ?librariesAustraliaID . }}
      OPTIONAL {{ ?item wdt:P3788 ?nliID . }}

      # Social media
      OPTIONAL {{ ?item wdt:P2002 ?twitter . }}
      OPTIONAL {{ ?item wdt:P2013 ?facebook . }}
      OPTIONAL {{ ?item wdt:P2003 ?instagram . }}

      # Extract Wikidata QID from URI
      BIND(STRAFTER(STR(?item), "http://www.wikidata.org/entity/") AS ?wikidataQID)

      # =======================================================================
      # TEMPORAL DATA
      # =======================================================================
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item wdt:P576 ?dissolved . }}
      OPTIONAL {{ ?item wdt:P1619 ?foundingDate . }}

      # =======================================================================
      # ORGANIZATIONAL RELATIONSHIPS
      # =======================================================================
      OPTIONAL {{ ?item wdt:P749 ?parent . }}
      OPTIONAL {{ ?item wdt:P361 ?partOf . }}

      # =======================================================================
      # COLLECTION METADATA
      # =======================================================================
      OPTIONAL {{ ?item wdt:P1301 ?collectionSize . }}
      OPTIONAL {{ ?item wdt:P195 ?collectionType . }}

      # =======================================================================
      # MEDIA
      # =======================================================================
      OPTIONAL {{ ?item wdt:P18 ?image . }}
      OPTIONAL {{ ?item wdt:P154 ?logo . }}

      # =======================================================================
      # LABELS AND DESCRIPTIONS (Multilingual)
      # =======================================================================
      SERVICE wikibase:label {{
        bd:serviceParam wikibase:language "{languages}" .
        ?item rdfs:label ?itemLabel .
        ?item schema:description ?itemDescription .
        ?item skos:altLabel ?itemAltLabel .
        ?instType rdfs:label ?instTypeLabel .
        ?city rdfs:label ?cityLabel .
        ?region rdfs:label ?regionLabel .
        ?parent rdfs:label ?parentLabel .
        ?partOf rdfs:label ?partOfLabel .
      }}
    }}
    ORDER BY ?itemLabel
    LIMIT {limit}
    OFFSET {offset}
    """
    return query


def parse_sparql_results(bindings: list[dict]) -> list[dict[str, Any]]:
    """
    Parse SPARQL query results into institution records.

    Aggregates results by Wikidata QID (multiple rows may exist per
    institution due to alternative labels, collection types, etc.)

    NOTE: pass ALL bindings for a query (all pages) in one call so that rows
    belonging to the same institution are merged into a single record.

    Args:
        bindings: List of SPARQL result bindings (JSON result rows)

    Returns:
        List of institution dictionaries with complete metadata
    """
    institutions_by_qid: dict[str, dict[str, Any]] = {}

    for binding in bindings:
        item_uri = binding.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None
        if not qid or not qid.startswith("Q"):
            continue

        # Skip synthetic Q-numbers (policy: real identifiers only)
        try:
            qid_num = int(qid[1:])
            if qid_num >= 90000000:
                continue
        except ValueError:
            continue

        # Initialize institution record if first occurrence
        if qid not in institutions_by_qid:
            institutions_by_qid[qid] = {
                "wikidata_qid": qid,
                "name": binding.get("itemLabel", {}).get("value", ""),
                "description": binding.get("itemDescription", {}).get("value", ""),
                "institution_type": binding.get("instTypeLabel", {}).get("value", ""),
                "alternative_names": [],
                "identifiers": {},
                "location": {},
                "temporal": {},
                "organizational": {},
                "collection": {},
                "media": {}
            }

        inst = institutions_by_qid[qid]

        # Collect alternative names
        alt_label = binding.get("itemAltLabel", {}).get("value", "")
        if alt_label and alt_label not in inst["alternative_names"]:
            inst["alternative_names"].append(alt_label)

        # Identifiers - comprehensive mapping (SPARQL variable -> LinkML field)
        identifier_mappings = {
            # Primary identifiers
            "isil": "ISIL",
            "viaf": "VIAF",
            # Contact
            "website": "website",
            "email": "email",
            "phone": "phone",
            # Archives & Libraries
            "archivesPortalEuropeID": "Archives_Portal_Europe_ID",
            "egaxaID": "EGAXA_ID",
            "archiveGridID": "ArchiveGrid_ID",
            "atomURL": "AtoM_URL",
            # Museums
            "museofileID": "Museofile_ID",
            "commonsInstitution": "Wikimedia_Commons_Institution",
            # Dutch heritage
            "rkdInstituteID": "RKD_Institute_ID",
            "rkdArtistsID": "RKDartists_ID",
            "rceMonumentID": "RCE_Monument_ID",
            "monumentsFlandersID": "Monuments_Flanders_ID",
            # Authority control
            "locID": "Library_of_Congress_ID",
            "gndID": "GND_ID",
            "bnfID": "BnF_ID",
            "librariesAustraliaID": "Libraries_Australia_ID",
            "nliID": "National_Library_Israel_ID",
            # Social media
            "twitter": "Twitter",
            "facebook": "Facebook",
            "instagram": "Instagram"
        }
        for sparql_field, linkml_field in identifier_mappings.items():
            if sparql_field in binding:
                inst["identifiers"][linkml_field] = binding[sparql_field]["value"]

        # Location - parse coordinates from Point(lon lat) format
        if "coords" in binding:
            coords_str = binding["coords"]["value"]
            if coords_str and coords_str.startswith("Point("):
                # Format: "Point(longitude latitude)"
                coords_inner = coords_str[6:-1]  # Remove "Point(" and ")"
                try:
                    lon_str, lat_str = coords_inner.split()
                    inst["location"]["latitude"] = float(lat_str)
                    inst["location"]["longitude"] = float(lon_str)
                except ValueError:
                    # Malformed coordinate literal: skip rather than abort the batch
                    pass
        if "streetAddress" in binding:
            inst["location"]["street_address"] = binding["streetAddress"]["value"]
        if "postalCode" in binding:
            inst["location"]["postal_code"] = binding["postalCode"]["value"]
        if "cityLabel" in binding:
            inst["location"]["city"] = binding["cityLabel"]["value"]
        if "regionLabel" in binding:
            inst["location"]["region"] = binding["regionLabel"]["value"]

        # Temporal (keep only the date part of xsd:dateTime values)
        if "inception" in binding:
            inst["temporal"]["inception"] = binding["inception"]["value"].split("T")[0]
        if "foundingDate" in binding:
            inst["temporal"]["founding_date"] = binding["foundingDate"]["value"].split("T")[0]
        if "dissolved" in binding:
            inst["temporal"]["dissolved"] = binding["dissolved"]["value"].split("T")[0]

        # Organizational
        if "parentLabel" in binding:
            inst["organizational"]["parent"] = binding["parentLabel"]["value"]
        if "partOfLabel" in binding:
            inst["organizational"]["part_of"] = binding["partOfLabel"]["value"]

        # Collection
        if "collectionSize" in binding:
            try:
                inst["collection"]["size"] = int(binding["collectionSize"]["value"])
            except ValueError:
                # Non-integer quantity value: skip rather than abort the batch
                pass
        if "collectionType" in binding:
            if "types" not in inst["collection"]:
                inst["collection"]["types"] = []
            inst["collection"]["types"].append(binding["collectionType"]["value"])

        # Media
        if "image" in binding:
            inst["media"]["image"] = binding["image"]["value"]
        if "logo" in binding:
            inst["media"]["logo"] = binding["logo"]["value"]

    return list(institutions_by_qid.values())


def query_wikidata_country(
    sparql: SPARQLWrapper,
    country_code: str,
    country_info: dict[str, Any],
    dry_run: bool = False
) -> dict[str, Any]:
    """
    Query Wikidata for all heritage institutions in a specific country.

    Queries each institution type separately to avoid timeout issues.
    Implements pagination for large datasets. All pages for a type are
    accumulated and parsed together, so an institution whose result rows
    straddle a page boundary is still merged into one record.

    Args:
        sparql: Configured SPARQLWrapper pointed at the Wikidata endpoint
        country_code: ISO country code (e.g., "NL")
        country_info: Entry from COUNTRY_CONFIGS for this country
        dry_run: If True, only prepare queries; do not contact the endpoint

    Returns:
        Dictionary with extraction results and statistics
    """
    print(f"\n{'='*80}")
    print(f"{country_info['flag']} {country_info['name'].upper()} ({country_code})")
    print(f"{'='*80}\n")
    print(f"šŸ” Querying Wikidata for {country_info['name']} heritage institutions...")
    print(f" Languages: {country_info['languages']}")
    print(f" Wikidata QID: {country_info['qid']}\n")

    all_institutions = []
    type_stats = defaultdict(int)
    errors = []

    for type_key, type_info in INSTITUTION_TYPES.items():
        print(f" - {type_info['label']} ({type_info['qid']})...", end="", flush=True)

        try:
            # Pagination: fetch ALL results in batches of 1000
            offset = 0
            batch_size = 1000
            type_bindings: list[dict] = []

            if dry_run:
                # Prepare one query to validate parameters, but do not execute
                create_sparql_query(
                    country_qid=country_info['qid'],
                    inst_type_qid=type_info['qid'],
                    languages=country_info['languages'],
                    limit=batch_size,
                    offset=offset
                )
                print(" [DRY RUN - Query prepared]")
                continue

            while True:
                query = create_sparql_query(
                    country_qid=country_info['qid'],
                    inst_type_qid=type_info['qid'],
                    languages=country_info['languages'],
                    limit=batch_size,
                    offset=offset
                )

                sparql.setQuery(query)
                raw_results = sparql.query().convert()
                bindings = raw_results.get("results", {}).get("bindings", []) if isinstance(raw_results, dict) else []

                if not bindings:
                    # No more results
                    break

                type_bindings.extend(bindings)

                # If we got fewer results than batch_size, we've reached the end
                if len(bindings) < batch_size:
                    break

                offset += batch_size
                # Rate limiting between pagination requests
                time.sleep(1)

            # Parse all pages together so per-institution rows merge correctly
            type_institutions = parse_sparql_results(type_bindings)
            all_institutions.extend(type_institutions)
            type_stats[type_key] = len(type_institutions)
            print(f" āœ… {len(type_institutions)} found")

            # Rate limiting - be nice to Wikidata
            time.sleep(2)

        except SPARQLExceptions.EndPointInternalError:
            print(" āŒ Timeout (query too complex)")
            errors.append(f"{type_key}: Endpoint timeout")
        except Exception as e:
            print(f" āŒ Error: {e}")
            errors.append(f"{type_key}: {str(e)}")

    print(f"\nāœ… Total institutions extracted: {len(all_institutions):,}")
    print("šŸ“Š By type:")
    for type_key, count in type_stats.items():
        if count > 0:
            print(f" {INSTITUTION_TYPES[type_key]['label']}: {count:,}")

    if errors:
        print(f"\nāš ļø Errors encountered: {len(errors)}")
        for error in errors[:5]:
            print(f" - {error}")

    return {
        "country_code": country_code,
        "country_name": country_info['name'],
        "country_qid": country_info['qid'],
        "extraction_date": datetime.now(timezone.utc).isoformat(),
        "total_institutions": len(all_institutions),
        "institution_types": dict(type_stats),
        "institutions": all_institutions,
        "errors": errors
    }


def save_results(results: dict[str, Any], output_dir: Path) -> Path:
    """
    Save extraction results to JSON file.

    File path: data/wikidata/{country_code}/{timestamp}.json

    Args:
        results: Extraction results (as produced by query_wikidata_country)
        output_dir: Base output directory (per-country subdir is created)

    Returns:
        Path to saved file
    """
    country_code = results['country_code']
    country_dir = output_dir / country_code.lower()
    country_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    output_file = country_dir / f"{timestamp}.json"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return output_file


def main():
    """CLI entry point: parse arguments, select countries, run extraction."""
    parser = argparse.ArgumentParser(
        description="Extract heritage institutions from Wikidata globally using SPARQL",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Extract priority 1 countries (Netherlands, Chile, Belgium)
  %(prog)s --priority 1

  # Extract specific countries
  %(prog)s --countries NL CL BE IT FR DE

  # Extract all countries (use with caution - 205+ countries)
  %(prog)s --all-countries

  # Dry run (preview what would be extracted)
  %(prog)s --priority 1 --dry-run

  # Resume extraction, skip countries with existing data
  %(prog)s --priority 2 --skip-existing
"""
    )
    parser.add_argument(
        '--countries', nargs='+', metavar='CODE',
        help='Country codes to process (e.g., NL CL BE IT)'
    )
    parser.add_argument(
        '--priority', nargs='+', type=int, metavar='N',
        help='Process countries by priority level (1-5)'
    )
    parser.add_argument(
        '--all-countries', action='store_true',
        help='Process all configured countries (use with caution)'
    )
    parser.add_argument(
        '--dry-run', action='store_true',
        help='Preview queries without executing them'
    )
    parser.add_argument(
        '--skip-existing', action='store_true',
        help='Skip countries that already have extraction data'
    )
    parser.add_argument(
        '--output-dir', type=Path,
        help='Output directory (default: data/wikidata/)'
    )

    args = parser.parse_args()

    # Determine countries to process (explicit list > priority > all > default)
    countries_to_process = []
    if args.countries:
        countries_to_process = args.countries
    elif args.priority:
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') in args.priority
        ]
    elif args.all_countries:
        countries_to_process = list(COUNTRY_CONFIGS.keys())
    else:
        # Default: Priority 1 countries
        countries_to_process = [
            code for code, info in COUNTRY_CONFIGS.items()
            if info.get('priority') == 1
        ]

    # Validate country codes
    invalid_countries = [c for c in countries_to_process if c not in COUNTRY_CONFIGS]
    if invalid_countries:
        print(f"āŒ Invalid country codes: {', '.join(invalid_countries)}")
        print(f" Valid codes: {', '.join(sorted(COUNTRY_CONFIGS.keys()))}")
        return 1

    # Output directory
    base_dir = Path(__file__).parent.parent
    output_dir = args.output_dir if args.output_dir else base_dir / "data" / "wikidata"
    output_dir.mkdir(parents=True, exist_ok=True)

    # Header
    print("="*80)
    print("šŸŒ GLOBAL WIKIDATA HERITAGE INSTITUTION EXTRACTOR")
    print("="*80)
    print(f"\nšŸ“‚ Output directory: {output_dir}")
    print(f"šŸŒ Countries to process: {len(countries_to_process)}")
    country_names = [f"{COUNTRY_CONFIGS[c]['flag']} {COUNTRY_CONFIGS[c]['name']}"
                     for c in countries_to_process]
    print(f" {', '.join(country_names)}\n")

    if args.dry_run:
        print("šŸ” DRY RUN MODE: No data will be extracted or saved\n")

    # Setup SPARQL endpoint client (POST avoids URL-length limits on big queries)
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setReturnFormat(SPARQL_JSON)
    sparql.setMethod('POST')
    sparql.addCustomHttpHeader("User-Agent", "GLAM-Extractor/0.2.0 (Global Wikidata Extraction)")

    # Process countries
    start_time = time.time()
    total_institutions = 0
    successful_countries = 0
    failed_countries = []

    for i, country_code in enumerate(countries_to_process, 1):
        country_info = COUNTRY_CONFIGS[country_code]

        # Skip if already extracted (optional)
        if args.skip_existing:
            country_dir = output_dir / country_code.lower()
            if country_dir.exists() and any(country_dir.glob("*.json")):
                print(f"\nā­ļø Skipping {country_info['name']} (already extracted)")
                continue

        try:
            results = query_wikidata_country(sparql, country_code, country_info,
                                             dry_run=args.dry_run)

            if not args.dry_run:
                output_file = save_results(results, output_dir)
                print(f"\nšŸ’¾ Saved to: {output_file}")

            total_institutions += results['total_institutions']
            successful_countries += 1

        except Exception as e:
            print(f"\nāŒ FAILED: {country_info['name']}: {e}")
            failed_countries.append(country_code)

        # Rate limiting between countries
        if i < len(countries_to_process):
            wait_time = 5
            print(f"\nāøļø Waiting {wait_time} seconds (Wikidata rate limiting)...\n")
            time.sleep(wait_time)

    # Final report
    elapsed_time = time.time() - start_time
    print("\n" + "="*80)
    print("šŸ“Š EXTRACTION COMPLETE")
    print("="*80)
    print(f"\nāœ… Successful countries: {successful_countries}/{len(countries_to_process)}")
    print(f"✨ Total institutions extracted: {total_institutions:,}")
    print(f"ā±ļø Total time: {elapsed_time/60:.1f} minutes")

    if failed_countries:
        print(f"\nāŒ Failed countries ({len(failed_countries)}):")
        for code in failed_countries:
            print(f" - {COUNTRY_CONFIGS[code]['name']} ({code})")

    if args.dry_run:
        print("\nšŸ” This was a dry run. Remove --dry-run to extract data.")

    print("="*80 + "\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())