#!/usr/bin/env python3
"""
Extract ISIL codes from Wikidata for Czech institutions.

During Wikidata enrichment, we queried ISIL codes but didn't systematically
extract them. This script queries Wikidata again for institutions that have
Q-numbers but are missing ISIL codes.

Process:
1. Load czech_unified.yaml (8,694 institutions)
2. Filter institutions WITH Wikidata Q-numbers BUT WITHOUT ISIL codes
3. Query Wikidata for ISIL codes (P791 property)
4. Add ISIL identifiers to records
5. Save to czech_unified_isil.yaml

Expected: 306 ISIL codes available in Wikidata (from previous SPARQL query)
"""

import yaml
import requests
from typing import List, Dict, Optional
from datetime import datetime, timezone

# Wikidata public SPARQL query service endpoint
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
|
|
|
|
def get_isil_codes_batch(qids: List[str]) -> Dict[str, str]:
    """
    Query Wikidata for ISIL codes (property P791) for a batch of Q-numbers.

    Args:
        qids: List of Wikidata Q-numbers (e.g., ['Q642884', 'Q1144653'])

    Returns:
        Dict mapping Q-number to ISIL code. Q-numbers with no ISIL code in
        Wikidata are simply absent from the result. Returns an empty dict on
        request or parse failure (best-effort: one failed batch must not
        abort the whole enrichment run).
    """
    if not qids:
        return {}

    # Inline the Q-numbers as a SPARQL VALUES clause. The qids come from our
    # own dataset (not untrusted input), so string interpolation is safe here.
    qid_values = " ".join(f"wd:{qid}" for qid in qids)

    query = f"""
    SELECT ?item ?isil WHERE {{
      VALUES ?item {{ {qid_values} }}
      ?item wdt:P791 ?isil .
    }}
    """

    headers = {
        'User-Agent': 'GLAM-Data-Extraction/0.2.0 (ISIL code enrichment)',
        'Accept': 'application/sparql-results+json'
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL,
            params={'query': query},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except requests.RequestException as e:
        # Network/HTTP failures: log and continue with an empty batch.
        print(f"Error querying Wikidata: {e}")
        return {}
    except ValueError as e:
        # response.json() raises ValueError on a malformed JSON body.
        print(f"Error querying Wikidata: {e}")
        return {}

    # Parse the SPARQL JSON results. Item URIs look like
    # http://www.wikidata.org/entity/Q642884 -> keep the trailing Q-number.
    # If an item has several ISIL values, the last binding wins (as before).
    isil_map = {}
    try:
        for binding in data['results']['bindings']:
            qid = binding['item']['value'].split('/')[-1]
            isil_map[qid] = binding['isil']['value']
    except KeyError as e:
        # Unexpected response shape — report which key was missing.
        print(f"Error querying Wikidata: missing key {e}")
        return {}

    return isil_map
|
|
|
|
def extract_isil_codes():
    """Main extraction workflow.

    Loads data/instances/czech_unified.yaml, finds institutions that have a
    Wikidata Q-number but no ISIL identifier, queries Wikidata for P791
    values in batches, attaches the found ISIL identifiers (with provenance),
    and writes the enriched dataset to data/instances/czech_unified_isil.yaml.
    """
    print("="*80)
    print("CZECH INSTITUTIONS - ISIL CODE EXTRACTION FROM WIKIDATA")
    print("="*80)
    print()

    # Load unified dataset
    print("Loading czech_unified.yaml...")
    with open('data/instances/czech_unified.yaml', 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"Loaded {len(institutions)} institutions")

    # Partition: count existing coverage, and collect institutions that have a
    # Wikidata Q-number but no ISIL identifier yet.
    needs_isil = []
    has_isil = 0
    has_wikidata = 0

    for inst in institutions:
        # Find the Wikidata Q-number, if any (first matching identifier wins).
        wikidata_qid = None
        for identifier in inst.get('identifiers', []):
            if identifier.get('identifier_scheme') == 'Wikidata':
                wikidata_qid = identifier.get('identifier_value')
                has_wikidata += 1
                break

        if not wikidata_qid:
            continue

        # Skip institutions that already carry an ISIL identifier.
        has_isil_code = any(
            i.get('identifier_scheme') == 'ISIL'
            for i in inst.get('identifiers', [])
        )

        if has_isil_code:
            has_isil += 1
        else:
            needs_isil.append({
                'institution': inst,
                'qid': wikidata_qid
            })

    print(f"Institutions with Wikidata Q-numbers: {has_wikidata}")
    print(f"Institutions with ISIL codes (before): {has_isil}")
    print(f"Institutions needing ISIL codes: {len(needs_isil)}")
    print()

    if not needs_isil:
        print("No institutions need ISIL codes. Exiting.")
        return

    # Query Wikidata in batches (50 Q-numbers at a time to avoid URL length limits)
    print("Querying Wikidata for ISIL codes...")
    batch_size = 50
    all_isil_codes = {}
    # Hoisted loop-invariant: total batch count for the progress message.
    num_batches = (len(needs_isil) - 1) // batch_size + 1

    for i in range(0, len(needs_isil), batch_size):
        batch = needs_isil[i:i+batch_size]
        qids = [item['qid'] for item in batch]

        print(f"  Querying batch {i//batch_size + 1}/{num_batches} ({len(qids)} Q-numbers)...")

        batch_isil = get_isil_codes_batch(qids)
        all_isil_codes.update(batch_isil)

    print(f"\nFound {len(all_isil_codes)} ISIL codes in Wikidata")
    print()

    # Attach ISIL identifiers and record provenance for each resolved code.
    added_count = 0

    for item in needs_isil:
        inst = item['institution']
        isil_code = all_isil_codes.get(item['qid'])

        if isil_code is None:
            continue

        # Add ISIL identifier (create the list if the record lacks one).
        inst.setdefault('identifiers', []).append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil_code,
            'identifier_url': f"https://isil.org/{isil_code}"
        })

        # Update provenance. BUG FIX: the original did
        # inst['provenance']['enrichment_history'], raising KeyError for any
        # record without a 'provenance' key — setdefault covers both levels.
        provenance = inst.setdefault('provenance', {})
        provenance.setdefault('enrichment_history', []).append({
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata ISIL code extraction (P791 property)',
            'match_score': 100.0,  # Direct extraction, no fuzzy matching
            'verified': True
        })

        added_count += 1

    print(f"✅ Added {added_count} ISIL codes to institutions")
    print()

    # Coverage statistics
    final_isil_count = has_isil + added_count
    total = len(institutions)

    print("="*80)
    print("ISIL CODE COVERAGE (BEFORE → AFTER)")
    print("="*80)
    print(f"Before: {has_isil:5} / {total} ({has_isil/total*100:5.1f}%)")
    print(f"After: {final_isil_count:5} / {total} ({final_isil_count/total*100:5.1f}%)")
    print(f"Increase: +{added_count} ISIL codes (+{(final_isil_count-has_isil)/total*100:.1f}% coverage)")
    print()

    # Save enriched dataset
    output_path = 'data/instances/czech_unified_isil.yaml'
    print(f"Saving enriched dataset to {output_path}...")

    with open(output_path, 'w', encoding='utf-8') as f:
        yaml.dump(
            institutions,
            f,
            allow_unicode=True,
            sort_keys=False,
            default_flow_style=False,
            width=100
        )

    print(f"✅ Saved {len(institutions)} institutions")
    print()

    # Sample institutions with ISIL codes (first 20)
    print("="*80)
    print("SAMPLE INSTITUTIONS WITH ISIL CODES")
    print("="*80)

    sample_count = 0
    for inst in institutions:
        isil_codes = [
            i.get('identifier_value')
            for i in inst.get('identifiers', [])
            if i.get('identifier_scheme') == 'ISIL'
        ]

        if isil_codes:
            print(f"{inst['name'][:60]:60} | ISIL: {isil_codes[0]}")
            sample_count += 1

        if sample_count >= 20:
            break

    print()
    print("="*80)
    print("EXTRACTION COMPLETE")
    print("="*80)
    print(f"Total institutions: {len(institutions)}")
    print(f"With ISIL codes: {final_isil_count} ({final_isil_count/total*100:.1f}%)")
    print(f"With Wikidata Q-numbers: {has_wikidata} ({has_wikidata/total*100:.1f}%)")
    # Hoisted: the original evaluated this generator-sum twice in one f-string.
    gps_count = sum(
        1 for i in institutions
        if i.get('locations') and i['locations'][0].get('latitude')
    )
    print(f"With GPS coordinates: {gps_count} ({gps_count/total*100:.1f}%)")
    print()
    print("Next step: Replace czech_unified.yaml with czech_unified_isil.yaml")
|
|
|
|
# Script entry point: run the full extraction workflow when executed directly.
if __name__ == '__main__':
    extract_isil_codes()