428 lines
17 KiB
Python
428 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Bulgarian ISIL Registry Enrichment Script
|
|
Enriches Bulgarian institutions with OSM, Wikidata, and VIAF data.
|
|
"""
|
|
|
|
import json
import os
import time
from datetime import datetime, timezone
from typing import List, Dict, Optional

import requests
import yaml
from rapidfuzz import fuzz
|
|
|
|
def fetch_osm_libraries() -> List[Dict]:
    """Fetch Bulgarian libraries, archives, and museums from OpenStreetMap.

    Queries the Overpass API for ``amenity=library``, ``amenity=archive``,
    and ``tourism=museum`` elements inside Bulgaria and flattens them into
    plain dicts.

    Returns:
        A list of dicts with name, coordinates, address, website, wikidata
        tag, and OSM id.  Returns an empty list if the request fails
        (best-effort: the caller can proceed without OSM data).
    """
    print("Fetching OSM data for Bulgaria...")

    overpass_url = "https://overpass-api.de/api/interpreter"
    # NOTE: ``out center;`` makes Overpass attach a computed ``center``
    # point to ways and relations.  The previous ``out body; >; out skel qt;``
    # form never produced a ``center`` member, so the parsing loop below
    # silently dropped every way/relation (only nodes were kept).
    query = """
    [out:json][timeout:90];
    area["ISO3166-1"="BG"][admin_level=2]->.searchArea;
    (
      node["amenity"="library"](area.searchArea);
      way["amenity"="library"](area.searchArea);
      relation["amenity"="library"](area.searchArea);
      node["amenity"="archive"](area.searchArea);
      way["amenity"="archive"](area.searchArea);
      node["tourism"="museum"](area.searchArea);
      way["tourism"="museum"](area.searchArea);
    );
    out center;
    """

    try:
        response = requests.post(overpass_url, data={'data': query}, timeout=120)
        response.raise_for_status()
        data = response.json()

        libraries = []
        for element in data.get('elements', []):
            # Nodes carry lat/lon directly; ways and relations carry a
            # synthesized 'center' (relations were queried but previously
            # excluded by the type check — include them too).
            is_node = element.get('type') == 'node'
            has_center = element.get('type') in ('way', 'relation') and 'center' in element
            if not (is_node or has_center):
                continue

            tags = element.get('tags', {})
            name = tags.get('name') or tags.get('name:bg') or tags.get('name:en')
            if not name:
                continue

            lat = element.get('lat') or element.get('center', {}).get('lat')
            lon = element.get('lon') or element.get('center', {}).get('lon')

            libraries.append({
                'name': name,
                'name_bg': tags.get('name:bg'),
                'name_en': tags.get('name:en'),
                'latitude': lat,
                'longitude': lon,
                'city': tags.get('addr:city'),
                'street': tags.get('addr:street'),
                'website': tags.get('website') or tags.get('contact:website'),
                'wikidata': tags.get('wikidata'),
                'osm_id': element.get('id')
            })

        print(f"✓ Found {len(libraries)} locations in OSM")
        return libraries

    except Exception as e:
        # Best-effort: log and continue with no OSM data rather than abort.
        print(f"✗ OSM fetch failed: {e}")
        return []
|
|
|
|
def fetch_wikidata_institutions() -> List[Dict]:
    """Query Wikidata for Bulgarian heritage institutions.

    Runs a SPARQL query for instances/subclasses of library, archive,
    museum, gallery, etc. (the ``VALUES ?type`` list) located in Bulgaria
    (``wd:Q219``), with optional ISIL, VIAF, website, coordinates, and
    address.

    Returns:
        A list of dicts keyed by qid/label/alt_labels/isil/viaf/website/
        latitude/longitude/address.  Returns an empty list if the query
        fails (best-effort: the caller can proceed without Wikidata data).
    """
    print("Querying Wikidata for Bulgarian institutions...")

    endpoint = "https://query.wikidata.org/sparql"
    query = """
    SELECT DISTINCT ?item ?itemLabel ?itemAltLabel ?isil ?viaf ?website ?coords ?address WHERE {
      VALUES ?type {
        wd:Q7075 wd:Q166118 wd:Q1030034 wd:Q33506 wd:Q636400
        wd:Q212805 wd:Q213441 wd:Q1362225 wd:Q24398318
      }
      ?item wdt:P31/wdt:P279* ?type .
      ?item wdt:P17 wd:Q219 .

      OPTIONAL { ?item wdt:P791 ?isil }
      OPTIONAL { ?item wdt:P214 ?viaf }
      OPTIONAL { ?item wdt:P856 ?website }
      OPTIONAL { ?item wdt:P625 ?coords }
      OPTIONAL { ?item wdt:P6375 ?address }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "bg,en,ru".
        ?item rdfs:label ?itemLabel .
        ?item skos:altLabel ?itemAltLabel .
      }
    }
    """

    try:
        response = requests.get(
            endpoint,
            params={'query': query, 'format': 'json'},
            headers={'User-Agent': 'GLAM-Extractor/1.0'},
            timeout=60
        )
        response.raise_for_status()
        data = response.json()

        institutions = []
        for result in data['results']['bindings']:
            # Entity URI -> bare Q-id (last path segment).
            item_id = result['item']['value'].split('/')[-1]

            # Coordinates come as WKT: "Point(lon lat)" — note lon first.
            coords_str = result.get('coords', {}).get('value', '')
            lat, lon = None, None
            if coords_str and coords_str.startswith('Point('):
                parts = coords_str.replace('Point(', '').replace(')', '').split()
                if len(parts) == 2:
                    lon, lat = float(parts[0]), float(parts[1])

            # The label service joins alt labels with ", ", so a bare
            # split(',') left a leading space on every label after the
            # first, which degraded downstream fuzzy matching.  Strip and
            # drop empties.
            alt_raw = result.get('itemAltLabel', {}).get('value', '')
            alt_labels = [s.strip() for s in alt_raw.split(',') if s.strip()]

            institutions.append({
                'qid': item_id,
                'label': result.get('itemLabel', {}).get('value'),
                'alt_labels': alt_labels,
                'isil': result.get('isil', {}).get('value'),
                'viaf': result.get('viaf', {}).get('value'),
                'website': result.get('website', {}).get('value'),
                'latitude': lat,
                'longitude': lon,
                'address': result.get('address', {}).get('value')
            })

        print(f"✓ Found {len(institutions)} institutions in Wikidata")
        return institutions

    except Exception as e:
        # Best-effort: log and continue with no Wikidata data rather than abort.
        print(f"✗ Wikidata query failed: {e}")
        return []
|
|
|
|
def fuzzy_match_institution(inst_name: str, candidates: List[Dict], name_fields: List[str]) -> Optional[tuple]:
    """Fuzzy-match an institution name against a list of candidate records.

    For each candidate, every name found in ``name_fields`` (list-valued
    fields are flattened) is compared to ``inst_name`` using a
    token-sort ratio on lowercased, stripped text.

    Returns:
        ``(best_candidate, best_score)`` when the best score reaches the
        acceptance threshold of 75, otherwise ``None``.
    """
    target = inst_name.lower().strip()
    top_candidate = None
    top_score = 0

    for cand in candidates:
        # Gather every name variant this candidate offers.
        variants = []
        for field in name_fields:
            value = cand.get(field)
            if not value:
                continue
            variants.extend(value if isinstance(value, list) else [value])

        for variant in variants:
            if not variant:
                continue
            score = fuzz.token_sort_ratio(target, variant.lower().strip())
            if score > top_score:
                top_candidate, top_score = cand, score

    return (top_candidate, top_score) if top_score >= 75 else None
|
|
|
|
def enrich_institutions(base_file: str, osm_data: List[Dict], wikidata_data: List[Dict]) -> tuple:
    """Enrich Bulgarian institutions with external data.

    Matching strategy per institution:
      1. Exact ISIL match against Wikidata (score 100, high confidence).
      2. Fuzzy name match against Wikidata (threshold 75; >=85 counts as
         high confidence, 75-84 as medium).
      3. Fuzzy name match against OSM (fills gaps Wikidata left).

    Args:
        base_file: Path to the base YAML registry (a list of institution dicts).
        osm_data: Records from fetch_osm_libraries().
        wikidata_data: Records from fetch_wikidata_institutions().

    Returns:
        ``(enrichments, stats)`` — one enrichment record per matched
        institution, plus a dict of counters.
    """
    print(f"\nLoading base institutions from {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✓ Loaded {len(institutions)} institutions")
    print("\nEnriching institutions...")

    enrichments = []
    stats = {
        'total': len(institutions),
        'wikidata_matched': 0,
        'viaf_added': 0,
        'coords_added': 0,
        'website_added': 0,
        'osm_matched': 0,
        'high_confidence': 0,
        'medium_confidence': 0
    }

    for idx, inst in enumerate(institutions, 1):
        inst_name = inst.get('name', '')

        enrichment = {
            'id': inst.get('id'),
            'name': inst_name,
            'isil': next((i['identifier_value'] for i in inst.get('identifiers', []) if i.get('identifier_scheme') == 'ISIL'), None),
            'matches': []
        }

        # First existing location, guarding against a present-but-empty
        # list ("inst.get('locations', [{}])[0]" raised IndexError when
        # 'locations' existed but was []).
        first_location = (inst.get('locations') or [{}])[0]

        # Try Wikidata match by ISIL first (exact)
        inst_isil = enrichment['isil']
        wikidata_match = None
        match_score = 0

        if inst_isil:
            for wd in wikidata_data:
                if wd.get('isil') == inst_isil:
                    wikidata_match = wd
                    match_score = 100
                    enrichment['matches'].append({
                        'source': 'wikidata',
                        'match_type': 'isil_exact',
                        'score': 100,
                        'qid': wd['qid']
                    })
                    stats['wikidata_matched'] += 1
                    stats['high_confidence'] += 1
                    break

        # Fuzzy match by name if no ISIL match
        if not wikidata_match:
            result = fuzzy_match_institution(
                inst_name,
                wikidata_data,
                ['label', 'alt_labels']
            )
            if result:
                wikidata_match, match_score = result
                enrichment['matches'].append({
                    'source': 'wikidata',
                    'match_type': 'name_fuzzy',
                    'score': match_score,
                    'qid': wikidata_match['qid']
                })
                stats['wikidata_matched'] += 1
                if match_score >= 85:
                    stats['high_confidence'] += 1
                else:
                    stats['medium_confidence'] += 1

        # Add Wikidata enrichments (only fill fields the base record lacks)
        if wikidata_match:
            if wikidata_match.get('viaf'):
                enrichment['viaf'] = wikidata_match['viaf']
                stats['viaf_added'] += 1

            if wikidata_match.get('website') and not inst.get('homepage'):
                enrichment['website'] = wikidata_match['website']
                stats['website_added'] += 1

            if wikidata_match.get('latitude') and wikidata_match.get('longitude'):
                if not first_location.get('latitude'):
                    enrichment['latitude'] = wikidata_match['latitude']
                    enrichment['longitude'] = wikidata_match['longitude']
                    stats['coords_added'] += 1

        # Try OSM match
        osm_result = fuzzy_match_institution(
            inst_name,
            osm_data,
            ['name', 'name_bg', 'name_en']
        )

        if osm_result:
            osm_match, osm_score = osm_result
            enrichment['matches'].append({
                'source': 'osm',
                'match_type': 'name_fuzzy',
                'score': osm_score,
                'osm_id': osm_match['osm_id']
            })
            stats['osm_matched'] += 1

            # Add OSM data only where neither the base record nor the
            # Wikidata match already supplied it.
            if osm_match.get('latitude') and osm_match.get('longitude'):
                if not first_location.get('latitude') and 'latitude' not in enrichment:
                    enrichment['latitude'] = osm_match['latitude']
                    enrichment['longitude'] = osm_match['longitude']
                    stats['coords_added'] += 1

            if osm_match.get('website') and not inst.get('homepage') and 'website' not in enrichment:
                enrichment['website'] = osm_match['website']
                stats['website_added'] += 1

        if enrichment['matches']:
            enrichments.append(enrichment)

        if idx % 10 == 0:
            print(f"  Processed {idx}/{stats['total']} institutions...")

    # Guard the percentage prints against an empty registry
    # (stats['total'] == 0 previously raised ZeroDivisionError).
    denom = stats['total'] or 1

    print(f"\n✓ Enrichment complete")
    print(f"\nStatistics:")
    print(f"  Total institutions: {stats['total']}")
    print(f"  Wikidata matches: {stats['wikidata_matched']} ({stats['wikidata_matched']/denom*100:.1f}%)")
    print(f"    - High confidence (≥85%): {stats['high_confidence']}")
    print(f"    - Medium confidence (75-84%): {stats['medium_confidence']}")
    print(f"  OSM matches: {stats['osm_matched']} ({stats['osm_matched']/denom*100:.1f}%)")
    print(f"  VIAF IDs added: {stats['viaf_added']}")
    print(f"  Coordinates added: {stats['coords_added']}")
    print(f"  Websites added: {stats['website_added']}")
    print(f"  Total enriched: {len(enrichments)} ({len(enrichments)/denom*100:.1f}%)")

    return enrichments, stats
|
|
|
|
def apply_enrichments(base_file: str, enrichments: List[Dict], output_file: str):
    """Apply enrichments to the base YAML registry and save the result.

    For every enriched institution, adds Wikidata/VIAF/Website identifiers
    (skipping schemes already present), fills in missing coordinates, and
    sets a homepage when the record has none.

    Args:
        base_file: Path to the base YAML registry.
        enrichments: Records produced by enrich_institutions().
        output_file: Destination path for the enriched YAML.
    """
    print(f"\nApplying enrichments to {base_file}...")

    with open(base_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    enrichment_map = {e['id']: e for e in enrichments}

    enriched_count = 0
    for inst in institutions:
        inst_id = inst.get('id')
        if inst_id not in enrichment_map:
            continue
        enrich_data = enrichment_map[inst_id]

        # Add Wikidata ID (from the enrichment itself or its first match
        # carrying a qid; defaulted next() avoids StopIteration).
        qid = enrich_data.get('qid') or next(
            (m['qid'] for m in enrich_data.get('matches', []) if m.get('qid')), None)
        if qid:
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'Wikidata' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Wikidata',
                    'identifier_value': qid,
                    'identifier_url': f'https://www.wikidata.org/wiki/{qid}'
                })

        # Add VIAF ID
        if 'viaf' in enrich_data:
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'VIAF' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'VIAF',
                    'identifier_value': enrich_data['viaf'],
                    'identifier_url': f'https://viaf.org/viaf/{enrich_data["viaf"]}'
                })

        # Add coordinates
        if 'latitude' in enrich_data and 'longitude' in enrich_data:
            locations = inst.setdefault('locations', [{}])
            if not locations:
                # setdefault returns an existing empty list unchanged;
                # indexing it raised IndexError before this guard.
                locations.append({})
            if not locations[0].get('latitude'):
                locations[0]['latitude'] = enrich_data['latitude']
                locations[0]['longitude'] = enrich_data['longitude']

        # Add website
        if 'website' in enrich_data and not inst.get('homepage'):
            inst['homepage'] = enrich_data['website']
            identifiers = inst.setdefault('identifiers', [])
            if not any(i.get('identifier_scheme') == 'Website' for i in identifiers):
                identifiers.append({
                    'identifier_scheme': 'Website',
                    'identifier_value': enrich_data['website'],
                    'identifier_url': enrich_data['website']
                })

        enriched_count += 1

    # Save enriched YAML with a provenance header.  (The previous version
    # also built an unused "output_data" metadata dict; the header comments
    # below carry the same information, so the dead dict was removed.)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("---\n")
        f.write(f"# Bulgarian ISIL Registry - Enriched\n")
        f.write(f"# Generated: {datetime.now(timezone.utc).isoformat()}\n")
        f.write(f"# Enriched: {enriched_count}/{len(institutions)} institutions\n")
        f.write(f"# Source: {base_file}\n\n")
        yaml.dump(institutions, f, allow_unicode=True, default_flow_style=False, sort_keys=False)

    print(f"✓ Saved enriched data to {output_file}")
    print(f"  Enriched {enriched_count}/{len(institutions)} institutions")
|
|
|
|
def main():
    """Run the full enrichment pipeline: fetch OSM, fetch Wikidata, match, apply."""
    print("=" * 70)
    print("Bulgarian ISIL Registry Enrichment")
    print("=" * 70)

    base_file = "data/instances/bulgaria_isil_libraries.yaml"
    output_file = "data/instances/bulgaria_complete.yaml"
    work_dir = "data/isil/bulgaria"

    # Ensure the intermediate-output directory exists; previously a fresh
    # checkout crashed with FileNotFoundError on the first open() below.
    os.makedirs(work_dir, exist_ok=True)

    # Step 1: Fetch OSM data
    osm_data = fetch_osm_libraries()
    osm_path = f"{work_dir}/bulgaria_osm_libraries.json"
    with open(osm_path, 'w', encoding='utf-8') as f:
        json.dump(osm_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved OSM data to {osm_path}")

    # Step 2: Fetch Wikidata (brief pause between the two public APIs)
    time.sleep(2)  # Rate limiting
    wikidata_data = fetch_wikidata_institutions()
    wikidata_path = f"{work_dir}/bulgaria_wikidata_institutions.json"
    with open(wikidata_path, 'w', encoding='utf-8') as f:
        json.dump(wikidata_data, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved Wikidata data to {wikidata_path}")

    # Step 3: Enrich
    enrichments, stats = enrich_institutions(base_file, osm_data, wikidata_data)
    enrichments_path = f"{work_dir}/bulgaria_enrichments.json"
    with open(enrichments_path, 'w', encoding='utf-8') as f:
        json.dump({
            'enrichments': enrichments,
            'stats': stats,
            'generated': datetime.now(timezone.utc).isoformat()
        }, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved enrichment data to {enrichments_path}")

    # Step 4: Apply enrichments
    apply_enrichments(base_file, enrichments, output_file)

    print("\n" + "=" * 70)
    print("✓ Bulgarian ISIL enrichment complete!")
    print("=" * 70)


if __name__ == '__main__':
    main()
|