glam/enrich_brazil_batch17.py
2025-11-19 23:25:22 +01:00

202 lines
6.8 KiB
Python

#!/usr/bin/env python3
"""
Enrich Brazilian institutions - Batch 17 (Final push to 70%)
Target: 4 high-potential candidates to reach 70% coverage goal
Current: 85/126 (67.5%) → Goal: 88/126 (70%)
"""
import yaml
import requests
import time
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional
def query_wikidata(sparql_query: str) -> dict:
    """Execute a SPARQL query against the Wikidata Query Service.

    Args:
        sparql_query: Complete SPARQL query text.

    Returns:
        The parsed JSON response. On any request/HTTP/decode failure an
        empty result shell ({'results': {'bindings': []}}) is returned so
        callers can iterate bindings unconditionally.
    """
    endpoint = "https://query.wikidata.org/sparql"
    # Wikidata's usage policy asks for a descriptive User-Agent; generic
    # agents risk being throttled or blocked.
    headers = {'User-Agent': 'GLAMExtractor/1.0 (Heritage Institution Research)'}
    try:
        response = requests.get(
            endpoint,
            params={'query': sparql_query, 'format': 'json'},
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # Narrowed from bare `Exception`: catch only network/HTTP errors and
        # JSON decode failures (requests' JSONDecodeError subclasses
        # ValueError), so genuine programming errors still surface.
        print(f" ⚠️ Query error: {e}")
        return {'results': {'bindings': []}}
def search_brazilian_museum(name: str, city: str, region: str) -> Optional[dict]:
    """Search for a Brazilian museum in Wikidata.

    Tries exact Portuguese-label matches for the institution name (plus
    hand-curated alternative spellings for known problem cases), then falls
    back to a substring search restricted to museums in Brazil (wd:Q155).
    Fallback candidates are only printed for manual review, never returned.

    Args:
        name: Institution name as recorded in the dataset.
        city: City; used to disambiguate one alternative-pattern case and
            for logging.
        region: Brazilian state/region (logging only).

    Returns:
        Dict with keys q_id/label/description/viaf/isil for the first exact
        label match, or None when no exact match was found.
    """
    # Build search patterns
    search_patterns = [name]
    # Alternative patterns for specific cases
    if "Museu dos Povos Acreanos" in name:
        search_patterns.extend([
            "Museu dos Povos Acreanos",
            "Memorial dos Autonomistas"
        ])
    elif "Natural History Museum" in name and city == "Campina Grande":
        search_patterns.extend([
            "Museu de História Natural Campina Grande",
            "Museu de História Natural da Paraíba"
        ])
    elif "Museu Memória" in name:
        search_patterns.extend([
            "Museu da Memória Rondoniense",
            "Museu Memória Rondoniense"
        ])
    elif "MuseusBr" in name:
        search_patterns.extend([
            "Cadastro Nacional de Museus",
            "MuseusBR",
            "Sistema Brasileiro de Museus"  # Related platform
        ])
    print(f" 🔍 Searching Wikidata for: {name}")
    print(f" City: {city}, Region: {region}")
    for pattern in search_patterns:
        print(f" Trying pattern: {pattern}")
        # Escape double quotes so a pattern cannot break out of the SPARQL
        # string literal below.
        safe_pattern = pattern.replace('"', '\\"')
        # Exact Portuguese-label match on any museum (or subclass) item.
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
          ?item rdfs:label "{safe_pattern}"@pt .
          ?item wdt:P31/wdt:P279* wd:Q33506 . # instance of museum
          OPTIONAL {{ ?item wdt:P214 ?viaf }}
          OPTIONAL {{ ?item wdt:P791 ?isil }}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
        }}
        LIMIT 5
        """
        result = query_wikidata(query)
        bindings = result.get('results', {}).get('bindings', [])
        if bindings:
            item = bindings[0]
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')
            print(f" ✅ Found: {q_id} - {label}")
            print(f" Description: {desc}")
            return {
                'q_id': q_id,
                'label': label,
                'description': desc,
                'viaf': item.get('viaf', {}).get('value'),
                'isil': item.get('isil', {}).get('value')
            }
        time.sleep(1)  # Rate limit: be polite to the public SPARQL endpoint
    # Fallback: substring search over labels of Brazilian museums using the
    # first word of the name. Guard against empty/whitespace-only names,
    # which would previously raise IndexError on split()[0].
    tokens = name.lower().split()
    if not tokens:
        return None
    print(f" 🔎 Trying text search...")
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription WHERE {{
      ?item wdt:P31/wdt:P279* wd:Q33506 .
      ?item wdt:P17 wd:Q155 .
      ?item rdfs:label ?label .
      FILTER(CONTAINS(LCASE(?label), "{tokens[0]}"))
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
    }}
    LIMIT 10
    """
    result = query_wikidata(query)
    bindings = result.get('results', {}).get('bindings', [])
    if bindings:
        print(f" 📋 Found {len(bindings)} potential matches:")
        # Show at most 5 candidates for the human reviewer.
        for idx, item in enumerate(bindings[:5], 1):
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')
            print(f" {idx}. {q_id} - {label}")
            print(f" {desc}")
    return None
def enrich_institutions():
    """Enrich Batch 17 target institutions with Wikidata identifiers.

    Loads the batch-16 dataset, looks up each hard-coded target institution
    in Wikidata (skipping any that already carry a Wikidata identifier), and
    prints a summary. NOTE(review): matches are only counted and printed —
    nothing is written back to the YAML file; persisting identifiers is
    listed as a manual next step.
    """
    print("=" * 80)
    print("BRAZILIAN ENRICHMENT - BATCH 17")
    print("Goal: Reach 70% coverage (88/126 institutions)")
    print("=" * 80)
    print()
    # Load dataset
    yaml_path = Path('data/instances/all/globalglam-20251111-batch16-fixed.yaml')
    print(f"📂 Loading: {yaml_path.name}")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # Target institutions for this batch, keyed by dataset name.
    targets = {
        "Museu dos Povos Acreanos": {"city": "Rio Branco", "region": "ACRE"},
        "Natural History Museum": {"city": "Campina Grande", "region": "PARAÍBA"},
        "Museu Memória": {"city": "Porto Velho", "region": "RONDÔNIA"},
        "MuseusBr": {"city": "Brasília", "region": "DISTRITO FEDERAL"}  # National platform
    }
    # Derive the total from the dict (was hard-coded as 4 in the summary,
    # which would silently go stale if targets changed).
    total_targets = len(targets)
    enriched_count = 0
    for inst in institutions:
        name = inst.get('name', '')
        if name not in targets:
            continue  # guard clause replaces one level of nesting
        target_info = targets[name]
        print(f"\n{'=' * 80}")
        print(f"🎯 Target: {name}")
        print(f"{'=' * 80}")
        # Check if already has Wikidata
        identifiers = inst.get('identifiers', [])
        has_wikidata = any(id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in identifiers)
        if has_wikidata:
            print(" ⏭️ Already enriched, skipping")
            continue
        # Search Wikidata
        result = search_brazilian_museum(
            name,
            target_info['city'],
            target_info['region']
        )
        if result:
            print(f"\n ✅ MATCH FOUND: {result['q_id']}")
            enriched_count += 1
        else:
            print(f"\n ❌ NO MATCH: Manual research required")
    print()
    print("=" * 80)
    print("BATCH 17 ENRICHMENT SUMMARY")
    print("=" * 80)
    print(f"✅ Institutions enriched: {enriched_count}/{total_targets}")
    print(f"❌ Requiring manual research: {total_targets - enriched_count}/{total_targets}")
    print()
    print("📝 Next steps:")
    print(" 1. Review match quality for automated matches")
    print(" 2. Conduct manual Wikidata searches for unmatched institutions")
    print(" 3. Consider Portuguese Wikipedia as alternative source")
    print(" 4. Update dataset with verified identifiers")
    print()
# Script entry point: run the batch-17 enrichment only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    enrich_institutions()