202 lines
6.8 KiB
Python
202 lines
6.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Enrich Brazilian institutions - Batch 17 (Final push to 70%)
|
|
|
|
Target: 4 high-potential candidates to reach 70% coverage goal
|
|
Current: 85/126 (67.5%) → Goal: 88/126 (70%)
|
|
"""
|
|
|
|
import yaml
|
|
import requests
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
def query_wikidata(sparql_query: str) -> dict:
    """Run a SPARQL query against the public Wikidata endpoint.

    Best-effort wrapper: any failure (network, HTTP status, JSON decode)
    is printed and swallowed, and a result dict with an empty bindings
    list is returned so callers never need to branch on errors.

    Args:
        sparql_query: Complete SPARQL query text.

    Returns:
        Decoded JSON response from the endpoint, or an empty-bindings
        placeholder on any error.
    """
    endpoint_url = "https://query.wikidata.org/sparql"
    request_headers = {
        'User-Agent': 'GLAMExtractor/1.0 (Heritage Institution Research)',
    }
    request_params = {'query': sparql_query, 'format': 'json'}

    try:
        response = requests.get(
            endpoint_url,
            params=request_params,
            headers=request_headers,
            timeout=30,
        )
        response.raise_for_status()
        return response.json()
    except Exception as e:
        # Deliberately broad: the caller treats "no results" and
        # "query failed" the same way.
        print(f" ⚠️ Query error: {e}")
        return {'results': {'bindings': []}}
|
|
|
|
def search_brazilian_museum(name: str, city: str, region: str) -> Optional[dict]:
    """Search Wikidata for a Brazilian museum by name.

    Strategy: try an exact Portuguese-label match for *name* (plus
    hand-curated alias patterns for known problem institutions); if
    nothing matches, run a fuzzy substring search restricted to museums
    located in Brazil (wd:Q155) and print the candidates for manual
    review.

    Args:
        name: Institution name as recorded in the dataset.
        city: City of the institution (used for logging and to
            disambiguate one alias case).
        region: Region/state of the institution (logging only).

    Returns:
        Dict with keys q_id/label/description/viaf/isil for the first
        exact match, or None when no exact match was found (fuzzy
        candidates are printed, not returned).
    """

    def _sparql_escape(text: str) -> str:
        # Escape backslashes and double quotes so user-supplied names
        # cannot break out of the SPARQL string literal.
        return text.replace('\\', '\\\\').replace('"', '\\"')

    search_patterns = [name]

    # Hand-curated alternative names for institutions whose dataset name
    # does not match their Wikidata label.
    if "Museu dos Povos Acreanos" in name:
        search_patterns.extend([
            "Museu dos Povos Acreanos",
            "Memorial dos Autonomistas"
        ])
    elif "Natural History Museum" in name and city == "Campina Grande":
        search_patterns.extend([
            "Museu de História Natural Campina Grande",
            "Museu de História Natural da Paraíba"
        ])
    elif "Museu Memória" in name:
        search_patterns.extend([
            "Museu da Memória Rondoniense",
            "Museu Memória Rondoniense"
        ])
    elif "MuseusBr" in name:
        search_patterns.extend([
            "Cadastro Nacional de Museus",
            "MuseusBR",
            "Sistema Brasileiro de Museus"  # Related platform
        ])

    print(f" 🔍 Searching Wikidata for: {name}")
    print(f" City: {city}, Region: {region}")

    for pattern in search_patterns:
        print(f" Trying pattern: {pattern}")

        # Exact-label query: museum (or subclass) with this pt label.
        query = f"""
        SELECT DISTINCT ?item ?itemLabel ?itemDescription ?viaf ?isil WHERE {{
            ?item rdfs:label "{_sparql_escape(pattern)}"@pt .
            ?item wdt:P31/wdt:P279* wd:Q33506 . # instance of museum
            OPTIONAL {{ ?item wdt:P214 ?viaf }}
            OPTIONAL {{ ?item wdt:P791 ?isil }}
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
        }}
        LIMIT 5
        """

        result = query_wikidata(query)
        bindings = result.get('results', {}).get('bindings', [])

        if bindings:
            item = bindings[0]
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')

            print(f" ✅ Found: {q_id} - {label}")
            print(f" Description: {desc}")

            return {
                'q_id': q_id,
                'label': label,
                'description': desc,
                'viaf': item.get('viaf', {}).get('value'),
                'isil': item.get('isil', {}).get('value')
            }

        time.sleep(1)  # Rate limit between queries

    # Fallback: substring search over labels of museums in Brazil.
    print(" 🔎 Trying text search...")

    name_tokens = name.lower().split()
    if not name_tokens:
        # Empty / whitespace-only name: the original code would raise
        # IndexError here; there is nothing sensible to search for.
        return None

    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription WHERE {{
        ?item wdt:P31/wdt:P279* wd:Q33506 .
        ?item wdt:P17 wd:Q155 .
        ?item rdfs:label ?label .
        FILTER(CONTAINS(LCASE(?label), "{_sparql_escape(name_tokens[0])}"))
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "pt,en". }}
    }}
    LIMIT 10
    """

    result = query_wikidata(query)
    bindings = result.get('results', {}).get('bindings', [])

    if bindings:
        print(f" 📋 Found {len(bindings)} potential matches:")
        for idx, item in enumerate(bindings[:5], 1):
            q_id = item['item']['value'].split('/')[-1]
            label = item.get('itemLabel', {}).get('value', '')
            desc = item.get('itemDescription', {}).get('value', '')
            print(f" {idx}. {q_id} - {label}")
            print(f" {desc}")

    # No exact match; fuzzy candidates (if any) were printed above.
    return None
|
|
|
|
def enrich_institutions():
    """Enrich Batch 17 target institutions with Wikidata identifiers.

    Loads the batch-16 YAML dataset, looks up each hard-coded target
    institution in Wikidata via search_brazilian_museum(), and prints a
    summary. NOTE: matches are only reported, not written back to the
    dataset — updating the YAML is a documented manual follow-up step.
    """

    print("=" * 80)
    print("BRAZILIAN ENRICHMENT - BATCH 17")
    print("Goal: Reach 70% coverage (88/126 institutions)")
    print("=" * 80)
    print()

    # Load dataset
    yaml_path = Path('data/instances/all/globalglam-20251111-batch16-fixed.yaml')
    print(f"📂 Loading: {yaml_path.name}")

    with open(yaml_path, 'r', encoding='utf-8') as f:
        # safe_load returns None for an empty file; normalize to a list
        # so the loop below cannot crash.
        institutions = yaml.safe_load(f) or []

    # Target institutions for this batch, keyed by dataset name.
    targets = {
        "Museu dos Povos Acreanos": {"city": "Rio Branco", "region": "ACRE"},
        "Natural History Museum": {"city": "Campina Grande", "region": "PARAÍBA"},
        "Museu Memória": {"city": "Porto Velho", "region": "RONDÔNIA"},
        "MuseusBr": {"city": "Brasília", "region": "DISTRITO FEDERAL"}  # National platform
    }
    total_targets = len(targets)

    enriched_count = 0

    for inst in institutions:
        name = inst.get('name', '')

        if name in targets:
            target_info = targets[name]
            print(f"\n{'=' * 80}")
            print(f"🎯 Target: {name}")
            print(f"{'=' * 80}")

            # Skip institutions that already carry a Wikidata identifier.
            identifiers = inst.get('identifiers', [])
            has_wikidata = any(id_obj.get('identifier_scheme') == 'Wikidata' for id_obj in identifiers)

            if has_wikidata:
                print(" ⏭️ Already enriched, skipping")
                continue

            # Search Wikidata
            result = search_brazilian_museum(
                name,
                target_info['city'],
                target_info['region']
            )

            if result:
                print(f"\n ✅ MATCH FOUND: {result['q_id']}")
                enriched_count += 1
            else:
                print(f"\n ❌ NO MATCH: Manual research required")

            print()

    print("=" * 80)
    print("BATCH 17 ENRICHMENT SUMMARY")
    print("=" * 80)
    # Use len(targets) rather than a hard-coded 4 so the summary stays
    # correct if the target list changes.
    print(f"✅ Institutions enriched: {enriched_count}/{total_targets}")
    print(f"❌ Requiring manual research: {total_targets - enriched_count}/{total_targets}")
    print()
    print("📝 Next steps:")
    print(" 1. Review match quality for automated matches")
    print(" 2. Conduct manual Wikidata searches for unmatched institutions")
    print(" 3. Consider Portuguese Wikipedia as alternative source")
    print(" 4. Update dataset with verified identifiers")
    print()
|
|
# Script entry point: run the Batch 17 enrichment when executed directly.
if __name__ == '__main__':
    enrich_institutions()