glam/archive/scripts/brazil/query_major_brazilian_institutions.py
#!/usr/bin/env python3
"""
Query Wikidata for major Brazilian GLAM institutions and compare with our dataset.
This helps us understand if the low Brazilian coverage (1%) is due to:
1. Extraction issue (we missed them in conversations)
2. Wikidata gap (institutions need to be added to Wikidata)
3. Dataset composition (we focus on different institutions)
"""
import sys
from collections import defaultdict
from datetime import date
from pathlib import Path

import requests
import yaml

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# SPARQL query for major Brazilian GLAM institutions.
# Kept deliberately simple to avoid endpoint timeouts: direct instance
# matching only, no subclass traversal.
BRAZILIAN_GLAM_QUERY = """
SELECT ?item ?itemLabel ?typeLabel ?cityLabel ?isil ?viaf ?website WHERE {
  VALUES ?type {
    wd:Q33506    # museum
    wd:Q7075     # library
    wd:Q166118   # archive
    wd:Q207694   # art gallery
    wd:Q1030034  # national library
    wd:Q637519   # university library
    wd:Q7210356  # public library
    wd:Q7840289  # national archive
  }
  ?item wdt:P31 ?type .    # direct instance, no subclass traversal
  ?item wdt:P17 wd:Q155 .  # country = Brazil
  OPTIONAL { ?item wdt:P131 ?city }
  OPTIONAL { ?item wdt:P791 ?isil }
  OPTIONAL { ?item wdt:P214 ?viaf }
  OPTIONAL { ?item wdt:P856 ?website }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "pt,en" }
}
ORDER BY ?itemLabel
LIMIT 200
"""


def query_wikidata(sparql_query: str) -> list:
    """Execute a SPARQL query against the Wikidata endpoint."""
    headers = {
        "User-Agent": "GLAM-Data-Extraction/1.0 (https://github.com/yourusername/glam; contact@example.com)",
        "Accept": "application/json",
    }
    try:
        response = requests.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()
        return response.json()["results"]["bindings"]
    except Exception as e:
        print(f"❌ Wikidata query failed: {e}")
        return []
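
# Hedged sketch: WDQS throttles heavy clients (HTTP 429). Since query_wikidata()
# already degrades to an empty list on failure, a thin retry wrapper
# (hypothetical helper, not wired into main() below) could look like:
#
#     import time
#
#     def query_wikidata_with_retry(sparql_query: str, attempts: int = 3) -> list:
#         for attempt in range(attempts):
#             results = query_wikidata(sparql_query)
#             if results:
#                 return results
#             time.sleep(5 * (attempt + 1))  # simple linear backoff
#         return []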


def load_our_brazilian_institutions() -> list:
    """Load Brazilian institutions from our dataset."""
    input_file = Path("data/instances/global/global_heritage_institutions_wikidata_enriched.yaml")
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    # Filter to institutions whose primary (first) location is in Brazil
    brazilian = [
        inst for inst in institutions
        if inst.get('locations') and
        inst['locations'][0].get('country') == 'BR'
    ]
    return brazilian
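
# Assumed record shape (inferred from the field accesses above; the actual
# schema lives in the enriched YAML file):
#
#     - name: Museu Nacional        # hypothetical example entry
#       locations:
#         - country: BR             # ISO 3166-1 alpha-2 code
#           city: Rio de Janeiro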


def main():
    print("=" * 80)
    print("🇧🇷 BRAZILIAN GLAM INSTITUTIONS - WIKIDATA vs OUR DATASET")
    print("=" * 80)
    print()

    # Load our Brazilian institutions
    print("📖 Loading our Brazilian institutions...")
    our_institutions = load_our_brazilian_institutions()
    print(f"✅ Found {len(our_institutions)} Brazilian institutions in our dataset\n")

    # Build a lowercase name set for quick fuzzy lookup (skip empty names,
    # which would otherwise substring-match everything below)
    our_names = {
        inst.get('name', '').lower()
        for inst in our_institutions
        if inst.get('name')
    }

    # Query Wikidata
    print("🌐 Querying Wikidata for major Brazilian GLAM institutions...")
    print(" (This may take 30-60 seconds...)\n")
    wikidata_results = query_wikidata(BRAZILIAN_GLAM_QUERY)
    if not wikidata_results:
        print("❌ No results from Wikidata. Check your internet connection.")
        return
    print(f"✅ Found {len(wikidata_results)} Brazilian institutions in Wikidata\n")

    # Analyze matches
    print("=" * 80)
    print("📊 ANALYSIS: WIKIDATA INSTITUTIONS NOT IN OUR DATASET")
    print("=" * 80)
    print()

    in_wikidata_not_ours = []
    in_both = []
    for result in wikidata_results:
        name = result.get("itemLabel", {}).get("value", "")
        qid = result.get("item", {}).get("value", "").split("/")[-1]
        inst_type = result.get("typeLabel", {}).get("value", "")
        city = result.get("cityLabel", {}).get("value", "N/A")
        isil = result.get("isil", {}).get("value", "")
        viaf = result.get("viaf", {}).get("value", "")
        website = result.get("website", {}).get("value", "")

        # Fuzzy match: bidirectional substring containment on lowercased names
        # (an empty label would match everything, so guard against it)
        name_lower = name.lower()
        found_in_ours = bool(name_lower) and any(
            name_lower in our_name or our_name in name_lower
            for our_name in our_names
        )

        entry = {
            "name": name,
            "qid": qid,
            "type": inst_type,
            "city": city,
            "isil": isil,
            "viaf": viaf,
            "website": website,
        }
        if found_in_ours:
            in_both.append(entry)
        else:
            in_wikidata_not_ours.append(entry)
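
    # The substring check above is deliberately crude. A ratio-based comparison
    # (sketch only, not wired in; the threshold echoes the 0.85/0.75 values in
    # the recommendations below) would tolerate word-order and accent variants:
    #
    #     from difflib import SequenceMatcher
    #
    #     def similar(a: str, b: str, threshold: float = 0.85) -> bool:
    #         return SequenceMatcher(None, a, b).ratio() >= threshold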

    # Report institutions in Wikidata but not in our dataset
    if in_wikidata_not_ours:
        print(f"🔍 Found {len(in_wikidata_not_ours)} institutions in Wikidata NOT in our dataset:")
        print()

        # Group by type
        by_type = defaultdict(list)
        for inst in in_wikidata_not_ours:
            by_type[inst['type']].append(inst)

        for inst_type, institutions in sorted(by_type.items()):
            print(f"\n📚 {inst_type} ({len(institutions)}):")
            print("-" * 80)
            for inst in institutions[:10]:  # Show at most 10 per type
                print(f"{inst['name']} ({inst['qid']})")
                print(f" City: {inst['city']}")
                if inst['isil']:
                    print(f" ISIL: {inst['isil']}")
                if inst['website']:
                    print(f" Website: {inst['website']}")
                print()
            if len(institutions) > 10:
                print(f" ... and {len(institutions) - 10} more")

    # Report institutions in both
    if in_both:
        print("\n" + "=" * 80)
        print(f"✅ Found {len(in_both)} institutions in BOTH Wikidata and our dataset")
        print("=" * 80)
        for inst in in_both[:5]:
            print(f"{inst['name']} ({inst['qid']})")

    # Summary statistics. Note: the fuzzy match can pair several Wikidata
    # entries with one of our records, so the last figure is approximate.
    print("\n" + "=" * 80)
    print("📊 SUMMARY")
    print("=" * 80)
    print(f"Our dataset: {len(our_institutions)} Brazilian institutions")
    print(f"Wikidata: {len(wikidata_results)} Brazilian institutions")
    print(f"In both: {len(in_both)} institutions")
    print(f"In Wikidata only: {len(in_wikidata_not_ours)} institutions")
    print(f"In our dataset only: {len(our_institutions) - len(in_both)} institutions")

    # Recommendations
    print("\n" + "=" * 80)
    print("💡 RECOMMENDATIONS")
    print("=" * 80)
    if len(in_wikidata_not_ours) > 50:
        print("\n✅ GOOD NEWS: Wikidata has many major Brazilian institutions!")
        print(f"{len(in_wikidata_not_ours)} institutions could be added to our dataset")
        print("\n🎯 Next steps:")
        print(" 1. Review institutions in conversations - we may have missed them during extraction")
        print(" 2. Consider adding major institutions to the KNOWN_INSTITUTIONS dict")
        print(" 3. Re-run fuzzy matching with a lower threshold (0.75 instead of 0.85)")
    if len(our_institutions) - len(in_both) > 50:
        print("\n🌱 OPPORTUNITY: Our dataset has many regional institutions not in Wikidata")
        print(f"{len(our_institutions) - len(in_both)} institutions could be added to Wikidata")
        print("\n🎯 Consider contributing to Wikidata:")
        print(" 1. Create entries for regional museums and archives")
        print(" 2. Enrich them with ISIL codes, VIAF IDs, and website URLs")
        print(" 3. This benefits the entire Linked Open Data ecosystem")

    # Save results for manual review
    output_file = Path("data/analysis/brazilian_institutions_wikidata_comparison.yaml")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    comparison_data = {
        "analysis_date": date.today().isoformat(),  # was hardcoded "2025-11-09"
        "our_dataset_count": len(our_institutions),
        "wikidata_count": len(wikidata_results),
        "in_both_count": len(in_both),
        "in_wikidata_only": in_wikidata_not_ours,
        "in_both": in_both,
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(comparison_data, f, allow_unicode=True, sort_keys=False)
    print(f"\n💾 Detailed comparison saved to: {output_file}")
if __name__ == "__main__":
main()