238 lines
8.5 KiB
Python
Executable file
238 lines
8.5 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Query Wikidata for major Brazilian GLAM institutions and compare with our dataset.
|
|
|
|
This helps us understand if the low Brazilian coverage (1%) is due to:
|
|
1. Extraction issue (we missed them in conversations)
|
|
2. Wikidata gap (institutions need to be added to Wikidata)
|
|
3. Dataset composition (we focus on different institutions)
|
|
"""
|
|
|
|
import sys
from collections import defaultdict
from datetime import date
from pathlib import Path

import requests
import yaml
|
|
|
|
# Public Wikidata Query Service SPARQL endpoint used for all queries below.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# SPARQL query for major Brazilian GLAM institutions.
# Simplified to avoid timeout - direct instance matching only (wdt:P31 without
# subclass traversal), restricted to country = Brazil (wd:Q155), capped at
# 200 rows, with labels resolved in Portuguese first, then English.
BRAZILIAN_GLAM_QUERY = """
SELECT ?item ?itemLabel ?typeLabel ?cityLabel ?isil ?viaf ?website WHERE {
  VALUES ?type {
    wd:Q33506     # museum
    wd:Q7075      # library
    wd:Q166118    # archive
    wd:Q207694    # art gallery
    wd:Q1030034   # national library
    wd:Q637519    # university library
    wd:Q7210356   # public library
    wd:Q7840289   # national archive
  }

  ?item wdt:P31 ?type .    # Direct instance, no subclass traversal
  ?item wdt:P17 wd:Q155 .  # country = Brazil

  OPTIONAL { ?item wdt:P131 ?city }
  OPTIONAL { ?item wdt:P791 ?isil }
  OPTIONAL { ?item wdt:P214 ?viaf }
  OPTIONAL { ?item wdt:P856 ?website }

  SERVICE wikibase:label { bd:serviceParam wikibase:language "pt,en" }
}
ORDER BY ?itemLabel
LIMIT 200
"""
|
|
|
|
def query_wikidata(sparql_query: str) -> list:
    """Execute a SPARQL query against the public Wikidata endpoint.

    Args:
        sparql_query: SPARQL query text to run.

    Returns:
        The list of result bindings (``results.bindings`` from the JSON
        response), or an empty list if the request fails or the response
        cannot be parsed.
    """
    headers = {
        # Wikimedia's API etiquette requires a descriptive User-Agent.
        "User-Agent": "GLAM-Data-Extraction/1.0 (https://github.com/yourusername/glam; contact@example.com)",
        "Accept": "application/json",
    }

    try:
        response = requests.get(
            WIKIDATA_SPARQL_ENDPOINT,
            params={"query": sparql_query, "format": "json"},
            headers=headers,
            timeout=60,  # complex SPARQL queries can be slow
        )
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as e:
        # Connection error, timeout, or non-2xx HTTP status.
        print(f"❌ Wikidata query failed: {e}")
        return []
    except ValueError as e:
        # Response body was not valid JSON.
        print(f"❌ Wikidata query failed: {e}")
        return []

    try:
        return payload["results"]["bindings"]
    except KeyError as e:
        # Unexpected response shape (missing "results"/"bindings").
        print(f"❌ Wikidata query failed: {e}")
        return []
|
|
|
|
def load_our_brazilian_institutions() -> list:
    """Load Brazilian institutions from our enriched dataset.

    Reads the Wikidata-enriched YAML dump and keeps only institutions whose
    first listed location has country code ``BR``.

    Returns:
        A list of institution dicts (possibly empty). Exits the process with
        status 1 if the input file does not exist.
    """
    input_file = Path("data/instances/global/global_heritage_institutions_wikidata_enriched.yaml")

    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    with open(input_file, 'r', encoding='utf-8') as f:
        # An empty YAML document parses to None; normalize to an empty list
        # so the comprehension below never raises TypeError.
        institutions = yaml.safe_load(f) or []

    # Filter Brazilian institutions. Only the first location is checked —
    # the dataset records the primary location first.
    brazilian = [
        inst for inst in institutions
        if inst.get('locations') and
        inst['locations'][0].get('country') == 'BR'
    ]

    return brazilian
|
|
|
|
def _partition_results(wikidata_results: list, our_names: set) -> tuple:
    """Split Wikidata rows into (in_both, in_wikidata_not_ours) entry dicts.

    A Wikidata institution counts as "in both" when its label and one of our
    dataset names contain each other (case-insensitive substring match).
    """
    in_wikidata_not_ours = []
    in_both = []

    for result in wikidata_results:
        name = result.get("itemLabel", {}).get("value", "")
        entry = {
            "name": name,
            # Entity URI ends in the QID, e.g. .../entity/Q123 -> Q123.
            "qid": result.get("item", {}).get("value", "").split("/")[-1],
            "type": result.get("typeLabel", {}).get("value", ""),
            "city": result.get("cityLabel", {}).get("value", "N/A"),
            "isil": result.get("isil", {}).get("value", ""),
            "viaf": result.get("viaf", {}).get("value", ""),
            "website": result.get("website", {}).get("value", ""),
        }

        # Check if in our dataset (fuzzy match: containment either way).
        name_lower = name.lower()
        found_in_ours = any(
            name_lower in our_name or our_name in name_lower
            for our_name in our_names
        )

        if found_in_ours:
            in_both.append(entry)
        else:
            in_wikidata_not_ours.append(entry)

    return in_both, in_wikidata_not_ours


def _report_wikidata_only(in_wikidata_not_ours: list) -> None:
    """Print institutions found in Wikidata but missing from our dataset."""
    if not in_wikidata_not_ours:
        return

    print(f"🔍 Found {len(in_wikidata_not_ours)} institutions in Wikidata NOT in our dataset:")
    print()

    # Group by type for a readable report.
    by_type = defaultdict(list)
    for inst in in_wikidata_not_ours:
        by_type[inst['type']].append(inst)

    for inst_type, institutions in sorted(by_type.items()):
        print(f"\n📚 {inst_type} ({len(institutions)}):")
        print("-" * 80)

        for inst in institutions[:10]:  # Show first 10 per type
            print(f"  • {inst['name']} ({inst['qid']})")
            print(f"    City: {inst['city']}")
            if inst['isil']:
                print(f"    ISIL: {inst['isil']}")
            if inst['website']:
                print(f"    Website: {inst['website']}")
            print()

        if len(institutions) > 10:
            print(f"  ... and {len(institutions) - 10} more")


def _print_recommendations(our_count: int, in_both_count: int,
                           wikidata_only_count: int) -> None:
    """Print next-step recommendations based on the overlap counts."""
    print("\n" + "=" * 80)
    print("💡 RECOMMENDATIONS")
    print("=" * 80)

    if wikidata_only_count > 50:
        print("\n✅ GOOD NEWS: Wikidata has many major Brazilian institutions!")
        print(f"   → {wikidata_only_count} institutions could be added to our dataset")
        print("\n🎯 Next steps:")
        print("   1. Review institutions in conversations - may have missed extraction")
        print("   2. Consider adding major institutions to KNOWN_INSTITUTIONS dict")
        print("   3. Re-run fuzzy matching with adjusted threshold (0.75 instead of 0.85)")

    if our_count - in_both_count > 50:
        print("\n🌱 OPPORTUNITY: Our dataset has many regional institutions not in Wikidata")
        print(f"   → {our_count - in_both_count} institutions could be added to Wikidata")
        print("\n🎯 Consider contributing to Wikidata:")
        print("   1. Create entries for regional museums and archives")
        print("   2. Enrich with ISIL codes, VIAF IDs, and website URLs")
        print("   3. Benefits entire Linked Open Data ecosystem")


def _save_comparison(our_count: int, wikidata_count: int,
                     in_both: list, in_wikidata_not_ours: list) -> None:
    """Dump the detailed comparison to a YAML file for manual review."""
    output_file = Path("data/analysis/brazilian_institutions_wikidata_comparison.yaml")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    comparison_data = {
        # BUGFIX: was a hard-coded date string; record the actual run date.
        "analysis_date": date.today().isoformat(),
        "our_dataset_count": our_count,
        "wikidata_count": wikidata_count,
        "in_both_count": len(in_both),
        "in_wikidata_only": in_wikidata_not_ours,
        "in_both": in_both,
    }

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(comparison_data, f, allow_unicode=True, sort_keys=False)

    print(f"\n💾 Detailed comparison saved to: {output_file}")


def main():
    """Compare Brazilian GLAM coverage between Wikidata and our dataset."""
    print("=" * 80)
    print("🇧🇷 BRAZILIAN GLAM INSTITUTIONS - WIKIDATA vs OUR DATASET")
    print("=" * 80)
    print()

    # Load our Brazilian institutions
    print("📖 Loading our Brazilian institutions...")
    our_institutions = load_our_brazilian_institutions()
    print(f"✅ Found {len(our_institutions)} Brazilian institutions in our dataset\n")

    # Create index by name for quick lookup.
    # BUGFIX: drop empty names — "" is a substring of every string, so a
    # single unnamed institution would make the fuzzy match claim every
    # Wikidata row as already present in our dataset.
    our_names = {
        name
        for name in (inst.get('name', '').lower() for inst in our_institutions)
        if name
    }

    # Query Wikidata
    print("🌐 Querying Wikidata for major Brazilian GLAM institutions...")
    print("   (This may take 30-60 seconds...)\n")

    wikidata_results = query_wikidata(BRAZILIAN_GLAM_QUERY)

    if not wikidata_results:
        print("❌ No results from Wikidata. Check your internet connection.")
        return

    print(f"✅ Found {len(wikidata_results)} Brazilian institutions in Wikidata\n")

    # Analyze matches
    print("=" * 80)
    print("📊 ANALYSIS: WIKIDATA INSTITUTIONS NOT IN OUR DATASET")
    print("=" * 80)
    print()

    in_both, in_wikidata_not_ours = _partition_results(wikidata_results, our_names)

    # Report institutions in Wikidata but not in our dataset
    _report_wikidata_only(in_wikidata_not_ours)

    # Report institutions in both
    if in_both:
        print("\n" + "=" * 80)
        print(f"✅ Found {len(in_both)} institutions in BOTH Wikidata and our dataset")
        print("=" * 80)
        for inst in in_both[:5]:
            print(f"  • {inst['name']} ({inst['qid']})")

    # Summary statistics
    print("\n" + "=" * 80)
    print("📊 SUMMARY")
    print("=" * 80)
    print(f"Our dataset: {len(our_institutions)} Brazilian institutions")
    print(f"Wikidata: {len(wikidata_results)} Brazilian institutions")
    print(f"In both: {len(in_both)} institutions")
    print(f"In Wikidata only: {len(in_wikidata_not_ours)} institutions")
    print(f"In our dataset only: {len(our_institutions) - len(in_both)} institutions")

    # Recommendation
    _print_recommendations(len(our_institutions), len(in_both),
                           len(in_wikidata_not_ours))

    # Save results for manual review
    _save_comparison(len(our_institutions), len(wikidata_results),
                     in_both, in_wikidata_not_ours)
|
|
|
|
# Script entry point: run the comparison only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|