# glam/scripts/extract_argentina_wikidata.py
# (file-listing metadata, commented out so the module parses:
#  last modified 2025-11-19 23:25:22 +01:00 — 178 lines, 5.3 KiB, Python)
#!/usr/bin/env python3
"""
Extract Argentine Heritage Institutions from Wikidata
This script uses the Wikidata SPARQL endpoint to extract Argentine heritage
institutions (libraries, archives, museums) that are documented in Wikidata
but may not have ISIL codes.
Output: data/isil/AR/argentina_wikidata_institutions.json
"""
import json
from datetime import datetime, timezone
from typing import List, Dict, Optional
# Wikidata SPARQL query results (from manual query - 200 institutions)
# This data was extracted using the wikidata-authenticated_execute_sparql MCP tool
# Query: Argentine heritage institutions (archives, libraries, museums)
# Date: 2025-11-18
WIKIDATA_RESULTS = [
# Results from SPARQL query would be pasted here
# For now, we'll structure the extraction from the query results above
]
def extract_institution_from_wikidata(result: Dict) -> Dict:
"""
Convert Wikidata SPARQL result to heritage custodian record.
Args:
result: SPARQL query result dictionary
Returns:
Heritage custodian record dict
"""
# Extract Q-number from Wikidata URI
q_number = result['item']['value'].split('/')[-1]
# Parse institution name
name = result.get('itemLabel', {}).get('value', 'Unknown')
lang = result.get('itemLabel', {}).get('xml:lang', 'es')
# Parse institution type
type_label = result.get('typeLabel', {}).get('value', '').lower()
institution_type = map_type_to_taxonomy(type_label)
# Parse location
city = result.get('cityLabel', {}).get('value')
# Parse coordinates
coords_wkt = result.get('coords', {}).get('value')
lat, lon = parse_coordinates(coords_wkt) if coords_wkt else (None, None)
# Parse identifiers
identifiers = []
# Wikidata Q-number
identifiers.append({
'identifier_scheme': 'Wikidata',
'identifier_value': q_number,
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
})
# VIAF
if 'viaf' in result:
identifiers.append({
'identifier_scheme': 'VIAF',
'identifier_value': result['viaf']['value'],
'identifier_url': f"https://viaf.org/viaf/{result['viaf']['value']}"
})
# Website
if 'website' in result:
identifiers.append({
'identifier_scheme': 'Website',
'identifier_value': result['website']['value'],
'identifier_url': result['website']['value']
})
# Build record
record = {
'name': name,
'name_language': lang,
'institution_type': institution_type,
'country': 'AR',
'locations': [],
'identifiers': identifiers,
'provenance': {
'data_source': 'WIKIDATA',
'data_tier': 'TIER_3_CROWD_SOURCED',
'extraction_date': datetime.now(timezone.utc).isoformat(),
'extraction_method': 'Wikidata SPARQL query for Argentine heritage institutions',
'confidence_score': 0.85,
'notes': f'Extracted from Wikidata entity {q_number}'
}
}
# Add location if available
if city or lat:
location = {
'city': city,
'country': 'AR'
}
if lat and lon:
location['latitude'] = lat
location['longitude'] = lon
record['locations'].append(location)
return record
def map_type_to_taxonomy(type_label: str) -> str:
"""Map Wikidata type labels to GLAMORCUBESFIXPHDNT taxonomy."""
type_mapping = {
'archivo': 'ARCHIVE',
'biblioteca': 'LIBRARY',
'museo': 'MUSEUM',
'exposición': 'MUSEUM', # Exhibitions are museum-related
'galería': 'GALLERY'
}
for key, value in type_mapping.items():
if key in type_label:
return value
return 'UNKNOWN'
def parse_coordinates(wkt: str) -> tuple:
"""
Parse WKT Point format to lat/lon.
Example: "Point(-58.3719 -34.6043)" -> (-34.6043, -58.3719)
Returns:
(latitude, longitude) tuple
"""
try:
# Remove "Point(" prefix and ")" suffix
coords = wkt.replace('Point(', '').replace(')', '')
lon, lat = map(float, coords.split())
return (lat, lon)
except:
return (None, None)
def main():
"""Main execution."""
print("=" * 80)
print("WIKIDATA ARGENTINE INSTITUTIONS EXTRACTION")
print("=" * 80)
print()
print("Status: This script is a template for future Wikidata extraction.")
print()
print("The Wikidata SPARQL query found ~200 Argentine heritage institutions,")
print("but NONE of them have ISIL codes recorded in Wikidata.")
print()
print("Findings:")
print(" - 200+ institutions in Wikidata (archives, libraries, museums)")
print(" - Many have websites, VIAF IDs, coordinates")
print(" - ISIL codes NOT systematically recorded")
print()
print("Recommendation: Focus on IRAM email for ISIL registry instead of")
print(" extracting from Wikidata.")
print()
print("However, Wikidata can be used for:")
print(" 1. Enriching institutions with Wikidata Q-numbers")
print(" 2. Finding coordinates for geocoding")
print(" 3. Cross-referencing with CONABIP/AGN data")
print()
print("=" * 80)
if __name__ == "__main__":
main()