#!/usr/bin/env python3
"""
Extract Argentine Heritage Institutions from Wikidata

This script uses the Wikidata SPARQL endpoint to extract Argentine heritage
institutions (libraries, archives, museums) that are documented in Wikidata
but may not have ISIL codes.

Output: data/isil/AR/argentina_wikidata_institutions.json
"""
|
|
|
|
import json
from datetime import datetime, timezone
from typing import List, Dict, Optional


# Wikidata SPARQL query results (from manual query - 200 institutions)
# This data was extracted using the wikidata-authenticated_execute_sparql MCP tool
# Query: Argentine heritage institutions (archives, libraries, museums)
# Date: 2025-11-18
WIKIDATA_RESULTS = [
    # Results from SPARQL query would be pasted here
    # For now, we'll structure the extraction from the query results above
]
|
|
|
|
|
|
def extract_institution_from_wikidata(result: Dict) -> Dict:
    """
    Convert a Wikidata SPARQL result row to a heritage custodian record.

    Args:
        result: SPARQL query result dictionary in the Wikidata JSON binding
            format (e.g. result['item']['value'] is the entity URI).
            Optional keys seen here: itemLabel, typeLabel, cityLabel,
            coords (WKT point), viaf, website.

    Returns:
        Heritage custodian record dict with name, type, country,
        identifiers, locations, and provenance metadata.
    """
    # Extract Q-number from the Wikidata entity URI (last path segment).
    q_number = result['item']['value'].split('/')[-1]

    # Parse institution name; language defaults to Spanish for AR sources.
    name = result.get('itemLabel', {}).get('value', 'Unknown')
    lang = result.get('itemLabel', {}).get('xml:lang', 'es')

    # Map the Wikidata type label onto the project taxonomy.
    type_label = result.get('typeLabel', {}).get('value', '').lower()
    institution_type = map_type_to_taxonomy(type_label)

    # Parse location and coordinates (WKT "Point(lon lat)" literal).
    city = result.get('cityLabel', {}).get('value')
    coords_wkt = result.get('coords', {}).get('value')
    lat, lon = parse_coordinates(coords_wkt) if coords_wkt else (None, None)

    # Collect identifiers; the Wikidata Q-number is always present.
    identifiers = [{
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    }]

    # VIAF authority identifier, when recorded.
    if 'viaf' in result:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': result['viaf']['value'],
            'identifier_url': f"https://viaf.org/viaf/{result['viaf']['value']}"
        })

    # Official website, when recorded (the URL doubles as the identifier).
    if 'website' in result:
        identifiers.append({
            'identifier_scheme': 'Website',
            'identifier_value': result['website']['value'],
            'identifier_url': result['website']['value']
        })

    # Build the record skeleton; locations are appended below.
    record = {
        'name': name,
        'name_language': lang,
        'institution_type': institution_type,
        'country': 'AR',
        'locations': [],
        'identifiers': identifiers,
        'provenance': {
            'data_source': 'WIKIDATA',
            'data_tier': 'TIER_3_CROWD_SOURCED',
            'extraction_date': datetime.now(timezone.utc).isoformat(),
            'extraction_method': 'Wikidata SPARQL query for Argentine heritage institutions',
            'confidence_score': 0.85,
            'notes': f'Extracted from Wikidata entity {q_number}'
        }
    }

    # Add a location when either a city or coordinates are known.
    # FIX: compare against None explicitly — the original used truthiness,
    # which silently dropped a legitimate latitude/longitude of exactly 0.0.
    if city or lat is not None:
        location = {
            'city': city,
            'country': 'AR'
        }
        if lat is not None and lon is not None:
            location['latitude'] = lat
            location['longitude'] = lon
        record['locations'].append(location)

    return record
|
|
|
|
|
|
def map_type_to_taxonomy(type_label: str) -> str:
    """Map Wikidata type labels to GLAMORCUBESFIXPHDNT taxonomy."""
    # Ordered (substring, taxonomy-code) pairs; the first substring
    # found in the label wins, mirroring the original dict scan.
    taxonomy_pairs = (
        ('archivo', 'ARCHIVE'),
        ('biblioteca', 'LIBRARY'),
        ('museo', 'MUSEUM'),
        ('exposición', 'MUSEUM'),  # Exhibitions are museum-related
        ('galería', 'GALLERY'),
    )
    return next(
        (code for needle, code in taxonomy_pairs if needle in type_label),
        'UNKNOWN',
    )
|
|
|
|
|
|
def parse_coordinates(wkt: str) -> tuple:
    """
    Parse WKT Point format to lat/lon.

    Example: "Point(-58.3719 -34.6043)" -> (-34.6043, -58.3719)

    Args:
        wkt: WKT point literal of the form "Point(<lon> <lat>)".

    Returns:
        (latitude, longitude) tuple of floats, or (None, None) when the
        input cannot be parsed.
    """
    try:
        # Remove "Point(" prefix and ")" suffix, leaving "lon lat".
        coords = wkt.replace('Point(', '').replace(')', '')
        # WKT stores longitude first; swap to (lat, lon) on return.
        lon, lat = map(float, coords.split())
        return (lat, lon)
    except (AttributeError, TypeError, ValueError):
        # FIX: narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and masked genuine bugs.
        # AttributeError/TypeError cover non-string input; ValueError
        # covers malformed numbers or a wrong token count.
        return (None, None)
|
|
|
|
|
|
def main():
    """Main execution: print the status report for this extraction template."""
    divider = "=" * 80
    # Report lines in display order; empty strings render as blank lines.
    report = [
        divider,
        "WIKIDATA ARGENTINE INSTITUTIONS EXTRACTION",
        divider,
        "",
        "Status: This script is a template for future Wikidata extraction.",
        "",
        "The Wikidata SPARQL query found ~200 Argentine heritage institutions,",
        "but NONE of them have ISIL codes recorded in Wikidata.",
        "",
        "Findings:",
        " - 200+ institutions in Wikidata (archives, libraries, museums)",
        " - Many have websites, VIAF IDs, coordinates",
        " - ISIL codes NOT systematically recorded",
        "",
        "Recommendation: Focus on IRAM email for ISIL registry instead of",
        " extracting from Wikidata.",
        "",
        "However, Wikidata can be used for:",
        " 1. Enriching institutions with Wikidata Q-numbers",
        " 2. Finding coordinates for geocoding",
        " 3. Cross-referencing with CONABIP/AGN data",
        "",
        divider,
    ]
    for line in report:
        print(line)


if __name__ == "__main__":
    main()
|