- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
266 lines
9.8 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Query Wikidata for Chilean Archives using SPARQL
|
|
Uses Wikidata Query Service to find archives in Chile with their Q-numbers
|
|
"""
|
|
|
|
import yaml
|
|
from SPARQLWrapper import SPARQLWrapper, JSON
|
|
from typing import List, Dict
|
|
from pathlib import Path
|
|
|
|
def query_chilean_archives() -> List[Dict]:
    """Query Wikidata for all archives in Chile.

    Returns:
        One dict per archive with keys: ``q_number``, ``name``, ``city``,
        ``founded`` (4-character year, or "" when unknown) and
        ``wikidata_url``. Returns an empty list if the query fails.
    """
    # Wikimedia's User-Agent policy asks API clients to identify themselves;
    # anonymous default agents risk throttling or HTTP 403 from the query service.
    sparql = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent="ChileanArchivesEnrichment/1.0 (batch institution enrichment script)",
    )

    # SPARQL query for archives in Chile
    # P31 = instance of, P17 = country, Q298 = Chile
    # Q166118 = archive institution
    query = """
    SELECT DISTINCT ?archive ?archiveLabel ?cityLabel ?coords ?founded WHERE {
      # Archive types (including subclasses)
      ?archive wdt:P31/wdt:P279* wd:Q166118 .

      # Located in Chile
      ?archive wdt:P17 wd:Q298 .

      # Get city/location
      OPTIONAL { ?archive wdt:P131 ?city . }

      # Get coordinates
      OPTIONAL { ?archive wdt:P625 ?coords . }

      # Get founding date
      OPTIONAL { ?archive wdt:P571 ?founded . }

      # Get labels in Spanish and English
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "es,en" .
      }
    }
    ORDER BY ?archiveLabel
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    print("🔍 Querying Wikidata for Chilean archives...")
    print(" Endpoint: https://query.wikidata.org/sparql")
    print()

    try:
        results = sparql.query().convert()  # type: ignore

        archives = []
        for result in results["results"]["bindings"]:  # type: ignore
            # Entity URI looks like http://www.wikidata.org/entity/Q123 —
            # the Q-number is the last path segment.
            archive_uri = result["archive"]["value"]  # type: ignore
            q_number = archive_uri.split("/")[-1]

            archive = {
                "q_number": q_number,
                "name": result.get("archiveLabel", {}).get("value", ""),  # type: ignore
                "city": result.get("cityLabel", {}).get("value", ""),  # type: ignore
                # OPTIONAL binding: keep only the year (first 4 chars of the
                # ISO timestamp). Simplified from the redundant
                # ``result.get("founded", {}).get(...) if "founded" in result``.
                "founded": result["founded"]["value"][:4] if "founded" in result else "",  # type: ignore
                "wikidata_url": f"https://www.wikidata.org/wiki/{q_number}"
            }
            archives.append(archive)

        return archives

    except Exception as e:
        # Best-effort: network/endpoint failures are reported, not raised,
        # so the caller can decide what to do with an empty result set.
        print(f"❌ Error querying Wikidata: {e}")
        return []
|
|
|
|
def load_chilean_institutions(file_path: Path) -> List[Dict]:
    """Read and parse the institution records stored as YAML at *file_path*."""
    with file_path.open(encoding='utf-8') as fh:
        return yaml.safe_load(fh)
|
|
|
|
def normalize_name(name: str) -> str:
    """Normalize an institution name for fuzzy matching.

    Lowercases the name, drops apostrophes, and collapses every run of
    whitespace (including leading/trailing) to a single space.

    BUG FIX: the original ended with ``.replace(" ", " ")`` — a no-op that
    replaced a space with a space. The evident intent was to collapse doubled
    spaces; ``" ".join(....split())`` does that robustly for any run length.
    """
    return " ".join(name.lower().replace("'", "").split())
|
|
|
|
def find_matches(institutions: List[Dict], wikidata_archives: List[Dict]) -> List[Dict]:
    """Find matches between our institutions and Wikidata archives.

    Args:
        institutions: Local institution records (YAML-derived dicts).
        wikidata_archives: Rows produced by query_chilean_archives().

    Returns:
        A list of match dicts with keys ``institution``, ``wikidata``,
        ``name_confidence`` ('exact' | 'partial') and ``city_match`` (bool).
        At most one match is kept per institution (first hit wins).
    """

    matches = []

    # Only consider ARCHIVE records that do not yet carry a Wikidata identifier.
    archives_without_wd = [
        inst for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE'
        and not any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    ]

    print(f"📊 Matching {len(archives_without_wd)} institutions against {len(wikidata_archives)} Wikidata entries...")
    print()

    for inst in archives_without_wd:
        inst_name = normalize_name(inst['name'])
        # BUG FIX: an explicit empty ``locations: []`` previously raised
        # IndexError — ``.get('locations', [{}])`` only substitutes the
        # default when the key is *missing*, not when the list is empty.
        inst_city = (inst.get('locations') or [{}])[0].get('city', '').lower()

        for wd_archive in wikidata_archives:
            wd_name = normalize_name(wd_archive['name'])
            wd_city = wd_archive['city'].lower()

            # Name match strategies
            name_match = False

            # Strategy 1: Exact match
            if inst_name == wd_name:
                name_match = True

            # Strategy 2: Partial match (institution name contains Wikidata name or vice versa)
            elif inst_name in wd_name or wd_name in inst_name:
                name_match = True

            # Strategy 3: Key words match (archivo/archive + significant word)
            elif ('archivo' in inst_name or 'archive' in inst_name) and ('archivo' in wd_name or 'archive' in wd_name):
                inst_words = set(inst_name.split())
                wd_words = set(wd_name.split())
                common_words = inst_words & wd_words
                # Must share at least 1 significant word beyond stopwords and
                # the generic "archivo"/"nacional" vocabulary.
                # (The original comment said "at least 2" but the code checks >= 1.)
                significant_common = common_words - {'de', 'del', 'la', 'el', 'archivo', 'archives', 'historico', 'histórico', 'national', 'nacional', 's'}
                if len(significant_common) >= 1:  # At least 1 significant word for archives
                    name_match = True

            # Strategy 4: "Archivo Nacional" special case (high-value institution)
            # NOTE: strategies 4-6 are deliberate plain `if`s — they can grant
            # a match even when strategies 1-3 did not.
            if 'nacional' in inst_name and ('archivo' in inst_name or 'archive' in inst_name):
                if 'nacional' in wd_name or 'national' in wd_name:
                    name_match = True

            # Strategy 5: University archives (USACH, Universidad de Chile)
            if 'universidad' in inst_name or 'university' in inst_name:
                # Extract university name
                if 'usach' in inst_name and 'usach' in wd_name:
                    name_match = True
                elif 'chile' in inst_name and 'chile' in wd_name:
                    name_match = True

            # Strategy 6: Diocese/Church archives (Arzobispado, Diócesis)
            if 'diocesis' in inst_name or 'arzobispado' in inst_name:
                if 'diocese' in wd_name or 'diocesis' in wd_name or 'arzobispado' in wd_name:
                    name_match = True

            # City match (flexible - allows partial matches)
            city_match = False
            if inst_city and wd_city:
                if inst_city in wd_city or wd_city in inst_city:
                    city_match = True

            # Accept match if name matches and either city matches or no city info
            # Allow special exceptions for national/well-known institutions
            allow_match = (
                city_match or
                not wd_city or
                'nacional' in inst_name or
                'national' in wd_name
            )

            if name_match and allow_match:
                match = {
                    'institution': inst,
                    'wikidata': wd_archive,
                    'name_confidence': 'exact' if inst_name == wd_name else 'partial',
                    'city_match': city_match
                }
                matches.append(match)
                break  # Only take first match per institution

    return matches
|
|
|
|
def main():
    """Query Wikidata for Chilean archives, match them against the local
    batch-8 institution records, and export candidate matches to JSON.

    Side effects: prints a progress report to stdout and writes
    data/instances/chile/wikidata_matches_batch9_archives.json.
    """
    import json  # stdlib; hoisted from mid-function to the top of main

    print("=" * 80)
    print("WIKIDATA SPARQL QUERY - CHILEAN ARCHIVES")
    print("=" * 80)
    print()

    # Query Wikidata
    wikidata_archives = query_chilean_archives()

    if not wikidata_archives:
        print("❌ No results from Wikidata")
        return

    print(f"✅ Found {len(wikidata_archives)} archives in Wikidata")
    print()

    # Show sample
    print("Sample results (first 10):")
    for i, archive in enumerate(wikidata_archives[:10], 1):
        print(f" {i}. {archive['name']} ({archive['city']}) → {archive['q_number']}")
    print()

    # Load our institutions
    input_file = Path('data/instances/chile/chilean_institutions_batch8_enriched.yaml')
    institutions = load_chilean_institutions(input_file)

    print(f"📖 Loaded {len(institutions)} Chilean institutions")
    archives_count = sum(1 for i in institutions if i.get('institution_type') == 'ARCHIVE')
    print(f" {archives_count} are archives")

    # Count archives that already carry a Wikidata identifier.
    with_wikidata = sum(
        1 for inst in institutions
        if inst.get('institution_type') == 'ARCHIVE'
        and any(
            id_obj.get('identifier_scheme') == 'Wikidata'
            for id_obj in inst.get('identifiers', [])
        )
    )
    print(f" {with_wikidata} already have Wikidata")
    print(f" {archives_count - with_wikidata} need enrichment")
    print()

    # Find matches
    matches = find_matches(institutions, wikidata_archives)

    print("=" * 80)
    print(f"MATCHING RESULTS: {len(matches)} potential matches found")
    print("=" * 80)
    print()

    # Display matches
    for i, match in enumerate(matches, 1):
        inst = match['institution']
        wd = match['wikidata']

        print(f"{i}. {inst['name']}")
        # BUG FIX: guard against an explicit empty ``locations: []`` —
        # ``.get('locations', [{}])[0]`` raised IndexError in that case.
        print(f" Our city: {(inst.get('locations') or [{}])[0].get('city', 'Unknown')}")
        print(f" ↓ MATCH ({match['name_confidence']} name, city: {match['city_match']})")
        print(f" Wikidata: {wd['name']} ({wd['city']})")
        print(f" Q-number: {wd['q_number']}")
        if wd['founded']:
            print(f" Founded: {wd['founded']}")
        print()

    # Export matches to JSON for batch processing
    output_file = Path('data/instances/chile/wikidata_matches_batch9_archives.json')

    match_data = [
        {
            'institution_name': match['institution']['name'],
            # Same empty-`locations` guard as above.
            'institution_city': (match['institution'].get('locations') or [{}])[0].get('city', ''),
            'q_number': match['wikidata']['q_number'],
            'wikidata_name': match['wikidata']['name'],
            'wikidata_city': match['wikidata']['city'],
            'founded': match['wikidata']['founded'],
            'confidence': match['name_confidence'],
            'city_match': match['city_match']
        }
        for match in matches
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(match_data, f, indent=2, ensure_ascii=False)

    print(f"💾 Saved {len(matches)} matches to: {output_file}")
    print()
    print("🎯 Next step: Review matches and create Batch 9 enrichment script")


if __name__ == '__main__':
    main()
|