- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
592 lines
23 KiB
Python
Executable file
592 lines
23 KiB
Python
Executable file
#!/usr/bin/env python3
|
|
"""
|
|
Wikidata enrichment for Latin American heritage institutions with alternative name matching.
|
|
|
|
This script applies the successful Tunisia enrichment strategy to Latin America:
|
|
- Entity type validation (museums must be museums, not banks)
|
|
- Geographic validation (institutions must be in correct city/country)
|
|
- Alternative name matching (Portuguese/Spanish ↔ English)
|
|
- Fuzzy matching with 70% threshold
|
|
- Prevents false positives through multiple validation layers
|
|
|
|
Target: 304 Latin American institutions (BR: 97, MX: 109, CL: 90, AR: 1, US: 7)
|
|
Current Wikidata coverage: 56/304 (18.4%)
|
|
Expected improvement: 60-75% coverage (similar to Tunisia's 76.5%)
|
|
|
|
GLAM Data Extraction Project
|
|
Schema: LinkML v0.2.1
|
|
"""
|
|
|
|
import yaml
|
|
import time
|
|
import requests
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, List, Set
|
|
from rapidfuzz import fuzz
|
|
|
|
# Public Wikidata Query Service endpoint (shared, rate-limited; queries below
# sleep between requests to stay polite).
SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"

# Descriptive User-Agent identifying this tool, per Wikimedia API etiquette.
USER_AGENT = "GLAM-LatAm-Wikidata-Enrichment/1.0"
|
|
|
|
# Institution type mapping: LinkML enum -> Wikidata entity types.
# Each value is the set of Wikidata QIDs accepted as a valid `instance of`
# (P31) for that institution category during entity-type validation.
# NOTE(review): several QIDs recur across categories with *different* inline
# labels (e.g. Q2668072 is annotated "National museum", "National library"
# and "National archive" below; Q7840289 as both "Art gallery" and
# "Cultural center"). A QID has exactly one meaning on Wikidata, so the
# conflicting annotations should be verified against the live entities.
INSTITUTION_TYPE_MAPPING = {
    'MUSEUM': {
        'Q33506',      # Museum
        'Q1030034',    # Archaeological museum
        'Q3329412',    # Archaeological museum (variant)
        'Q473972',     # Art museum
        'Q2668072',    # National museum
        'Q207694',     # History museum
        'Q7328910',    # Science museum
        'Q15243387',   # Cultural heritage site
        'Q3152824',    # Archaeological site
        'Q1153562',    # Open-air museum
        'Q1496967',    # Folk museum
        'Q17431399',   # Heritage museum
        'Q28835878',   # Heritage site
        'Q641635',     # Natural history museum
        'Q2142332',    # Contemporary art museum
    },
    'LIBRARY': {
        'Q7075',       # Library
        'Q2668072',    # National library (same QID listed under MUSEUM — verify)
        'Q570116',     # Public library
        'Q5193377',    # University library
        'Q28564',      # Academic library
        'Q1479716',    # Regional library
        'Q1622062',    # Digital library
        'Q17297735',   # Diocesan library
        'Q105338594',  # Bibliothèque diocésaine
    },
    'ARCHIVE': {
        'Q166118',     # Archive
        'Q7840289',    # Art gallery (can have archival collections)
        'Q2668072',    # National archive (same QID listed under MUSEUM — verify)
        'Q1497375',    # Historical archive
        'Q64578911',   # Regional archive
        'Q11396317',   # State archive
    },
    'HOLY_SITES': {
        'Q22687',      # Synagogue
        'Q16970',      # Church
        'Q32815',      # Mosque
        'Q44539',      # Temple
        'Q44613',      # Monastery
        'Q34627',      # Synagogue (second QID with the same annotation — verify)
        'Q697295',     # Cathedral
        'Q56242275',   # Pilgrimage site
    },
    'GALLERY': {
        'Q7840289',    # Art gallery
        'Q473972',     # Art museum
        'Q1007870',    # Art centre
    },
    'UNIVERSITY': {
        'Q3918',       # University
        'Q875538',     # Public university
        'Q2467461',    # Private university
        'Q15936437',   # Research university
        'Q38723',      # Higher education institution
        'Q3354859',    # Technical university
    },
    'RESEARCH_CENTER': {
        'Q31855',      # Research institute
        'Q7315155',    # Research center
        'Q2467461',    # Research institution (annotated "Private university" under UNIVERSITY)
        'Q483242',     # Laboratory
        'Q1664720',    # Institute
    },
    'EDUCATION_PROVIDER': {
        'Q2385804',    # Educational institution
        'Q5341295',    # Music school
        'Q1664720',    # Institute
        'Q180958',     # Faculty
        'Q38723',      # Higher education institution
    },
    'OFFICIAL_INSTITUTION': {
        'Q7210356',    # Cultural institution
        'Q7840289',    # Cultural center (annotated "Art gallery" elsewhere)
        'Q1030034',    # Cultural heritage institution (annotated "Archaeological museum" under MUSEUM)
        'Q1664720',    # Institute
        'Q7210356',    # Government cultural organization (duplicate QID — harmless in a set literal)
        'Q24398318',   # Theatre building
        'Q17431399',   # Cultural center (annotated "Heritage museum" under MUSEUM)
    },
    'PERSONAL_COLLECTION': {
        'Q7075',       # Library (personal libraries)
        'Q166118',     # Archive (personal archives)
        'Q33506',      # Museum (personal museums)
    },
    'MIXED': {
        'Q33506',      # Museum
        'Q7075',       # Library
        'Q166118',     # Archive
        'Q7210356',    # Cultural institution
        'Q7840289',    # Cultural center
        'Q1030034',    # Cultural complex
    }
}
|
|
|
|
# Country name mapping for Wikidata queries.
#   qid:  Wikidata entity of the country, used in the P17 (country) filter
#   name: human-readable label interpolated into the SPARQL comment
#   lang: preferred label languages passed to the wikibase:label service
#         and to the altLabel language filter
COUNTRY_MAPPING = {
    'BR': {'qid': 'Q155', 'name': 'Brazil', 'lang': 'pt,en,es'},
    'MX': {'qid': 'Q96', 'name': 'Mexico', 'lang': 'es,en'},
    'CL': {'qid': 'Q298', 'name': 'Chile', 'lang': 'es,en'},
    'AR': {'qid': 'Q414', 'name': 'Argentina', 'lang': 'es,en'},
    'US': {'qid': 'Q30', 'name': 'United States', 'lang': 'en,es,pt'},
}
|
|
|
|
def get_valid_types_for_institution(inst_type: str) -> Set[str]:
    """Return the Wikidata QIDs accepted for *inst_type* (empty set if unknown)."""
    try:
        return INSTITUTION_TYPE_MAPPING[inst_type]
    except KeyError:
        return set()
|
|
|
|
# Per-country translation pairs used to derive English name variants.
# Brazil uses Portuguese terms; Mexico, Chile and Argentina share the
# Spanish list. Pairs are (Title-case source, Title-case target); the
# lowercase spellings are derived at replacement time.
_PT_PAIRS = [
    ('Biblioteca', 'Library'),
    ('Museu', 'Museum'),
    ('Arquivo', 'Archive'),
    ('Teatro', 'Theatre'),
    ('Centro Cultural', 'Cultural Center'),
]
_ES_PAIRS = [
    ('Biblioteca', 'Library'),
    ('Museo', 'Museum'),
    ('Archivo', 'Archive'),
    ('Teatro', 'Theatre'),
    ('Centro Cultural', 'Cultural Center'),
]
_NAME_TRANSLATIONS: Dict[str, list] = {
    'BR': _PT_PAIRS,
    'MX': _ES_PAIRS,
    'CL': _ES_PAIRS,
    'AR': _ES_PAIRS,
}


def generate_alternative_names(name: str, inst_type: str, country: str) -> List[str]:
    """
    Generate alternative name variations for multilingual matching.

    Handles Portuguese (Brazil) and Spanish (Mexico, Chile, Argentina)
    translations of common institution terms:
    - Biblioteca → Library
    - Museu/Museo → Museum
    - Arquivo/Archivo → Archive
    - Teatro → Theatre
    - Centro Cultural → Cultural Center

    Args:
        name: Primary institution name.
        inst_type: Institution type (currently unused; kept for interface
            stability with existing callers).
        country: ISO country code (BR, MX, CL, AR, ...).

    Returns:
        List of translated name variants; empty for countries without a
        translation table (e.g. US).
    """
    alternatives: List[str] = []
    name_lower = name.lower()

    for source, target in _NAME_TRANSLATIONS.get(country, []):
        if source.lower() in name_lower:
            # Replace both the Title-case and all-lowercase spellings,
            # mirroring the original per-term behavior exactly.
            alternatives.append(
                name.replace(source, target).replace(source.lower(), target.lower())
            )

    return alternatives
|
|
|
|
def search_wikidata_with_validation(
    name: str,
    inst_type: str,
    country: str,
    city: Optional[str] = None,
    alternative_names: Optional[List[str]] = None,
    timeout: int = 60
) -> Optional[Dict[str, Any]]:
    """
    Search Wikidata for Latin American heritage institutions with validation.

    Runs one country-scoped SPARQL query filtered server-side to the entity
    types valid for ``inst_type``, then fuzzy-matches every candidate label
    against the primary name plus supplied/generated alternative names.
    A candidate must pass entity-type validation (and, for location-specific
    types, geographic validation) before it is scored.

    Args:
        name: Institution name to search
        inst_type: Institution type (MUSEUM, LIBRARY, ARCHIVE, etc.)
        country: ISO country code (BR, MX, CL, AR, US)
        city: Optional city name for additional filtering
        alternative_names: List of alternative names to try
        timeout: Query timeout in seconds

    Returns:
        Dict with Wikidata data if valid match found, None otherwise
    """

    # Get valid Wikidata entity types
    valid_types = get_valid_types_for_institution(inst_type)

    if not valid_types:
        print(f" ⚠️ Unknown institution type: {inst_type}")
        return None

    # Get country info
    country_info = COUNTRY_MAPPING.get(country)
    if not country_info:
        print(f" ⚠️ Unknown country: {country}")
        return None

    # Build VALUES clause for entity types
    type_values = " ".join([f"wd:{qid}" for qid in valid_types])

    # Build SPARQL query - country-specific.
    # NOTE(review): ?itemAltLabel is selected (and multiplies result rows via
    # the OPTIONAL) but is never consulted during matching below — only
    # ?itemLabel is scored. Confirm whether altLabel matching was intended.
    query = f"""
    SELECT DISTINCT ?item ?itemLabel ?itemDescription ?type ?typeLabel
           ?viaf ?isil ?website ?coords ?inception ?itemAltLabel
           ?location ?locationLabel
    WHERE {{
      # Must be in {country_info['name']}
      ?item wdt:P17 wd:{country_info['qid']} .

      # Must have instance-of type matching institution type
      ?item wdt:P31 ?type .

      # Filter to relevant types (server-side)
      VALUES ?type {{ {type_values} }}

      # Location (P131: located in administrative territorial entity)
      OPTIONAL {{ ?item wdt:P131 ?location . }}

      OPTIONAL {{ ?item wdt:P214 ?viaf . }}
      OPTIONAL {{ ?item wdt:P791 ?isil . }}
      OPTIONAL {{ ?item wdt:P856 ?website . }}
      OPTIONAL {{ ?item wdt:P625 ?coords . }}
      OPTIONAL {{ ?item wdt:P571 ?inception . }}
      OPTIONAL {{ ?item skos:altLabel ?itemAltLabel . FILTER(LANG(?itemAltLabel) IN ({', '.join(f'"{lang}"' for lang in country_info['lang'].split(','))})) }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{country_info['lang']}" . }}
    }}
    LIMIT 200
    """

    headers = {'User-Agent': USER_AGENT}
    params = {
        'query': query,
        'format': 'json'
    }

    try:
        time.sleep(1.5)  # Rate limiting for the shared public endpoint
        response = requests.get(SPARQL_ENDPOINT, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()

        results = response.json()
        bindings = results.get("results", {}).get("bindings", [])

        if not bindings:
            return None

        # Fuzzy match with entity type AND geographic validation
        best_match = None
        best_score = 0
        matched_name = name

        # Prepare all names to try (primary + existing alternatives + generated)
        names_to_try = [name]
        if alternative_names:
            names_to_try.extend(alternative_names)

        # Generate multilingual alternatives
        generated_alternatives = generate_alternative_names(name, inst_type, country)
        names_to_try.extend(generated_alternatives)

        city_lower = city.lower() if city else None

        # Location-specific institutions require geographic matching
        requires_city_match = inst_type in {'UNIVERSITY', 'RESEARCH_CENTER', 'EDUCATION_PROVIDER'}

        # Try each name variation against every candidate row; the best score
        # over all (variant, candidate) pairs wins.
        for name_variant in names_to_try:
            name_lower = name_variant.lower()

            for binding in bindings:
                # VALIDATE ENTITY TYPE FIRST — a row whose ?type QID is not in
                # the allowed set is skipped before any scoring.
                entity_type_uri = binding.get("type", {}).get("value", "")
                entity_type_qid = entity_type_uri.split("/")[-1] if entity_type_uri else None

                if entity_type_qid not in valid_types:
                    continue

                # GEOGRAPHIC VALIDATION for location-specific institutions:
                # require a fuzzy city/location-label match of at least 70.
                if city_lower and requires_city_match:
                    location_label = binding.get("locationLabel", {}).get("value", "").lower() if binding.get("locationLabel") else ""

                    if not location_label:
                        continue

                    location_match = fuzz.ratio(city_lower, location_label)
                    if location_match < 70:
                        continue

                # Fuzzy matching on validated entities
                item_label = binding.get("itemLabel", {}).get("value", "").lower()

                # Multiple fuzzy match strategies; take the most optimistic.
                label_score = fuzz.ratio(name_lower, item_label)
                partial_score = fuzz.partial_ratio(name_lower, item_label)
                token_score = fuzz.token_set_ratio(name_lower, item_label)

                score = max(label_score, partial_score, token_score)

                if score > best_score:
                    best_score = score
                    best_match = binding
                    matched_name = name_variant

        # Require minimum 70% match
        if best_score < 70:
            return None

        if not best_match:
            return None

        # Extract the QID from the entity URI (…/entity/Q12345)
        item_uri = best_match.get("item", {}).get("value", "")
        qid = item_uri.split("/")[-1] if item_uri else None

        if not qid or not qid.startswith("Q"):
            return None

        result = {
            "qid": qid,
            "name": best_match.get("itemLabel", {}).get("value", ""),
            "description": best_match.get("itemDescription", {}).get("value", ""),
            "entity_type": best_match.get("typeLabel", {}).get("value", ""),
            "match_score": best_score,
            "matched_name": matched_name
        }

        # Add optional fields (only present when the OPTIONAL clauses bound)
        viaf_data = best_match.get("viaf")
        if viaf_data and isinstance(viaf_data, dict):
            result["viaf"] = viaf_data.get("value", "")

        isil_data = best_match.get("isil")
        if isil_data and isinstance(isil_data, dict):
            result["isil"] = isil_data.get("value", "")

        website_data = best_match.get("website")
        if website_data and isinstance(website_data, dict):
            result["website"] = website_data.get("value", "")

        inception_data = best_match.get("inception")
        if inception_data and isinstance(inception_data, dict):
            # Keep only the date part of the xsd:dateTime literal
            result["founded_date"] = inception_data.get("value", "").split("T")[0]

        coords_data = best_match.get("coords")
        if coords_data and isinstance(coords_data, dict):
            coords_str = coords_data.get("value", "")
            # WKT literal "Point(lon lat)" — note longitude comes first
            if coords_str and coords_str.startswith("Point("):
                lon, lat = coords_str[6:-1].split()
                result["latitude"] = float(lat)
                result["longitude"] = float(lon)

        return result

    except requests.exceptions.Timeout:
        print(f" ⏱️ Query timeout (>{timeout}s)")
        return None
    except requests.exceptions.RequestException as e:
        print(f" ❌ Network error: {e}")
        return None
    except Exception as e:
        # Deliberately broad: enrichment is best-effort, a bad row/response
        # should not abort the whole batch run.
        print(f" ❌ Error: {e}")
        return None
|
|
|
|
def add_wikidata_to_institution(institution: dict, wikidata_result: dict):
    """Mutate *institution* in place: append Wikidata/VIAF/ISIL identifiers
    (skipping any scheme already present) and record an enrichment note in
    the provenance block."""

    identifiers = institution.setdefault('identifiers', [])

    # Schemes present before this enrichment pass
    existing_schemes = {entry.get('identifier_scheme') for entry in identifiers}

    qid = wikidata_result['qid']
    if 'Wikidata' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'Wikidata',
            'identifier_value': qid,
            'identifier_url': f"https://www.wikidata.org/wiki/{qid}"
        })

    viaf = wikidata_result.get('viaf')
    if viaf and 'VIAF' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'VIAF',
            'identifier_value': viaf,
            'identifier_url': f"https://viaf.org/viaf/{viaf}"
        })

    isil = wikidata_result.get('isil')
    if isil and 'ISIL' not in existing_schemes:
        identifiers.append({
            'identifier_scheme': 'ISIL',
            'identifier_value': isil,
            'identifier_url': f"https://isil.org/{isil}"
        })

    # Record how/when this record was enriched
    provenance = institution.setdefault('provenance', {})

    entity_type = wikidata_result.get('entity_type', 'unknown')
    matched_name = wikidata_result.get('matched_name', institution.get('name'))
    # Only mention the matched variant when it differs from the primary name
    match_note = "" if matched_name == institution.get('name') else f" [matched: {matched_name}]"

    enrich_note = f" Wikidata enriched {datetime.now(timezone.utc).strftime('%Y-%m-%d')} ({wikidata_result['qid']} [{entity_type}], match: {wikidata_result.get('match_score', 0):.0f}%{match_note}, validated)."

    # Append to any pre-existing notes rather than overwriting them
    previous_notes = provenance.get('notes', '')
    provenance['notes'] = (previous_notes + enrich_note).strip()
|
|
|
|
def save_checkpoint(institutions: list, output_file: Path, stats: dict):
    """Write the current enrichment state (metadata header + all
    institution records) to *output_file* as YAML."""
    total_coverage = stats['already_enriched'] + stats['enriched']
    print(f"\n💾 Saving checkpoint... (enriched: {stats['enriched']}, total coverage: {total_coverage}/{stats['total']})")

    # Assemble the metadata header first, then attach the record payload
    metadata = {
        'generated': datetime.now(timezone.utc).isoformat(),
        'source': 'Latin American GLAM institutions',
        'enhancements': [
            'Wikidata enrichment (alternative names + validation)',
            'Entity type validation',
            'Geographic validation',
            'Multilingual matching (Portuguese/Spanish/English)'
        ],
        'statistics': {
            'total_institutions': stats['total'],
            'wikidata_coverage': total_coverage,
            'newly_enriched': stats['enriched']
        }
    }
    payload = {'_metadata': metadata, 'institutions': institutions}

    with open(output_file, 'w', encoding='utf-8') as f:
        yaml.dump(payload, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
|
|
|
|
def main():
    """Enrich the Latin American institutions file with validated Wikidata IDs.

    Reads the authoritative YAML file, searches Wikidata for every
    institution that does not already carry a Wikidata identifier, saves a
    checkpoint every 10 records (plus a final save), and prints summary
    statistics per country.
    """
    input_file = Path('data/instances/latin_american_institutions_AUTHORITATIVE.yaml')
    output_file = Path('data/instances/latin_american_institutions_AUTHORITATIVE.yaml')  # Overwrite

    print("Latin American Wikidata Enrichment (Alternative Names + Validation)")
    print("=" * 80)
    print("Target: 304 institutions (BR: 97, MX: 109, CL: 90, AR: 1, US: 7)")
    print("Current coverage: 56/304 (18.4%)")
    print()
    print("Features:")
    print(" ✅ Entity type validation (museums must be museums)")
    print(" ✅ Geographic validation (institutions in correct cities)")
    print(" ✅ Alternative name matching (Portuguese/Spanish ↔ English)")
    print(" ✅ Fuzzy matching (70% threshold)")
    print(" ✅ Checkpoint saving every 10 institutions")
    print("=" * 80)

    # Load data
    print(f"\nReading: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)

    # Handle both list and dict formats
    institutions = data if isinstance(data, list) else data.get('institutions', [])
    print(f"Total institutions: {len(institutions)}")

    # Statistics
    stats = {
        'total': len(institutions),
        'already_enriched': 0,
        'searched': 0,
        'found': 0,
        'enriched': 0,
        'failed': 0,
        'by_country': {
            'BR': {'searched': 0, 'found': 0, 'enriched': 0},
            'MX': {'searched': 0, 'found': 0, 'enriched': 0},
            'CL': {'searched': 0, 'found': 0, 'enriched': 0},
            'AR': {'searched': 0, 'found': 0, 'enriched': 0},
            'US': {'searched': 0, 'found': 0, 'enriched': 0},
        }
    }

    # Process each institution
    checkpoint_interval = 10

    for i, inst in enumerate(institutions, 1):
        name = inst.get('name', '')
        inst_type = inst.get('institution_type', 'MIXED')
        locations = inst.get('locations', [])
        country = locations[0].get('country', '') if locations else ''
        city = locations[0].get('city', '') if locations else ''

        # Check if already has Wikidata
        # (loop variable renamed from `id`, which shadowed the builtin)
        identifiers = inst.get('identifiers', [])
        existing_schemes = {ident.get('identifier_scheme') for ident in identifiers}

        if 'Wikidata' in existing_schemes:
            stats['already_enriched'] += 1
            qid = next((ident['identifier_value'] for ident in identifiers if ident.get('identifier_scheme') == 'Wikidata'), 'unknown')
            print(f"[{i}/{len(institutions)}] ✓ {name} (already has {qid})")
            continue

        # Search Wikidata with validation
        print(f"[{i}/{len(institutions)}] Searching: {name} [{inst_type}] ({city}, {country})")
        stats['searched'] += 1
        if country in stats['by_country']:
            stats['by_country'][country]['searched'] += 1

        # Get existing alternative names
        alt_names = inst.get('alternative_names', [])

        result = search_wikidata_with_validation(name, inst_type, country, city, alternative_names=alt_names, timeout=60)

        if result:
            stats['found'] += 1
            if country in stats['by_country']:
                stats['by_country'][country]['found'] += 1

            match_score = result.get('match_score', 0)
            entity_type = result.get('entity_type', 'unknown')
            matched_name = result.get('matched_name', name)

            name_note = f" [matched: {matched_name}]" if matched_name != name else ""
            print(f" ✅ Found: {result['qid']} [{entity_type}] - {result.get('name', '')} (match: {match_score:.0f}%{name_note})")

            add_wikidata_to_institution(inst, result)
            stats['enriched'] += 1
            if country in stats['by_country']:
                stats['by_country'][country]['enriched'] += 1
            print(" ✅ Enriched with validated match")
        else:
            stats['failed'] += 1
            print(" ❌ Not found or type mismatch")

        # Checkpoint every N institutions. The final save below always runs,
        # so the last (possibly partial) batch needs no special case here —
        # the original `or i == len(institutions)` caused a duplicate write.
        if i % checkpoint_interval == 0:
            save_checkpoint(institutions, output_file, stats)

    # Final save (also covers an empty institutions list)
    save_checkpoint(institutions, output_file, stats)

    # Print statistics
    print("\n" + "=" * 80)
    print("LATIN AMERICA WIKIDATA ENRICHMENT STATISTICS")
    print("=" * 80)
    print(f"Total institutions: {stats['total']}")
    print(f"Already enriched: {stats['already_enriched']}")
    print(f"Searched: {stats['searched']}")
    print(f"Found (validated): {stats['found']}")
    print(f"Enriched (new): {stats['enriched']}")
    print(f"Failed: {stats['failed']}")

    # Guard the percentage against an empty input file (total == 0)
    total = stats['total']
    coverage = stats['already_enriched'] + stats['enriched']
    pct = 100 * coverage / total if total else 0.0
    print(f"\nFinal Wikidata coverage: {coverage}/{total} ({pct:.1f}%)")

    print("\nBy country:")
    for country, country_stats in sorted(stats['by_country'].items()):
        if country_stats['searched'] > 0:
            print(f" {country}: searched {country_stats['searched']}, found {country_stats['found']}, enriched {country_stats['enriched']}")

    if stats['enriched'] > 0:
        # stats['enriched'] > 0 implies total > 0, so division is safe here
        improvement = stats['enriched']
        old_coverage = stats['already_enriched']
        new_coverage = stats['already_enriched'] + stats['enriched']
        print(f"\n✨ Added {improvement} new validated Wikidata identifiers!")
        print(f"✨ Coverage improved from {old_coverage}/{stats['total']} ({100*old_coverage/stats['total']:.1f}%) to {new_coverage}/{stats['total']} ({100*new_coverage/stats['total']:.1f}%)")
        print("✅ All matches validated against correct entity types and countries")

    print("\n✅ Wikidata enrichment complete!")


if __name__ == '__main__':
    main()
|