- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
185 lines
6.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Mexican Wikidata Enrichment - Batch 2
|
|
Add Wikidata identifiers to 4 institutions with perfect name matches (100% score).
|
|
|
|
Institutions:
|
|
1. Archivo General de la Nación (Q2860534, VIAF: 159570855)
|
|
2. Museo Frida Kahlo (Q2663377, VIAF: 144233695)
|
|
3. Museo Soumaya (Q2097646, VIAF: 135048064)
|
|
4. Museo de Antropología de Xalapa (Q1841655, VIAF: 138582541)
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Define enrichment data - Only 100% perfect matches from Wikidata SPARQL query.
# All four institutions share the same confidence score and provenance note, so
# the full records are expanded from a compact name -> (Wikidata Q, VIAF) table
# instead of repeating the literals four times.
_BATCH_2_NOTE = "Perfect name match (100% score) via SPARQL query and VIAF cross-reference"
_BATCH_2_IDS = {
    "Archivo General de la Nación": ("Q2860534", "159570855"),
    "Museo Frida Kahlo": ("Q2663377", "144233695"),
    "Museo Soumaya": ("Q2097646", "135048064"),
    "Museo de Antropología de Xalapa": ("Q1841655", "138582541"),
}

# name -> {wikidata, viaf, confidence, notes}; consumed by the enrichment loop.
BATCH_2_ENRICHMENTS = {
    name: {
        "wikidata": q_number,
        "viaf": viaf_id,
        "confidence": 0.98,
        "notes": _BATCH_2_NOTE,
    }
    for name, (q_number, viaf_id) in _BATCH_2_IDS.items()
}
|
|
|
|
|
|
def add_enrichment_identifiers(institution: dict, enrichment: dict) -> dict:
    """Add Wikidata and VIAF identifiers to an institution record, in place.

    Args:
        institution: Institution record (mutated). The 'identifiers' list and
            'provenance.enrichment_history' list are created if missing/None.
        enrichment: Enrichment data with optional 'wikidata'/'viaf' IDs and
            required 'confidence' and 'notes' fields.

    Returns:
        The same (mutated) institution dict, for call-site convenience.
    """
    # Ensure identifiers list exists.
    if institution.get('identifiers') is None:
        institution['identifiers'] = []

    # Schemes already present — re-running the script never duplicates IDs.
    existing_schemes = {i.get('identifier_scheme') for i in institution['identifiers']}

    # Track exactly what we append so the provenance entry cannot drift from
    # the identifier list (previously the add-conditions were evaluated twice).
    added: list[str] = []

    def _append_identifier(scheme: str, value: str, url_template: str) -> None:
        """Append one identifier record and log it for provenance."""
        institution['identifiers'].append({
            'identifier_scheme': scheme,
            'identifier_value': value,
            'identifier_url': url_template.format(value),
        })
        added.append(f"{scheme}:{value}")

    # Add Wikidata / VIAF identifiers if not already present.
    if 'wikidata' in enrichment and 'Wikidata' not in existing_schemes:
        _append_identifier('Wikidata', enrichment['wikidata'],
                           'https://www.wikidata.org/wiki/{}')
    if 'viaf' in enrichment and 'VIAF' not in existing_schemes:
        _append_identifier('VIAF', enrichment['viaf'],
                           'https://viaf.org/viaf/{}')

    # Ensure provenance containers exist (even when nothing new was added,
    # matching the previous behavior).
    history = institution.setdefault('provenance', {}).setdefault('enrichment_history', [])

    # Only record an enrichment entry if we actually added new identifiers.
    if added:
        history.append({
            # Timezone-aware UTC timestamp in ISO-8601 form.
            'enrichment_date': datetime.now(timezone.utc).isoformat(),
            'enrichment_method': 'Wikidata SPARQL query + VIAF cross-reference (Batch 2)',
            'identifiers_added': added,
            'confidence_score': enrichment['confidence'],
            'notes': enrichment['notes'],
        })

    return institution
|
|
|
|
|
|
def main(yaml_path=None):
    """Main enrichment workflow: load YAML, enrich matches, write back.

    Args:
        yaml_path: Path (or str) to the institutions YAML file. Defaults to
            the project's Mexican institutions dataset; parameterized so the
            workflow can be reused or tested against another file.
    """
    if yaml_path is None:
        yaml_path = Path('/Users/kempersc/apps/glam/data/instances/mexico/mexican_institutions_geocoded.yaml')
    else:
        yaml_path = Path(yaml_path)

    print("Loading Mexican institutions YAML file...")
    with open(yaml_path, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # Guard: an empty file yields None from safe_load; avoid len()/division errors.
    if not institutions:
        print("⚠️ No institutions found in YAML file; nothing to do.")
        return

    print(f"Loaded {len(institutions)} institutions")

    # Track enrichment statistics.
    enriched_count = 0
    skipped_count = 0

    # Enrich each institution that matches a batch-2 entry.
    for institution in institutions:
        institution_name = institution.get('name', '')
        if institution_name not in BATCH_2_ENRICHMENTS:
            continue
        enrichment = BATCH_2_ENRICHMENTS[institution_name]

        # Skip records that already carry both identifier schemes.
        # ('or []' also tolerates an explicit identifiers: null in the YAML.)
        existing_identifiers = institution.get('identifiers') or []
        schemes = {i.get('identifier_scheme') for i in existing_identifiers}
        if {'Wikidata', 'VIAF'} <= schemes:
            print(f"\n⏭️ Skipping (already enriched): {institution_name}")
            skipped_count += 1
            continue

        print(f"\n✅ Enriching: {institution_name}")
        print(f" - Wikidata: {enrichment.get('wikidata', 'N/A')}")
        print(f" - VIAF: {enrichment.get('viaf', 'N/A')}")

        # Mutates the record in place; no rebinding needed.
        add_enrichment_identifiers(institution, enrichment)
        enriched_count += 1

    # Warn about batch entries absent from the dataset.
    names_in_dataset = {inst.get('name') for inst in institutions}
    not_found = [name for name in BATCH_2_ENRICHMENTS if name not in names_in_dataset]
    if not_found:
        print("\n⚠️ Warning: Could not find these institutions in YAML:")
        for name in not_found:
            print(f" - {name}")

    print(f"\n📝 Summary:")
    print(f" - Newly enriched: {enriched_count}")
    print(f" - Already enriched (skipped): {skipped_count}")
    print(f" - Not found in dataset: {len(not_found)}")

    # Save enriched YAML back to the same file.
    print(f"\nWriting to {yaml_path}...")
    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f,
                  default_flow_style=False,
                  allow_unicode=True,
                  sort_keys=False,
                  width=1000)

    print("✅ Batch 2 enrichment complete!")

    # Generate coverage statistics (institutions is non-empty at this point).
    total_wikidata = sum(
        1 for inst in institutions
        if any(i.get('identifier_scheme') == 'Wikidata'
               for i in inst.get('identifiers') or [])
    )
    coverage = (total_wikidata / len(institutions)) * 100

    print(f"\n📊 Current Wikidata Coverage:")
    print(f" {total_wikidata}/{len(institutions)} institutions ({coverage:.1f}%)")


if __name__ == '__main__':
    main()
|