glam/scripts/enrich_known_institutions_manual.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

148 lines
5.1 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Manually enrich well-known institutions with verified Wikidata QIDs.
This script handles high-profile institutions that should be in Wikidata
but may not match via fuzzy name matching due to naming variations.
"""
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

# Verified Wikidata QIDs for well-known institutions
KNOWN_INSTITUTIONS = {
    # US Institutions
    "Internet Archive": "Q461",
    "HathiTrust": "Q5683317",
    "HathiTrust Digital Library": "Q5683317",
    "Library of Congress": "Q131454",
    "OCLC": "Q190927",
    "WorldCat": "Q76630151",
    # Belgian/EU Institutions
    "European Parliament": "Q8889",
    "European Commission": "Q8880",
    "Council of the European Union": "Q8896",
    # Brazilian Institutions
    "Museu Nacional": "Q1850416",  # National Museum Rio de Janeiro
    "MASP": "Q82941",  # São Paulo Museum of Art
    "Pinacoteca": "Q2095209",  # Pinacoteca do Estado de São Paulo
    "Pinacoteca do Estado": "Q2095209",
    "Pinacoteca de São Paulo": "Q2095209",
    # Add more as needed
}
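
# Illustrative record shape (an assumption inferred from the field accesses
# in the functions below, not a schema definition):
#
#   name: Internet Archive
#   identifiers:
#     - identifier_scheme: Wikidata
#       identifier_value: Q461
#       identifier_url: https://www.wikidata.org/wiki/Q461
#   provenance:
#     enrichment_history: []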


def has_wikidata_id(institution: dict) -> bool:
    """Check if institution already has a real Wikidata ID."""
    return any(
        id_obj.get("identifier_scheme") == "Wikidata"
        and id_obj.get("identifier_value", "").startswith("Q")
        # Guard against non-numeric values; QIDs at or above 100,000,000
        # (e.g. the Q999999999 placeholder) are treated as synthetic.
        and id_obj.get("identifier_value", "Q")[1:].isdigit()
        and int(id_obj.get("identifier_value", "Q999999999")[1:]) < 100000000
        for id_obj in institution.get("identifiers", [])
    )
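
# Example (illustrative): a record whose only Wikidata identifier is the
# synthetic placeholder Q999999999 is NOT treated as already enriched:
#   has_wikidata_id({"identifiers": [{"identifier_scheme": "Wikidata",
#                                     "identifier_value": "Q999999999"}]})
#   # -> False (999999999 >= 100000000)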


def add_wikidata_id(institution: dict, qid: str) -> bool:
    """Add Wikidata identifier to institution."""
    if "identifiers" not in institution:
        institution["identifiers"] = []

    # Check if already exists
    for id_obj in institution["identifiers"]:
        if id_obj.get("identifier_scheme") == "Wikidata":
            return False

    # Add new Wikidata ID
    institution["identifiers"].append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    })

    # Update provenance
    if "provenance" not in institution:
        institution["provenance"] = {}
    provenance = institution["provenance"]
    if "enrichment_history" not in provenance:
        provenance["enrichment_history"] = []
    provenance["enrichment_history"].append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": "Manual Wikidata QID assignment (verified)",
        "data_source": "Wikidata",
        "confidence_score": 1.0
    })
    return True
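
# Example (illustrative): on a record with no Wikidata identifier,
# add_wikidata_id(inst, "Q461") appends
#   {"identifier_scheme": "Wikidata", "identifier_value": "Q461",
#    "identifier_url": "https://www.wikidata.org/wiki/Q461"}
# plus a confidence-1.0 provenance entry, and returns True; it returns
# False (no change) when a Wikidata identifier is already present.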


def main():
    input_file = Path("data/instances/global/global_heritage_institutions_wikidata_enriched.yaml")
    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🔧 MANUAL WIKIDATA ENRICHMENT FOR KNOWN INSTITUTIONS")
    print("=" * 80)
    print(f"\n📖 Loading dataset from: {input_file.name}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)
    print(f"✅ Loaded {len(institutions):,} institutions\n")

    enriched_count = 0
    matched_institutions = []
    for institution in institutions:
        name = institution.get("name", "")
        # Check if this institution matches any known names
        # (case-insensitive containment in either direction)
        for known_name, qid in KNOWN_INSTITUTIONS.items():
            if known_name.lower() in name.lower() or name.lower() in known_name.lower():
                # Skip if it already has a real Wikidata ID
                if has_wikidata_id(institution):
                    continue
                # Add Wikidata ID
                if add_wikidata_id(institution, qid):
                    enriched_count += 1
                    matched_institutions.append({
                        "name": name,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name}")
                    print(f"   → Wikidata: {qid} (matched '{known_name}')")
                    print()

    if enriched_count == 0:
        print("   No institutions enriched (all already have Wikidata IDs)")
        return

    # Create backup
    backup_file = input_file.with_suffix('.yaml.manual_enrichment_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    # Write updated dataset
    print(f"💾 Writing updated dataset: {input_file.name}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("\n" + "=" * 80)
    print("✨ ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print("\nMatched institutions:")
    for match in matched_institutions:
        print(f"  {match['name']} → {match['qid']}")


if __name__ == "__main__":
    main()