- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive). - Added tests for extracted entities and result handling to validate the extraction process. - Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format. - Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns. - Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
148 lines
5.1 KiB
Python
Executable file
148 lines
5.1 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Manually enrich well-known institutions with verified Wikidata QIDs.

This script handles high-profile institutions that should be in Wikidata
but may not match via fuzzy name matching due to naming variations.
"""

import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml
|
||
|
||
# Verified Wikidata QIDs for well-known institutions.
# Keys are matched case-insensitively as substrings against dataset names,
# so several aliases may map to the same QID.
KNOWN_INSTITUTIONS = {
    # US Institutions
    "Internet Archive": "Q461",
    "HathiTrust": "Q5683317",
    "HathiTrust Digital Library": "Q5683317",
    "Library of Congress": "Q131454",
    "OCLC": "Q190927",
    "WorldCat": "Q76630151",

    # Belgian/EU Institutions
    "European Parliament": "Q8889",
    "European Commission": "Q8880",
    "Council of the European Union": "Q8896",

    # Brazilian Institutions
    "Museu Nacional": "Q1850416",  # National Museum Rio de Janeiro
    "MASP": "Q82941",  # São Paulo Museum of Art
    "Pinacoteca": "Q2095209",  # Pinacoteca do Estado de São Paulo
    "Pinacoteca do Estado": "Q2095209",
    "Pinacoteca de São Paulo": "Q2095209",

    # Add more as needed
}
|
||
|
||
def has_wikidata_id(institution: dict) -> bool:
    """Check if institution already has a real Wikidata ID.

    A "real" ID is a Wikidata-scheme identifier of the form ``Q<digits>``
    whose numeric part is below 100,000,000 (heuristic cutoff — values at or
    above that range are presumably synthetic placeholders; note the
    ``"Q999999999"`` default used below. TODO confirm against the pipeline
    that generates these records).

    Args:
        institution: Institution record with an optional ``identifiers`` list
            of dicts carrying ``identifier_scheme`` / ``identifier_value``.

    Returns:
        True if a well-formed, verified-range Wikidata QID is present.
    """
    for id_obj in institution.get("identifiers", []):
        if id_obj.get("identifier_scheme") != "Wikidata":
            continue
        value = id_obj.get("identifier_value", "")
        # BUG FIX: the original called int(value[1:]) whenever the value
        # started with "Q", so malformed values like "Q" or "Qabc" raised
        # ValueError. Require an all-digit tail before converting.
        if value.startswith("Q") and value[1:].isdigit() and int(value[1:]) < 100000000:
            return True
    return False
|
||
|
||
def add_wikidata_id(institution: dict, qid: str) -> bool:
    """Attach a Wikidata identifier, with provenance, to an institution.

    The record is mutated in place. Nothing is added when any
    Wikidata-scheme identifier is already present.

    Args:
        institution: Institution record (mutated in place).
        qid: Wikidata QID such as ``"Q461"``.

    Returns:
        True if the identifier was added, False if one already existed.
    """
    identifiers = institution.setdefault("identifiers", [])

    # Already carries a Wikidata identifier of any kind — leave untouched.
    if any(entry.get("identifier_scheme") == "Wikidata" for entry in identifiers):
        return False

    identifiers.append({
        "identifier_scheme": "Wikidata",
        "identifier_value": qid,
        "identifier_url": f"https://www.wikidata.org/wiki/{qid}"
    })

    # Record when and how this enrichment happened.
    history = institution.setdefault("provenance", {}).setdefault("enrichment_history", [])
    history.append({
        "enrichment_date": datetime.now(timezone.utc).isoformat(),
        "enrichment_method": "Manual Wikidata QID assignment (verified)",
        "data_source": "Wikidata",
        "confidence_score": 1.0
    })

    return True
|
||
|
||
def main():
    """Enrich the global institutions dataset with verified Wikidata QIDs.

    Loads the YAML dataset, matches institution names against
    KNOWN_INSTITUTIONS (case-insensitive substring match in either
    direction), adds missing Wikidata identifiers, backs up the ORIGINAL
    file, then writes the updated dataset back in place.

    Exits with status 1 if the input file is missing.
    """
    import shutil  # local import: only needed for the backup copy

    input_file = Path("data/instances/global/global_heritage_institutions_wikidata_enriched.yaml")

    if not input_file.exists():
        print(f"❌ Input file not found: {input_file}")
        sys.exit(1)

    print("=" * 80)
    print("🔧 MANUAL WIKIDATA ENRICHMENT FOR KNOWN INSTITUTIONS")
    print("=" * 80)
    print(f"\n📖 Loading dataset from: {input_file.name}")

    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    print(f"✅ Loaded {len(institutions):,} institutions\n")

    enriched_count = 0
    matched_institutions = []

    for institution in institutions:
        name = institution.get("name", "")

        # Check if this institution matches any known names
        for known_name, qid in KNOWN_INSTITUTIONS.items():
            if known_name.lower() in name.lower() or name.lower() in known_name.lower():
                # Skip institutions that already have a verified Wikidata ID
                if has_wikidata_id(institution):
                    continue

                if add_wikidata_id(institution, qid):
                    enriched_count += 1
                    matched_institutions.append({
                        "name": name,
                        "qid": qid,
                        "matched_pattern": known_name
                    })
                    print(f"✅ Enriched: {name}")
                    print(f" → Wikidata: {qid} (matched '{known_name}')")
                    print()
                    # One QID per institution — no need to test further patterns
                    break

    if enriched_count == 0:
        print("ℹ️ No institutions enriched (all already have Wikidata IDs)")
        return

    # BUG FIX: the backup must preserve the ORIGINAL file contents. The
    # original code dumped the already-mutated `institutions` list to the
    # backup, making backup and updated file identical and the backup
    # useless. Copy the untouched source file before overwriting it.
    backup_file = input_file.with_suffix('.yaml.manual_enrichment_backup')
    print(f"\n💾 Creating backup: {backup_file.name}")
    shutil.copyfile(input_file, backup_file)

    # Write updated dataset
    print(f"💾 Writing updated dataset: {input_file.name}")
    with open(input_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

    print("\n" + "=" * 80)
    print(f"✨ ENRICHMENT COMPLETE")
    print("=" * 80)
    print(f"Institutions enriched: {enriched_count}")
    print(f"\nMatched institutions:")
    for match in matched_institutions:
        print(f" • {match['name']} → {match['qid']}")


if __name__ == "__main__":
    main()
|