glam/scripts/add_missing_locations.py
kempersc e5a532a8bc Add comprehensive tests for NLP institution extraction and RDF partnership integration
- Introduced `test_nlp_extractor.py` with unit tests for the InstitutionExtractor, covering various extraction patterns (ISIL, Wikidata, VIAF, city names) and ensuring proper classification of institutions (museum, library, archive).
- Added tests for extracted entities and result handling to validate the extraction process.
- Created `test_partnership_rdf_integration.py` to validate the end-to-end process of extracting partnerships from a conversation and exporting them to RDF format.
- Implemented tests for temporal properties in partnerships and ensured compliance with W3C Organization Ontology patterns.
- Verified that extracted partnerships are correctly linked with PROV-O provenance metadata.
2025-11-19 23:20:47 +01:00

238 lines
6.5 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Add locations to institutions that are missing them.
For Mexican national institutions: Mexico City
For international resources: Their actual headquarters
"""
import yaml
from pathlib import Path
# Location lookup table, keyed by the exact institution ``name`` string.
#
# The table below is written as (name, (city, region, country)) rows and then
# expanded into one *fresh* dict per institution, so no two entries share the
# same dict object. Insertion order and the city/region/country key order are
# significant for anything that iterates or serialises these records.
_LOCATION_FIELDS = ("city", "region", "country")

_MEXICO_CITY = ("Mexico City", "Ciudad de México", "MX")
_DUBLIN_OHIO = ("Dublin", "Ohio", "US")
_AUSTIN_TEXAS = ("Austin", "Texas", "US")

_INSTITUTION_PLACES = (
    # Mexican institutions - Mexico City unless stated otherwise
    ("Instituto Nacional de Antropología e Historia", _MEXICO_CITY),
    ("Sistema de Información Cultural (SIC)", _MEXICO_CITY),
    ("Mexicana - Repositorio del Patrimonio Cultural", _MEXICO_CITY),
    ("Archivo General de la Nación", _MEXICO_CITY),
    ("Memórica México Platform", _MEXICO_CITY),
    ("Biblioteca Nacional de México", _MEXICO_CITY),
    ("Hemeroteca Nacional Digital de México", _MEXICO_CITY),
    ("Museo Nacional de Arte (MUNAL)", _MEXICO_CITY),
    ("Museo Universitario Arte Contemporáneo (MUAC)", _MEXICO_CITY),
    ("Museo Nacional de Antropología", _MEXICO_CITY),
    ("Museo del Templo Mayor", _MEXICO_CITY),
    ("Sistema Bibliotecario UNAM (SIBIUNAM)", _MEXICO_CITY),
    ("Instituto Politécnico Nacional (IPN)", _MEXICO_CITY),
    ("El Colegio de México (COLMEX)", _MEXICO_CITY),
    ("Centro de Investigación y Docencia Económicas (CIDE)", _MEXICO_CITY),
    # The one state-level entry in this group: located in Mérida, not Mexico City
    ("Biblioteca Virtual de Yucatán", ("Mérida", "Yucatán", "MX")),
    ("Mexico City Resources", _MEXICO_CITY),
    ("Archivo Histórico Diplomático (SRE)", _MEXICO_CITY),
    ("Mapoteca Manuel Orozco y Berra", _MEXICO_CITY),
    ("Archivo Histórico de la UNAM (AHUNAM)", _MEXICO_CITY),
    ("Centro de Estudios de Historia de México Carso", _MEXICO_CITY),
    ("Memórica México", _MEXICO_CITY),
    ("Mexicana Repository", _MEXICO_CITY),
    ("Red de Humanidades Digitales (RedHD)", _MEXICO_CITY),
    ("Códices de México", _MEXICO_CITY),
    ("Fonoteca Nacional", _MEXICO_CITY),
    # International resources - actual headquarters
    ("WorldCat.org", _DUBLIN_OHIO),
    ("WorldCat Registry", _DUBLIN_OHIO),
    ("HathiTrust Digital Library", ("Ann Arbor", "Michigan", "US")),
    ("Internet Archive", ("San Francisco", "California", "US")),
    ("Nettie Lee Benson Collection (UT Austin)", _AUSTIN_TEXAS),
    ("Library of Congress Hispanic Reading Room", ("Washington", "District of Columbia", "US")),
    ("CLACSO Virtual Libraries", ("Buenos Aires", "Buenos Aires", "AR")),
    ("Latin American Network Information Center (LANIC)", _AUSTIN_TEXAS),
)

LOCATION_MAP = {
    institution: dict(zip(_LOCATION_FIELDS, place))
    for institution, place in _INSTITUTION_PLACES
}
def main():
    """Fill in missing ``locations`` for known institutions and rewrite the YAML.

    Loads ``data/instances/latin_american_institutions_AUTHORITATIVE.yaml``
    (resolved from the directory above the one containing this script). For
    every record whose ``name`` is a key of ``LOCATION_MAP`` and whose
    ``locations`` value is missing or empty, ``locations`` is set to a
    one-element list holding a copy of the mapped location. The file is then
    rewritten with a fixed comment header followed by the dumped records.

    Raises:
        ValueError: if the YAML document is not a list (e.g. the file is
            empty or holds a mapping at the top level).
    """
    project_root = Path(__file__).parent.parent
    input_file = project_root / "data" / "instances" / "latin_american_institutions_AUTHORITATIVE.yaml"

    print(f"Loading institutions from: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)

    # An empty file loads as None and a mapping would iterate as bare keys;
    # fail with a clear message instead of an opaque TypeError/AttributeError.
    if not isinstance(institutions, list):
        raise ValueError(
            f"Expected a YAML list of institutions in {input_file}, "
            f"got {type(institutions).__name__}"
        )

    print(f"Found {len(institutions)} institutions")

    # Add locations
    updated_count = 0
    for inst in institutions:
        # Entries that are not mappings are left untouched and written back as-is.
        if not isinstance(inst, dict):
            continue
        name = inst.get('name')
        # Non-string names (missing, or an unhashable YAML value) cannot match.
        if not isinstance(name, str) or name not in LOCATION_MAP:
            continue
        if inst.get('locations'):
            continue

        # Copy the record: inserting the shared LOCATION_MAP dict itself would
        # make PyYAML emit anchors/aliases (&id001 / *id001) whenever the same
        # name occurs more than once in the data.
        location = dict(LOCATION_MAP[name])
        inst['locations'] = [location]
        updated_count += 1
        print(f"✓ Added location for: {name}")
        print(f"{location['city']}, {location['country']}")
        print()

    print(f"Updated {updated_count} institutions with locations")

    # Keep the header
    header = (
        "---\n"
        "# Latin American GLAM Institutions - GHCID Enhanced\n"
        "# Last updated with locations for national/international institutions\n"
        "# Ready for complete GHCID generation\n"
    )

    # Write back via a sibling temp file and rename it over the original, so a
    # failure while dumping never leaves the authoritative file truncated.
    print(f"Writing updated YAML to: {input_file}")
    tmp_file = input_file.with_name(input_file.name + ".tmp")
    try:
        with open(tmp_file, 'w', encoding='utf-8') as f:
            f.write(header)
            yaml.dump(
                institutions,
                f,
                default_flow_style=False,
                allow_unicode=True,
                sort_keys=False,
                width=100,
            )
        tmp_file.replace(input_file)
    except BaseException:
        # Do not leave a half-written temp file behind; the original is intact.
        if tmp_file.exists():
            tmp_file.unlink()
        raise

    print("✅ Done!")
# Script entry point: perform the update only when this file is executed
# directly, so importing the module has no side effects.
if __name__ == "__main__":
    main()