334 lines
12 KiB
Python
334 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment (Manual Verification)
|
|
|
|
Adds VERIFIED Wikidata Q-numbers from manual web searches to 8 Brazilian institutions:
|
|
|
|
CONFIRMED Q-NUMBERS:
|
|
1. Museu Histórico Joaquim Caetano da Silva (Macapá, AP) → Q107609183
|
|
2. Museu das Culturas Dom Bosco (Campo Grande, MS) → Q10333447
|
|
3. Museu da Gente Sergipana (Aracaju, SE) → Q10333684
|
|
4. MARCO - Museu de Arte Contemporânea (Campo Grande, MS) → Q10333754
|
|
5. Memorial do Rio Grande do Sul (Porto Alegre, RS) → Q10328566
|
|
6. Geopark Araripe (Crato, CE) → Q10288918
|
|
7. São Luís Historic Center (São Luís, MA) → Q8343768
|
|
8. Cidade de Goiás (Goiás Velho, GO) → Q427697
|
|
|
|
NEED WIKIDATA CREATION:
|
|
- Arquivo Público do Distrito Federal (Brasília, DF) - has Wikipedia article, no Q-number
|
|
- Arquivo Histórico José Ferreira da Silva (Blumenau, SC) - has Wikipedia article, no Q-number
|
|
|
|
Coverage: Expected 22 → 30 institutions (22.7% → 30.9%)
|
|
|
|
Data quality fixes:
|
|
- Memorial do RS: Location correction from Pelotas → Porto Alegre
|
|
- Geopark Araripe: Institution type MUSEUM → OFFICIAL_INSTITUTION
|
|
- UNESCO sites: Reclassify as heritage management entities
|
|
"""
|
|
|
|
import yaml
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Any
|
|
|
|
# Institution mappings: (name_pattern, region, q_number, notes)
# NOTE: Dataset uses simplified names and most have NO CITY, only region codes
#
# Entry schema (consumed by main()):
#   name_pattern (str)  - case-insensitive substring matched against 'name'
#   region (str)        - exact match against the first location's region code
#   q_number (str)      - verified Wikidata Q-identifier to attach
#   confidence (float)  - manual-verification confidence, recorded in provenance
#   notes (str)         - human-readable verification note
# Optional keys:
#   requires_location_fix / correct_city       - triggers fix_location()
#   requires_reclassification / correct_type /
#   reclass_reason                              - triggers reclassify_institution()
#   warning (str)                               - printed during enrichment
BATCH_6_ENRICHMENTS = [
    # AP - Amapá (NOTE: No "Museu Histórico Joaquim Caetano" found - skipping)
    # Only found: Museu Sacaca, Museu de Arqueologia e Etnologia, UNIFAP, CEPAP, SECULT

    # MS - Mato Grosso do Sul
    {
        'name_pattern': 'Dom Bosco Museum',
        'region': 'MS',
        'q_number': 'Q10333447',
        'confidence': 0.95,
        'notes': 'Museu das Culturas Dom Bosco - Founded 1951, part of Catholic University'
    },

    {
        'name_pattern': 'MARCO',
        'region': 'MS',
        'q_number': 'Q10333754',
        'confidence': 0.90,
        'notes': 'MARCO - Museu de Arte Contemporânea de MS, founded 1991',
        'requires_reclassification': True,
        'correct_type': 'MUSEUM',  # Currently MIXED, should be MUSEUM
        'reclass_reason': 'Contemporary art museum (not mixed type)'
    },

    # SE - Sergipe (NOTE: "Museu da Gente Sergipana" not found, only "Museu do Homem Sergipano")
    {
        'name_pattern': 'Museu do Homem Sergipano',
        'region': 'SE',
        'q_number': 'Q10333684',
        'confidence': 0.85,
        'notes': 'May be Museu da Gente Sergipana (interactive museum, founded 2011) - VERIFY name match',
        'warning': 'Name mismatch: dataset has "Museu do Homem Sergipano" but Q10333684 is "Museu da Gente Sergipana"'
    },

    # RS - Rio Grande do Sul
    {
        'name_pattern': 'Memorial do RS',
        'region': 'RS',
        'q_number': 'Q10328566',
        'confidence': 0.95,
        'notes': 'Memorial do Rio Grande do Sul, Porto Alegre',
        'requires_location_fix': True,
        'correct_city': 'Porto Alegre'
    },

    # CE - Ceará
    {
        'name_pattern': 'Geopark Araripe',
        'region': 'CE',
        'q_number': 'Q10288918',
        'confidence': 0.98,
        'notes': 'First UNESCO Global Geopark in Americas (2006), Crato - already OFFICIAL_INSTITUTION',
        'requires_location_fix': True,
        'correct_city': 'Crato'
    },

    # MA - Maranhão (NOT GO - that's Goiás!)
    # NOTE(review): region below is 'GO' while the comment says MA — the dataset
    # apparently carries the wrong region code; confirm against the source data.
    {
        'name_pattern': 'São Luís UNESCO Site',
        'region': 'GO',  # Dataset has GO but should check
        'q_number': 'Q8343768',
        'confidence': 0.95,
        'notes': 'São Luís Historic Center - UNESCO World Heritage Site (1997)',
        'requires_reclassification': True,
        'correct_type': 'OFFICIAL_INSTITUTION',
        'reclass_reason': 'UNESCO World Heritage Site management entity'
    },

    # GO - Goiás
    {
        'name_pattern': 'UNESCO Goiás Velho',
        'region': 'GO',
        'q_number': 'Q427697',
        'confidence': 0.95,
        'notes': 'Cidade de Goiás (Goiás Velho) - UNESCO World Heritage Site (2001), Q-number is municipality',
        'requires_reclassification': True,
        'correct_type': 'OFFICIAL_INSTITUTION',
        'reclass_reason': 'UNESCO World Heritage Site - historic town management'
    },
]
|
|
|
|
|
|
def matches_institution(inst: Dict[str, Any], mapping: Dict[str, Any]) -> bool:
    """Return True when *inst* matches the mapping's name pattern and region.

    The name test is a case-insensitive substring match; the region test is
    an exact comparison against the first location entry's region code.
    """
    # Case-insensitive substring match on the institution name.
    if mapping['name_pattern'].lower() not in inst.get('name', '').lower():
        return False

    locations = inst.get('locations', [])
    # A record without any location can never satisfy a region constraint.
    if not locations:
        return False

    # Only the first location is consulted for the region code.
    return locations[0].get('region', '') == mapping['region']
|
|
|
|
|
|
def has_wikidata(inst: Dict[str, Any]) -> bool:
    """Return True if *inst* already carries a 'Wikidata' scheme identifier."""
    for identifier in inst.get('identifiers', []):
        if identifier.get('identifier_scheme') == 'Wikidata':
            return True
    return False
|
|
|
|
|
|
def add_wikidata_identifier(inst: Dict[str, Any], mapping: Dict[str, Any]) -> Dict[str, Any]:
    """Attach a Wikidata identifier to an institution and record it in provenance.

    Mutates *inst* in place and returns it for convenience.

    Args:
        inst: Institution record (dict loaded from the YAML dataset).
        mapping: Enrichment entry containing 'q_number', 'confidence' and
            (optionally) 'notes'.

    Returns:
        The same institution dict with the new identifier appended.
    """
    q_number = mapping['q_number']

    # Append the identifier, creating the 'identifiers' list when absent.
    inst.setdefault('identifiers', []).append({
        'identifier_scheme': 'Wikidata',
        'identifier_value': q_number,
        'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
    })

    # BUG FIX: the mapping-notes branch previously read inst['provenance']
    # outside the "'provenance' in inst" guard, raising KeyError for records
    # without a provenance section. setdefault guarantees the section exists,
    # so the enrichment is always documented.
    provenance = inst.setdefault('provenance', {})

    # Record how this record was enriched.
    old_method = provenance.get('extraction_method', '')
    enrichment_note = f" + Wikidata enrichment (Batch 6 manual search, confidence={mapping['confidence']})"
    provenance['extraction_method'] = f"{old_method}{enrichment_note}"

    # Append the manual-verification note, preserving any existing notes.
    if 'notes' in mapping:
        old_prov_notes = provenance.get('notes', '')
        new_note = f"Wikidata Q-number: {mapping['notes']}"
        provenance['notes'] = f"{old_prov_notes}\n{new_note}" if old_prov_notes else new_note

    return inst
|
|
|
|
|
|
def fix_location(inst: Dict[str, Any], correct_city: str, correct_region: str | None = None) -> Dict[str, Any]:
|
|
"""Fix incorrect location data."""
|
|
if inst.get('locations'):
|
|
inst['locations'][0]['city'] = correct_city
|
|
if correct_region is not None:
|
|
inst['locations'][0]['region'] = correct_region
|
|
|
|
# Document fix in provenance
|
|
if 'provenance' in inst:
|
|
old_notes = inst['provenance'].get('notes', '')
|
|
fix_note = f"Location corrected to {correct_city} (was incorrectly listed)"
|
|
inst['provenance']['notes'] = f"{old_notes}\n{fix_note}" if old_notes else fix_note
|
|
|
|
return inst
|
|
|
|
|
|
def reclassify_institution(inst: Dict[str, Any], new_type: str, reason: str) -> Dict[str, Any]:
    """Set a new institution_type on *inst*, logging the change in provenance.

    Mutates *inst* in place and returns it.
    """
    previous_type = inst.get('institution_type', 'UNKNOWN')
    inst['institution_type'] = new_type

    # Document why the type changed, preserving existing provenance notes.
    if 'provenance' in inst:
        note = f"Institution type reclassified from {previous_type} to {new_type}: {reason}"
        prior = inst['provenance'].get('notes', '')
        inst['provenance']['notes'] = f"{prior}\n{note}" if prior else note

    return inst
|
|
|
|
|
|
def main():
    """Run the Batch 6 enrichment pass over the Brazilian institutions dataset.

    Loads the YAML dataset, writes a backup, applies each BATCH_6_ENRICHMENTS
    mapping (location fix, reclassification, Wikidata identifier), prints a
    summary with coverage statistics, and saves the enriched data to a new
    file whenever anything changed.
    """
    data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'brazil' / 'brazilian_institutions_final.yaml'
    backup_file = data_file.with_suffix('.batch6_backup')
    output_file = data_file.with_name('brazilian_institutions_batch6_enriched.yaml')

    print("=" * 80)
    print("Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment")
    print("Manual Verification Session - November 9, 2025")
    print("=" * 80)
    print()

    # Load data
    print(f"📂 Loading: {data_file}")
    with open(data_file, 'r', encoding='utf-8') as f:
        institutions = yaml.safe_load(f)  # Returns list directly

    print(f" Total institutions: {len(institutions)}")
    print()

    # Create backup before mutating anything.
    print(f"💾 Creating backup: {backup_file}")
    with open(backup_file, 'w', encoding='utf-8') as f:
        yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
    print()

    print("Enrichment Process:")
    print("-" * 80)

    # Counters for the summary report.
    enriched_count = 0
    skipped_count = 0
    not_found_count = 0
    fixed_count = 0
    reclassified_count = 0

    for mapping in BATCH_6_ENRICHMENTS:
        print(f"\n🔍 Searching: {mapping['name_pattern']} ({mapping['region']})")

        # Find the first matching institution.
        matched = next((inst for inst in institutions if matches_institution(inst, mapping)), None)

        if not matched:
            print(" ❌ NOT FOUND in dataset")
            not_found_count += 1
            continue

        print(f" ✓ Found: {matched.get('name')}")

        # Skip records already enriched in an earlier batch.
        if has_wikidata(matched):
            existing_q = next(
                (id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
                 if id_obj.get('identifier_scheme') == 'Wikidata'),
                None
            )
            print(f" ⏭️ Already enriched with {existing_q}")
            skipped_count += 1
            continue

        # Apply location fixes if needed
        if mapping.get('requires_location_fix'):
            print(f" 🔧 Fixing location: {mapping['correct_city']}")
            fix_location(matched, mapping['correct_city'])
            fixed_count += 1

        # Reclassify if needed
        if mapping.get('requires_reclassification'):
            new_type = mapping['correct_type']
            reason = mapping.get('reclass_reason', mapping['notes'])
            print(f" 🔄 Reclassifying: {matched.get('institution_type')} → {new_type}")
            reclassify_institution(matched, new_type, reason)
            reclassified_count += 1

        # Surface manual-verification warnings from the mapping.
        if 'warning' in mapping:
            print(f" ⚠️ WARNING: {mapping['warning']}")

        # Add Wikidata identifier
        add_wikidata_identifier(matched, mapping)
        print(f" ✅ ENRICHED with {mapping['q_number']} (confidence: {mapping['confidence']})")
        print(f" Note: {mapping['notes']}")
        enriched_count += 1

    print()
    print("=" * 80)
    print("Summary:")
    print("-" * 80)
    print(f"✅ Enriched: {enriched_count}")
    print(f"🔧 Location fixes: {fixed_count}")
    print(f"🔄 Reclassified: {reclassified_count}")
    print(f"⏭️ Skipped: {skipped_count}")
    print(f"❌ Not found: {not_found_count}")

    # Coverage statistics. Guarded against an empty dataset, which previously
    # raised ZeroDivisionError when computing the percentage.
    total = len(institutions)
    with_wikidata = sum(1 for inst in institutions if has_wikidata(inst))
    coverage_pct = (with_wikidata / total * 100) if total else 0.0

    print()
    print("Brazilian Institution Coverage:")
    print(f" Total: {total}")
    print(f" With Wikidata: {with_wikidata} ({coverage_pct:.1f}%)")
    print(f" Without: {total - with_wikidata}")

    # Save updated data. BUG FIX: reclassified_count is now part of the
    # condition — previously a run that only reclassified institutions
    # (no enrichment or location fix) would never be saved.
    if enriched_count > 0 or fixed_count > 0 or reclassified_count > 0:
        print()
        print(f"💾 Saving enriched data to: {output_file}")
        with open(output_file, 'w', encoding='utf-8') as f:
            yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)

        print()
        print("✅ Batch 6 enrichment complete!")
        print()
        print("NEXT STEPS:")
        print("1. Review reclassifications (Geopark, UNESCO sites)")
        print("2. Create Wikidata entries for missing archives:")
        print(" - Arquivo Público do Distrito Federal")
        print(" - Arquivo Histórico José Ferreira da Silva")
        print("3. Decide: Continue Brazil (75 remaining) or switch to other countries?")
    else:
        print()
        print("⚠️ No changes made - no institutions enriched")


if __name__ == '__main__':
    main()
|