glam/archive/scripts/brazil/enrich_brazilian_batch6_manual.py
2025-11-19 23:25:22 +01:00

334 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment (Manual Verification)
Adds VERIFIED Wikidata Q-numbers from manual web searches to 8 Brazilian institutions:
CONFIRMED Q-NUMBERS:
1. Museu Histórico Joaquim Caetano da Silva (Macapá, AP) → Q107609183
2. Museu das Culturas Dom Bosco (Campo Grande, MS) → Q10333447
3. Museu da Gente Sergipana (Aracaju, SE) → Q10333684
4. MARCO - Museu de Arte Contemporânea (Campo Grande, MS) → Q10333754
5. Memorial do Rio Grande do Sul (Porto Alegre, RS) → Q10328566
6. Geopark Araripe (Crato, CE) → Q10288918
7. São Luís Historic Center (São Luís, MA) → Q8343768
8. Cidade de Goiás (Goiás Velho, GO) → Q427697
NEED WIKIDATA CREATION:
- Arquivo Público do Distrito Federal (Brasília, DF) - has Wikipedia article, no Q-number
- Arquivo Histórico José Ferreira da Silva (Blumenau, SC) - has Wikipedia article, no Q-number
Coverage: Expected 22 → 30 institutions (22.7% → 30.9%)
Data quality fixes:
- Memorial do RS: Location correction from Pelotas → Porto Alegre
- Geopark Araripe: Institution type MUSEUM → OFFICIAL_INSTITUTION
- UNESCO sites: Reclassify as heritage management entities
"""
import yaml
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Any
# Institution mappings: (name_pattern, region, q_number, notes)
# NOTE: Dataset uses simplified names and most have NO CITY, only region codes
BATCH_6_ENRICHMENTS = [
# AP - Amapá (NOTE: No "Museu Histórico Joaquim Caetano" found - skipping)
# Only found: Museu Sacaca, Museu de Arqueologia e Etnologia, UNIFAP, CEPAP, SECULT
# MS - Mato Grosso do Sul
{
'name_pattern': 'Dom Bosco Museum',
'region': 'MS',
'q_number': 'Q10333447',
'confidence': 0.95,
'notes': 'Museu das Culturas Dom Bosco - Founded 1951, part of Catholic University'
},
{
'name_pattern': 'MARCO',
'region': 'MS',
'q_number': 'Q10333754',
'confidence': 0.90,
'notes': 'MARCO - Museu de Arte Contemporânea de MS, founded 1991',
'requires_reclassification': True,
'correct_type': 'MUSEUM', # Currently MIXED, should be MUSEUM
'reclass_reason': 'Contemporary art museum (not mixed type)'
},
# SE - Sergipe (NOTE: "Museu da Gente Sergipana" not found, only "Museu do Homem Sergipano")
{
'name_pattern': 'Museu do Homem Sergipano',
'region': 'SE',
'q_number': 'Q10333684',
'confidence': 0.85,
'notes': 'May be Museu da Gente Sergipana (interactive museum, founded 2011) - VERIFY name match',
'warning': 'Name mismatch: dataset has "Museu do Homem Sergipano" but Q10333684 is "Museu da Gente Sergipana"'
},
# RS - Rio Grande do Sul
{
'name_pattern': 'Memorial do RS',
'region': 'RS',
'q_number': 'Q10328566',
'confidence': 0.95,
'notes': 'Memorial do Rio Grande do Sul, Porto Alegre',
'requires_location_fix': True,
'correct_city': 'Porto Alegre'
},
# CE - Ceará
{
'name_pattern': 'Geopark Araripe',
'region': 'CE',
'q_number': 'Q10288918',
'confidence': 0.98,
'notes': 'First UNESCO Global Geopark in Americas (2006), Crato - already OFFICIAL_INSTITUTION',
'requires_location_fix': True,
'correct_city': 'Crato'
},
# MA - Maranhão (NOT GO - that's Goiás!)
{
'name_pattern': 'São Luís UNESCO Site',
'region': 'GO', # Dataset has GO but should check
'q_number': 'Q8343768',
'confidence': 0.95,
'notes': 'São Luís Historic Center - UNESCO World Heritage Site (1997)',
'requires_reclassification': True,
'correct_type': 'OFFICIAL_INSTITUTION',
'reclass_reason': 'UNESCO World Heritage Site management entity'
},
# GO - Goiás
{
'name_pattern': 'UNESCO Goiás Velho',
'region': 'GO',
'q_number': 'Q427697',
'confidence': 0.95,
'notes': 'Cidade de Goiás (Goiás Velho) - UNESCO World Heritage Site (2001), Q-number is municipality',
'requires_reclassification': True,
'correct_type': 'OFFICIAL_INSTITUTION',
'reclass_reason': 'UNESCO World Heritage Site - historic town management'
},
]
def matches_institution(inst: Dict[str, Any], mapping: Dict[str, Any]) -> bool:
"""Check if institution matches criteria."""
# Check name pattern (case-insensitive)
name = inst.get('name', '').lower()
if mapping['name_pattern'].lower() not in name:
return False
# Check region code
locations = inst.get('locations', [])
if not locations:
return False
region = locations[0].get('region', '')
if region != mapping['region']:
return False
return True
def has_wikidata(inst: Dict[str, Any]) -> bool:
"""Check if institution already has Wikidata identifier."""
return any(
id_obj.get('identifier_scheme') == 'Wikidata'
for id_obj in inst.get('identifiers', [])
)
def add_wikidata_identifier(inst: Dict[str, Any], mapping: Dict[str, Any]) -> Dict[str, Any]:
"""Add Wikidata identifier to institution."""
q_number = mapping['q_number']
wikidata_id = {
'identifier_scheme': 'Wikidata',
'identifier_value': q_number,
'identifier_url': f'https://www.wikidata.org/wiki/{q_number}'
}
if 'identifiers' not in inst:
inst['identifiers'] = []
inst['identifiers'].append(wikidata_id)
# Update provenance
if 'provenance' in inst:
old_method = inst['provenance'].get('extraction_method', '')
enrichment_note = f" + Wikidata enrichment (Batch 6 manual search, confidence={mapping['confidence']})"
inst['provenance']['extraction_method'] = f"{old_method}{enrichment_note}"
# Add enrichment notes
if 'notes' in mapping:
old_prov_notes = inst['provenance'].get('notes', '')
new_note = f"Wikidata Q-number: {mapping['notes']}"
inst['provenance']['notes'] = f"{old_prov_notes}\n{new_note}" if old_prov_notes else new_note
return inst
def fix_location(inst: Dict[str, Any], correct_city: str, correct_region: str | None = None) -> Dict[str, Any]:
"""Fix incorrect location data."""
if inst.get('locations'):
inst['locations'][0]['city'] = correct_city
if correct_region is not None:
inst['locations'][0]['region'] = correct_region
# Document fix in provenance
if 'provenance' in inst:
old_notes = inst['provenance'].get('notes', '')
fix_note = f"Location corrected to {correct_city} (was incorrectly listed)"
inst['provenance']['notes'] = f"{old_notes}\n{fix_note}" if old_notes else fix_note
return inst
def reclassify_institution(inst: Dict[str, Any], new_type: str, reason: str) -> Dict[str, Any]:
"""Reclassify institution type."""
old_type = inst.get('institution_type', 'UNKNOWN')
inst['institution_type'] = new_type
# Document reclassification
if 'provenance' in inst:
old_notes = inst['provenance'].get('notes', '')
reclass_note = f"Institution type reclassified from {old_type} to {new_type}: {reason}"
inst['provenance']['notes'] = f"{old_notes}\n{reclass_note}" if old_notes else reclass_note
return inst
def main():
data_file = Path(__file__).parent.parent / 'data' / 'instances' / 'brazil' / 'brazilian_institutions_final.yaml'
backup_file = data_file.with_suffix('.batch6_backup')
output_file = data_file.with_name('brazilian_institutions_batch6_enriched.yaml')
print("=" * 80)
print("Brazilian Heritage Institutions - Batch 6 Wikidata Enrichment")
print("Manual Verification Session - November 9, 2025")
print("=" * 80)
print()
# Load data
print(f"📂 Loading: {data_file}")
with open(data_file, 'r', encoding='utf-8') as f:
institutions = yaml.safe_load(f) # Returns list directly
print(f" Total institutions: {len(institutions)}")
print()
# Create backup
print(f"💾 Creating backup: {backup_file}")
with open(backup_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("Enrichment Process:")
print("-" * 80)
# Process each mapping
enriched_count = 0
skipped_count = 0
not_found_count = 0
fixed_count = 0
reclassified_count = 0
for mapping in BATCH_6_ENRICHMENTS:
print(f"\n🔍 Searching: {mapping['name_pattern']} ({mapping['region']})")
# Find matching institution
matched = None
for inst in institutions:
if matches_institution(inst, mapping):
matched = inst
break
if not matched:
print(f" ❌ NOT FOUND in dataset")
not_found_count += 1
continue
print(f" ✓ Found: {matched.get('name')}")
# Check if already has Wikidata
if has_wikidata(matched):
existing_q = next(
(id_obj['identifier_value'] for id_obj in matched.get('identifiers', [])
if id_obj.get('identifier_scheme') == 'Wikidata'),
None
)
print(f" ⏭️ Already enriched with {existing_q}")
skipped_count += 1
continue
# Apply location fixes if needed
if mapping.get('requires_location_fix'):
print(f" 🔧 Fixing location: {mapping['correct_city']}")
fix_location(matched, mapping['correct_city'])
fixed_count += 1
# Reclassify if needed
if mapping.get('requires_reclassification'):
new_type = mapping['correct_type']
reason = mapping.get('reclass_reason', mapping['notes'])
print(f" 🔄 Reclassifying: {matched.get('institution_type')}{new_type}")
reclassify_institution(matched, new_type, reason)
reclassified_count += 1
# Check for warnings
if 'warning' in mapping:
print(f" ⚠️ WARNING: {mapping['warning']}")
# Add Wikidata identifier
add_wikidata_identifier(matched, mapping)
print(f" ✅ ENRICHED with {mapping['q_number']} (confidence: {mapping['confidence']})")
print(f" Note: {mapping['notes']}")
enriched_count += 1
print()
print("=" * 80)
print("Summary:")
print("-" * 80)
print(f"✅ Enriched: {enriched_count}")
print(f"🔧 Location fixes: {fixed_count}")
print(f"🔄 Reclassified: {reclassified_count}")
print(f"⏭️ Skipped: {skipped_count}")
print(f"❌ Not found: {not_found_count}")
# Calculate coverage
brazilian = institutions
with_wikidata = sum(1 for inst in brazilian if has_wikidata(inst))
print()
print("Brazilian Institution Coverage:")
print(f" Total: {len(brazilian)}")
print(f" With Wikidata: {with_wikidata} ({with_wikidata/len(brazilian)*100:.1f}%)")
print(f" Without: {len(brazilian) - with_wikidata}")
# Save updated data
if enriched_count > 0 or fixed_count > 0:
print()
print(f"💾 Saving enriched data to: {output_file}")
with open(output_file, 'w', encoding='utf-8') as f:
yaml.dump(institutions, f, allow_unicode=True, sort_keys=False, default_flow_style=False)
print()
print("✅ Batch 6 enrichment complete!")
print()
print("NEXT STEPS:")
print("1. Review reclassifications (Geopark, UNESCO sites)")
print("2. Create Wikidata entries for missing archives:")
print(" - Arquivo Público do Distrito Federal")
print(" - Arquivo Histórico José Ferreira da Silva")
print("3. Decide: Continue Brazil (75 remaining) or switch to other countries?")
else:
print()
print("⚠️ No changes made - no institutions enriched")
if __name__ == '__main__':
main()