"""
Generate Manual Review Report for Wikidata Fuzzy Matches

Analyzes denmark_complete_enriched.json to extract all fuzzy matches
(85-99% confidence) and creates a prioritized CSV report for manual
validation.
"""

import csv
import json
import re
from pathlib import Path
from typing import Dict, List, Optional

# Default locations used when main() is called without arguments.
DEFAULT_INPUT_PATH = Path('data/instances/denmark_complete_enriched.json')
DEFAULT_OUTPUT_CSV = Path('data/review/denmark_wikidata_fuzzy_matches.csv')

# A match is "fuzzy" when FUZZY_MIN_SCORE <= score < FUZZY_MAX_SCORE.
FUZZY_MIN_SCORE = 85
FUZZY_MAX_SCORE = 100

# Patterns for identifiers/locations stored as the string form of a dict.
_SCHEME_RE = re.compile(r"'identifier_scheme':\s*'([^']+)'")
_VALUE_RE = re.compile(r"'identifier_value':\s*'([^']+)'")
_URL_RE = re.compile(r"'identifier_url':\s*'([^']+)'")
_CITY_RE = re.compile(r"'city':\s*'([^']*)'")

# Column order of the review CSV.
_CSV_FIELDNAMES = [
    'priority',
    'match_score',
    'institution_name',
    'wikidata_label',
    'city',
    'institution_type',
    'isil_code',
    'ghcid',
    'wikidata_qid',
    'wikidata_url',
    'validation_status',
    'validation_notes',
    'institution_id',
]

# (inclusive lower bound, exclusive upper bound, label) for the statistics.
_SCORE_RANGES = (
    (85, 87, '85-87'),
    (87, 90, '87-90'),
    (90, 93, '90-93'),
    (93, 96, '93-96'),
    (96, 100, '96-99'),
)


def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier from the string representation of a dict.

    Args:
        identifier_str: Text containing ``'identifier_scheme': '...'`` and
            ``'identifier_value': '...'`` pairs (and optionally
            ``'identifier_url': '...'``).

    Returns:
        A dict with keys ``scheme``, ``value`` and ``url`` (``url`` is None
        when absent), or None when the input is empty, not a string, or
        lacks a scheme or a value.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    scheme_match = _SCHEME_RE.search(identifier_str)
    value_match = _VALUE_RE.search(identifier_str)
    if not (scheme_match and value_match):
        return None

    url_match = _URL_RE.search(identifier_str)
    return {
        'scheme': scheme_match.group(1),
        'value': value_match.group(1),
        'url': url_match.group(1) if url_match else None,
    }


def _normalize_identifier(identifier_data: object) -> Optional[Dict]:
    """Return an identifier as a ``scheme``/``value``/``url`` dict, or None.

    Strings are parsed with parse_identifier_string(). Dicts may use either
    the short keys (``scheme``) or the raw keys (``identifier_scheme``) that
    the string form uses; both spellings are accepted so that native dict
    identifiers are not silently ignored.
    """
    if isinstance(identifier_data, str):
        return parse_identifier_string(identifier_data)
    if isinstance(identifier_data, dict):
        return {
            key: identifier_data.get(key) or identifier_data.get(f'identifier_{key}')
            for key in ('scheme', 'value', 'url')
        }
    return None


def _find_identifier_value(identifiers: Optional[List], scheme: str) -> str:
    """Return the value of the first identifier with *scheme*, or ''."""
    for identifier_data in identifiers or []:
        identifier = _normalize_identifier(identifier_data)
        if identifier and identifier.get('scheme') == scheme:
            # First hit wins, even if its value is empty.
            return identifier.get('value') or ''
    return ''


def _first_city(locations: Optional[List]) -> str:
    """Return the city of the first location entry, or '' if unavailable."""
    if not locations:
        return ''

    first_loc = locations[0]
    if isinstance(first_loc, str):
        city_match = _CITY_RE.search(first_loc)
        return city_match.group(1) if city_match else ''
    if isinstance(first_loc, dict):
        return first_loc.get('city') or ''
    return ''


def _is_fuzzy_score(match_score) -> bool:
    """Return True for a truthy score in [FUZZY_MIN_SCORE, FUZZY_MAX_SCORE)."""
    return bool(match_score) and FUZZY_MIN_SCORE <= match_score < FUZZY_MAX_SCORE


def _priority_for_score(score) -> int:
    """Map a match score to a review priority (1=review first, 5=last)."""
    if score < 87:
        return 1  # Very uncertain
    if score < 90:
        return 2  # Uncertain
    if score < 93:
        return 3  # Moderate
    if score < 96:
        return 4  # Fairly confident
    return 5  # Mostly confident


def extract_fuzzy_matches(institutions: List[Dict]) -> List[Dict]:
    """Extract institutions with fuzzy Wikidata matches (85-99% confidence).

    One review record is produced for every entry in an institution's
    ``enrichment_history`` whose ``match_score`` is fuzzy. Missing or null
    ``enrichment_history``, ``identifiers`` and ``locations`` are treated
    as empty.

    Args:
        institutions: Institution records as loaded from the enriched JSON.

    Returns:
        Review records sorted by ``match_score`` ascending (most uncertain
        first). ``validation_status`` and ``validation_notes`` are left
        blank for the reviewer to fill in.
    """
    fuzzy_matches = []

    for inst in institutions:
        fuzzy_enrichments = [
            enrichment
            for enrichment in inst.get('enrichment_history') or []
            if _is_fuzzy_score(enrichment.get('match_score'))
        ]
        if not fuzzy_enrichments:
            continue

        # Institution-level fields do not depend on the enrichment entry,
        # so compute them once per institution.
        identifiers = inst.get('identifiers')
        wikidata_qid = _find_identifier_value(identifiers, 'Wikidata')
        isil_code = _find_identifier_value(identifiers, 'ISIL')
        city = _first_city(inst.get('locations'))
        wikidata_url = (
            f"https://www.wikidata.org/wiki/{wikidata_qid}" if wikidata_qid else ''
        )

        for enrichment in fuzzy_enrichments:
            fuzzy_matches.append({
                'institution_name': inst.get('name', ''),
                'institution_type': inst.get('institution_type', ''),
                'city': city,
                'isil_code': isil_code,
                'ghcid': inst.get('ghcid', ''),
                'wikidata_qid': wikidata_qid,
                'wikidata_label': enrichment.get('matched_label', ''),
                'match_score': enrichment.get('match_score'),
                'wikidata_url': wikidata_url,
                'institution_id': inst.get('id', ''),
                'validation_status': '',  # For manual review
                'validation_notes': '',  # For manual review
            })

    # Sort by match score (lowest first = most uncertain)
    fuzzy_matches.sort(key=lambda record: record['match_score'])
    return fuzzy_matches


def generate_csv_report(fuzzy_matches: List[Dict], output_path: Path) -> None:
    """Write the review records to a UTF-8 CSV file for manual review.

    Args:
        fuzzy_matches: Records as returned by extract_fuzzy_matches().
        output_path: Destination file; it is overwritten if it exists.

    Raises:
        KeyError: If a record lacks one of the report columns.
    """
    record_fields = [name for name in _CSV_FIELDNAMES if name != 'priority']

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDNAMES)
        writer.writeheader()

        for match in fuzzy_matches:
            row = {name: match[name] for name in record_fields}
            row['priority'] = _priority_for_score(match['match_score'])
            writer.writerow(row)


def generate_statistics(fuzzy_matches: List[Dict]) -> Dict:
    """Calculate statistics for fuzzy matches.

    Args:
        fuzzy_matches: Records as returned by extract_fuzzy_matches().

    Returns:
        A dict with ``total`` (record count), ``by_priority`` (priority ->
        count), ``by_type`` (institution type -> count) and
        ``by_score_range`` (range label -> count; every label is present,
        scores outside 85-99 are counted in no range).
    """
    stats = {
        'total': len(fuzzy_matches),
        'by_priority': {},
        'by_type': {},
        'by_score_range': {label: 0 for _, _, label in _SCORE_RANGES},
    }
    by_priority = stats['by_priority']
    by_type = stats['by_type']
    by_score_range = stats['by_score_range']

    for match in fuzzy_matches:
        score = match['match_score']

        priority = _priority_for_score(score)
        by_priority[priority] = by_priority.get(priority, 0) + 1

        inst_type = match['institution_type']
        by_type[inst_type] = by_type.get(inst_type, 0) + 1

        for lower, upper, label in _SCORE_RANGES:
            if lower <= score < upper:
                by_score_range[label] += 1
                break

    return stats


def main(
    input_path: Path = DEFAULT_INPUT_PATH,
    output_csv: Path = DEFAULT_OUTPUT_CSV,
) -> None:
    """Load the enriched dataset, write the review CSV and print a summary.

    Args:
        input_path: JSON file holding the list of enriched institutions.
        output_csv: Destination of the review CSV; parent directories are
            created as needed.
    """
    input_path = Path(input_path)
    output_csv = Path(output_csv)

    print("=" * 70)
    print("Wikidata Fuzzy Match Review Report Generator")
    print("=" * 70)

    # Load enriched dataset
    print(f"\nLoading enriched dataset: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" ✅ Loaded {len(institutions)} institutions")

    # Extract fuzzy matches
    print("\nExtracting fuzzy matches (85-99% confidence)...")
    fuzzy_matches = extract_fuzzy_matches(institutions)
    print(f" ✅ Found {len(fuzzy_matches)} fuzzy matches")

    # Generate statistics
    stats = generate_statistics(fuzzy_matches)

    # Generate CSV report
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating CSV report: {output_csv}")
    generate_csv_report(fuzzy_matches, output_csv)
    print(f" ✅ CSV report generated ({len(fuzzy_matches)} rows)")

    # Print statistics
    print("\n" + "=" * 70)
    print("Fuzzy Match Statistics")
    print("=" * 70)
    print(f"\nTotal fuzzy matches: {stats['total']}")

    print("\nBy Priority (1=most uncertain, 5=fairly confident):")
    for priority in sorted(stats['by_priority'].keys()):
        count = stats['by_priority'][priority]
        print(f" Priority {priority}: {count:3d} matches")

    print("\nBy Match Score Range:")
    for score_range, count in stats['by_score_range'].items():
        if count > 0:
            print(f" {score_range}%: {count:3d} matches")

    print("\nBy Institution Type:")
    for inst_type, count in sorted(stats['by_type'].items()):
        print(f" {inst_type}: {count:3d} matches")

    print("\n" + "=" * 70)
    print("Next Steps for Manual Review")
    print("=" * 70)
    print(f"""
1. Open: {output_csv}
2. Start with Priority 1 (most uncertain) matches
3. For each row:
   a. Check institution_name vs wikidata_label
   b. Visit wikidata_url to verify match
   c. Check city, institution_type, ISIL code
   d. Set validation_status: CORRECT | INCORRECT | UNCERTAIN
   e. Add validation_notes if needed
4. Run update script to apply validated changes
5. Re-export RDF with corrected Wikidata links

CSV columns:
- priority: 1 (review first) to 5 (review last)
- match_score: Fuzzy match confidence (85-99%)
- institution_name: Our dataset name
- wikidata_label: Wikidata entity label
- city: Institution location
- institution_type: LIBRARY | ARCHIVE
- isil_code: ISIL identifier (if available)
- ghcid: Global Heritage Custodian ID
- wikidata_qid: Wikidata Q-number
- wikidata_url: Direct link to Wikidata entity
- validation_status: Fill in: CORRECT | INCORRECT | UNCERTAIN
- validation_notes: Your comments
- institution_id: W3ID URI (for reference)
""")

    print("=" * 70)
    print("✅ Review Report Generation Complete")
    print("=" * 70)


if __name__ == '__main__':
    main()