# glam/scripts/generate_wikidata_review_report.py
# (Repository-viewer snapshot metadata: 2025-11-19 23:25:22 +01:00,
#  296 lines, 10 KiB, Python.)
"""
Generate Manual Review Report for Wikidata Fuzzy Matches
Analyzes denmark_complete_enriched.json to extract all fuzzy matches (85-99% confidence)
and creates a prioritized CSV report for manual validation.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Optional
def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier out of its repr-like string form.

    Returns a dict with 'scheme', 'value' and 'url' keys ('url' may be
    None), or None when the input is not a parsable identifier string.
    """
    if not isinstance(identifier_str, str) or not identifier_str:
        return None

    def _field(key: str) -> Optional[str]:
        # Pull the single-quoted value for one dict key out of the string.
        found = re.search(rf"'{key}':\s*'([^']+)'", identifier_str)
        return found.group(1) if found else None

    scheme = _field('identifier_scheme')
    value = _field('identifier_value')
    if scheme is None or value is None:
        # Both scheme and value are required; URL is optional.
        return None
    return {
        'scheme': scheme,
        'value': value,
        'url': _field('identifier_url'),
    }
def _find_identifier_value(inst: Dict, scheme: str) -> Optional[str]:
    """Return the value of the first identifier with the given scheme, or None.

    Identifier entries may be stored either as dicts or as their string
    representation; strings are parsed via parse_identifier_string.
    """
    for identifier_data in inst.get('identifiers', []):
        identifier = (parse_identifier_string(identifier_data)
                      if isinstance(identifier_data, str) else identifier_data)
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == scheme:
            return identifier.get('value')
    return None


def _extract_city(inst: Dict) -> Optional[str]:
    """Return the city of the institution's first location entry, if any.

    Location entries may be dicts or repr-like strings (same dual encoding
    as identifiers).
    """
    locations = inst.get('locations', [])
    if not locations:
        return None
    first_loc = locations[0]
    if isinstance(first_loc, str):
        city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
        return city_match.group(1) if city_match else None
    if isinstance(first_loc, dict):
        return first_loc.get('city', '')
    return None


def extract_fuzzy_matches(institutions: List[Dict]) -> List[Dict]:
    """
    Extract institutions with fuzzy Wikidata matches (85-99% confidence).
    Returns list of review records with institution and match metadata,
    sorted ascending by match score (most uncertain first).
    """
    fuzzy_matches = []
    for inst in institutions:
        for enrichment in inst.get('enrichment_history', []):
            match_score = enrichment.get('match_score')
            # Fuzzy match: 85-99% confidence (100 = exact match, excluded)
            if not (match_score and 85 <= match_score < 100):
                continue
            wikidata_qid = _find_identifier_value(inst, 'Wikidata')
            fuzzy_matches.append({
                'institution_name': inst.get('name', ''),
                'institution_type': inst.get('institution_type', ''),
                'city': _extract_city(inst) or '',
                'isil_code': _find_identifier_value(inst, 'ISIL') or '',
                'ghcid': inst.get('ghcid', ''),
                'wikidata_qid': wikidata_qid or '',
                'wikidata_label': enrichment.get('matched_label', ''),
                'match_score': match_score,
                'wikidata_url': f"https://www.wikidata.org/wiki/{wikidata_qid}" if wikidata_qid else '',
                'institution_id': inst.get('id', ''),
                'validation_status': '',  # For manual review
                'validation_notes': ''  # For manual review
            })
    # Sort by match score (lowest first = most uncertain)
    fuzzy_matches.sort(key=lambda rec: rec['match_score'])
    return fuzzy_matches
def _priority_for_score(score) -> int:
    """Map a fuzzy-match score to a review priority (1 = most uncertain)."""
    if score < 87:
        return 1  # Very uncertain
    if score < 90:
        return 2  # Uncertain
    if score < 93:
        return 3  # Moderate
    if score < 96:
        return 4  # Fairly confident
    return 5  # Mostly confident


def generate_csv_report(fuzzy_matches: List[Dict], output_path: Path) -> None:
    """Generate CSV report for manual review.

    Writes one row per fuzzy match with a derived 'priority' column
    (1 = review first, 5 = review last) so reviewers can work from the
    most to the least uncertain matches.
    """
    fieldnames = [
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for match in fuzzy_matches:
            # Project onto the CSV columns; missing keys become empty cells.
            row = {name: match.get(name, '') for name in fieldnames}
            row['priority'] = _priority_for_score(match['match_score'])
            writer.writerow(row)
def generate_statistics(fuzzy_matches: List[Dict]) -> Dict:
    """Calculate statistics for fuzzy matches.

    Returns counts by review priority, by institution type, and by
    match-score range. All three tallies are built in a single pass;
    the score bands mirror the priority thresholds used when writing
    the CSV report.
    """
    # (exclusive upper bound, priority, score-range label)
    bands = [
        (87, 1, '85-87'),
        (90, 2, '87-90'),
        (93, 3, '90-93'),
        (96, 4, '93-96'),
        (100, 5, '96-99'),
    ]
    stats = {
        'total': len(fuzzy_matches),
        'by_priority': {},
        'by_type': {},
        'by_score_range': {label: 0 for _, _, label in bands}
    }
    for match in fuzzy_matches:
        score = match['match_score']
        # Find the band for this score; scores >= 96 (incl. any >= 100)
        # fall through to the last band / priority 5.
        for upper, priority, label in bands:
            if score < upper:
                break
        stats['by_priority'][priority] = stats['by_priority'].get(priority, 0) + 1
        # Range tally only covers the fuzzy window 85-99.
        if 85 <= score < 100:
            stats['by_score_range'][label] += 1
        inst_type = match['institution_type']
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1
    return stats
def main() -> None:
    """Entry point: load the enriched dataset, extract fuzzy Wikidata
    matches, write the CSV review report, and print summary statistics
    plus reviewer instructions."""
    print("=" * 70)
    print("Wikidata Fuzzy Match Review Report Generator")
    print("=" * 70)
    # Load enriched dataset — expects a JSON list of institution records.
    # NOTE(review): path is hard-coded relative to the CWD; run from repo root.
    input_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nLoading enriched dataset: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" ✅ Loaded {len(institutions)} institutions")
    # Extract fuzzy matches (85-99% confidence; exact 100% matches excluded)
    print("\nExtracting fuzzy matches (85-99% confidence)...")
    fuzzy_matches = extract_fuzzy_matches(institutions)
    print(f" ✅ Found {len(fuzzy_matches)} fuzzy matches")
    # Generate statistics
    stats = generate_statistics(fuzzy_matches)
    # Generate CSV report (creates data/review/ on first run)
    output_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating CSV report: {output_csv}")
    generate_csv_report(fuzzy_matches, output_csv)
    print(f" ✅ CSV report generated ({len(fuzzy_matches)} rows)")
    # Print statistics
    print("\n" + "=" * 70)
    print("Fuzzy Match Statistics")
    print("=" * 70)
    print(f"\nTotal fuzzy matches: {stats['total']}")
    print("\nBy Priority (1=most uncertain, 5=fairly confident):")
    for priority in sorted(stats['by_priority'].keys()):
        count = stats['by_priority'][priority]
        print(f" Priority {priority}: {count:3d} matches")
    print("\nBy Match Score Range:")
    # Only show non-empty ranges to keep the summary compact
    for score_range, count in stats['by_score_range'].items():
        if count > 0:
            print(f" {score_range}%: {count:3d} matches")
    print("\nBy Institution Type:")
    for inst_type, count in sorted(stats['by_type'].items()):
        print(f" {inst_type}: {count:3d} matches")
    print("\n" + "=" * 70)
    print("Next Steps for Manual Review")
    print("=" * 70)
    # Reviewer instructions; the triple-quoted block is printed verbatim
    print(f"""
1. Open: {output_csv}
2. Start with Priority 1 (most uncertain) matches
3. For each row:
a. Check institution_name vs wikidata_label
b. Visit wikidata_url to verify match
c. Check city, institution_type, ISIL code
d. Set validation_status: CORRECT | INCORRECT | UNCERTAIN
e. Add validation_notes if needed
4. Run update script to apply validated changes
5. Re-export RDF with corrected Wikidata links
CSV columns:
- priority: 1 (review first) to 5 (review last)
- match_score: Fuzzy match confidence (85-99%)
- institution_name: Our dataset name
- wikidata_label: Wikidata entity label
- city: Institution location
- institution_type: LIBRARY | ARCHIVE
- isil_code: ISIL identifier (if available)
- ghcid: Global Heritage Custodian ID
- wikidata_qid: Wikidata Q-number
- wikidata_url: Direct link to Wikidata entity
- validation_status: Fill in: CORRECT | INCORRECT | UNCERTAIN
- validation_notes: Your comments
- institution_id: W3ID URI (for reference)
""")
    print("=" * 70)
    print("✅ Review Report Generation Complete")
    print("=" * 70)


if __name__ == '__main__':
    main()