# glam/scripts/generate_wikidata_review_report.py
# (Repository-viewer snapshot metadata: 2025-11-19 23:25:22 +01:00,
#  296 lines, 10 KiB, Python.)
"""
Generate Manual Review Report for Wikidata Fuzzy Matches
Analyzes denmark_complete_enriched.json to extract all fuzzy matches (85-99% confidence)
and creates a prioritized CSV report for manual validation.
"""
import json
import csv
import re
from pathlib import Path
from typing import Dict, List, Optional
def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier out of its repr-like string form.

    Returns a dict with 'scheme', 'value' and 'url' keys ('url' may be
    None), or None when the input is not a parsable identifier string.
    """
    if not isinstance(identifier_str, str) or not identifier_str:
        return None

    def _field(key: str) -> Optional[str]:
        # Pull the single-quoted value for one dict key out of the string.
        found = re.search(rf"'{key}':\s*'([^']+)'", identifier_str)
        return found.group(1) if found else None

    scheme = _field('identifier_scheme')
    value = _field('identifier_value')
    if scheme is None or value is None:
        # Both scheme and value are required; URL is optional.
        return None
    return {
        'scheme': scheme,
        'value': value,
        'url': _field('identifier_url'),
    }
def _find_identifier_value(inst: Dict, scheme: str) -> Optional[str]:
    """Return the value of the first identifier with the given scheme, or None.

    Identifier entries may be stored either as dicts or as their string
    representation; strings are parsed via parse_identifier_string.
    """
    for identifier_data in inst.get('identifiers', []):
        identifier = (parse_identifier_string(identifier_data)
                      if isinstance(identifier_data, str) else identifier_data)
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == scheme:
            return identifier.get('value')
    return None


def _extract_city(inst: Dict) -> Optional[str]:
    """Return the city of the institution's first location entry, if any.

    Location entries may be dicts or repr-like strings (same dual encoding
    as identifiers).
    """
    locations = inst.get('locations', [])
    if not locations:
        return None
    first_loc = locations[0]
    if isinstance(first_loc, str):
        city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
        return city_match.group(1) if city_match else None
    if isinstance(first_loc, dict):
        return first_loc.get('city', '')
    return None


def extract_fuzzy_matches(institutions: List[Dict]) -> List[Dict]:
    """
    Extract institutions with fuzzy Wikidata matches (85-99% confidence).
    Returns list of review records with institution and match metadata,
    sorted ascending by match score (most uncertain first).
    """
    fuzzy_matches = []
    for inst in institutions:
        for enrichment in inst.get('enrichment_history', []):
            match_score = enrichment.get('match_score')
            # Fuzzy match: 85-99% confidence (100 = exact match, excluded)
            if not (match_score and 85 <= match_score < 100):
                continue
            wikidata_qid = _find_identifier_value(inst, 'Wikidata')
            fuzzy_matches.append({
                'institution_name': inst.get('name', ''),
                'institution_type': inst.get('institution_type', ''),
                'city': _extract_city(inst) or '',
                'isil_code': _find_identifier_value(inst, 'ISIL') or '',
                'ghcid': inst.get('ghcid', ''),
                'wikidata_qid': wikidata_qid or '',
                'wikidata_label': enrichment.get('matched_label', ''),
                'match_score': match_score,
                'wikidata_url': f"https://www.wikidata.org/wiki/{wikidata_qid}" if wikidata_qid else '',
                'institution_id': inst.get('id', ''),
                'validation_status': '',  # For manual review
                'validation_notes': ''  # For manual review
            })
    # Sort by match score (lowest first = most uncertain)
    fuzzy_matches.sort(key=lambda rec: rec['match_score'])
    return fuzzy_matches
def _priority_for_score(score) -> int:
    """Map a fuzzy-match score to a review priority (1 = most uncertain)."""
    if score < 87:
        return 1  # Very uncertain
    if score < 90:
        return 2  # Uncertain
    if score < 93:
        return 3  # Moderate
    if score < 96:
        return 4  # Fairly confident
    return 5  # Mostly confident


def generate_csv_report(fuzzy_matches: List[Dict], output_path: Path) -> None:
    """Generate CSV report for manual review.

    Writes one row per fuzzy match with a derived 'priority' column
    (1 = review first, 5 = review last) so reviewers can work from the
    most to the least uncertain matches.
    """
    fieldnames = [
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id'
    ]
    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        for match in fuzzy_matches:
            # Project onto the CSV columns; missing keys become empty cells.
            row = {name: match.get(name, '') for name in fieldnames}
            row['priority'] = _priority_for_score(match['match_score'])
            writer.writerow(row)
def generate_statistics(fuzzy_matches: List[Dict]) -> Dict:
    """Calculate statistics for fuzzy matches.

    Returns counts by review priority, by institution type, and by
    match-score range. All three tallies are built in a single pass;
    the score bands mirror the priority thresholds used when writing
    the CSV report.
    """
    # (exclusive upper bound, priority, score-range label)
    bands = [
        (87, 1, '85-87'),
        (90, 2, '87-90'),
        (93, 3, '90-93'),
        (96, 4, '93-96'),
        (100, 5, '96-99'),
    ]
    stats = {
        'total': len(fuzzy_matches),
        'by_priority': {},
        'by_type': {},
        'by_score_range': {label: 0 for _, _, label in bands}
    }
    for match in fuzzy_matches:
        score = match['match_score']
        # Find the band for this score; scores >= 96 (incl. any >= 100)
        # fall through to the last band / priority 5.
        for upper, priority, label in bands:
            if score < upper:
                break
        stats['by_priority'][priority] = stats['by_priority'].get(priority, 0) + 1
        # Range tally only covers the fuzzy window 85-99.
        if 85 <= score < 100:
            stats['by_score_range'][label] += 1
        inst_type = match['institution_type']
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1
    return stats
def main() -> None:
    """Entry point: load the enriched dataset, extract fuzzy Wikidata
    matches, write the CSV review report, and print summary statistics
    plus reviewer instructions."""
    print("=" * 70)
    print("Wikidata Fuzzy Match Review Report Generator")
    print("=" * 70)
    # Load enriched dataset — expects a JSON list of institution records.
    # NOTE(review): path is hard-coded relative to the CWD; run from repo root.
    input_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nLoading enriched dataset: {input_path}")
    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)
    print(f" ✅ Loaded {len(institutions)} institutions")
    # Extract fuzzy matches (85-99% confidence; exact 100% matches excluded)
    print("\nExtracting fuzzy matches (85-99% confidence)...")
    fuzzy_matches = extract_fuzzy_matches(institutions)
    print(f" ✅ Found {len(fuzzy_matches)} fuzzy matches")
    # Generate statistics
    stats = generate_statistics(fuzzy_matches)
    # Generate CSV report (creates data/review/ on first run)
    output_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    print(f"\nGenerating CSV report: {output_csv}")
    generate_csv_report(fuzzy_matches, output_csv)
    print(f" ✅ CSV report generated ({len(fuzzy_matches)} rows)")
    # Print statistics
    print("\n" + "=" * 70)
    print("Fuzzy Match Statistics")
    print("=" * 70)
    print(f"\nTotal fuzzy matches: {stats['total']}")
    print("\nBy Priority (1=most uncertain, 5=fairly confident):")
    for priority in sorted(stats['by_priority'].keys()):
        count = stats['by_priority'][priority]
        print(f" Priority {priority}: {count:3d} matches")
    print("\nBy Match Score Range:")
    # Only show non-empty ranges to keep the summary compact
    for score_range, count in stats['by_score_range'].items():
        if count > 0:
            print(f" {score_range}%: {count:3d} matches")
    print("\nBy Institution Type:")
    for inst_type, count in sorted(stats['by_type'].items()):
        print(f" {inst_type}: {count:3d} matches")
    print("\n" + "=" * 70)
    print("Next Steps for Manual Review")
    print("=" * 70)
    # Reviewer instructions; the triple-quoted block is printed verbatim
    print(f"""
1. Open: {output_csv}
2. Start with Priority 1 (most uncertain) matches
3. For each row:
a. Check institution_name vs wikidata_label
b. Visit wikidata_url to verify match
c. Check city, institution_type, ISIL code
d. Set validation_status: CORRECT | INCORRECT | UNCERTAIN
e. Add validation_notes if needed
4. Run update script to apply validated changes
5. Re-export RDF with corrected Wikidata links
CSV columns:
- priority: 1 (review first) to 5 (review last)
- match_score: Fuzzy match confidence (85-99%)
- institution_name: Our dataset name
- wikidata_label: Wikidata entity label
- city: Institution location
- institution_type: LIBRARY | ARCHIVE
- isil_code: ISIL identifier (if available)
- ghcid: Global Heritage Custodian ID
- wikidata_qid: Wikidata Q-number
- wikidata_url: Direct link to Wikidata entity
- validation_status: Fill in: CORRECT | INCORRECT | UNCERTAIN
- validation_notes: Your comments
- institution_id: W3ID URI (for reference)
""")
    print("=" * 70)
    print("✅ Review Report Generation Complete")
    print("=" * 70)


if __name__ == '__main__':
    main()