"""
Generate Manual Review Report for Wikidata Fuzzy Matches

Analyzes denmark_complete_enriched.json to extract all fuzzy matches (85-99% confidence)
and creates a prioritized CSV report for manual validation.
"""
import ast
import csv
import json
import re
from pathlib import Path
from typing import Dict, List, Optional


def parse_identifier_string(identifier_str: str) -> Optional[Dict]:
    """Parse an identifier record out of its string (repr-style) form.

    The enriched JSON sometimes stores identifier dicts as their Python
    ``repr`` string. First try ``ast.literal_eval`` on the whole string,
    which is robust to quoting/escaping and key order; fall back to regex
    extraction for fragments that are not a complete dict literal.

    Args:
        identifier_str: String representation of an identifier dict
            (e.g. "{'identifier_scheme': 'ISIL', 'identifier_value': ...}").

    Returns:
        Dict with 'scheme', 'value' and 'url' keys (url may be None),
        or None when no scheme/value pair can be recovered.
    """
    if not identifier_str or not isinstance(identifier_str, str):
        return None

    # Preferred path: parse the whole string as a Python literal. This
    # correctly handles values containing apostrophes, which the regex
    # fallback below ([^']+) would truncate or miss entirely.
    try:
        parsed = ast.literal_eval(identifier_str)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, dict):
        scheme = parsed.get('identifier_scheme')
        value = parsed.get('identifier_value')
        if scheme and value:
            return {
                'scheme': scheme,
                'value': value,
                'url': parsed.get('identifier_url'),
            }

    # Fallback: regex extraction works on partial/fragment strings that
    # are not valid dict literals.
    scheme_match = re.search(r"'identifier_scheme':\s*'([^']+)'", identifier_str)
    value_match = re.search(r"'identifier_value':\s*'([^']+)'", identifier_str)
    url_match = re.search(r"'identifier_url':\s*'([^']+)'", identifier_str)

    if scheme_match and value_match:
        return {
            'scheme': scheme_match.group(1),
            'value': value_match.group(1),
            'url': url_match.group(1) if url_match else None
        }
    return None


def _identifier_value(inst: Dict, scheme: str) -> Optional[str]:
    """Return the value of the first identifier on *inst* matching *scheme*.

    Identifiers may be stored as dicts or as repr-strings; strings are
    decoded via parse_identifier_string. Returns None when absent.
    """
    for raw in inst.get('identifiers', []):
        identifier = parse_identifier_string(raw) if isinstance(raw, str) else raw
        if identifier and isinstance(identifier, dict) and identifier.get('scheme') == scheme:
            return identifier.get('value')
    return None


def _first_city(inst: Dict) -> Optional[str]:
    """Extract the city from the institution's first location entry, if any."""
    locations = inst.get('locations', [])
    if not locations:
        return None
    first_loc = locations[0]
    if isinstance(first_loc, str):
        # Location stored as a repr-string; pull the city field out directly.
        city_match = re.search(r"'city':\s*'([^']*)'", first_loc)
        return city_match.group(1) if city_match else None
    if isinstance(first_loc, dict):
        return first_loc.get('city', '')
    return None


def extract_fuzzy_matches(institutions: List[Dict]) -> List[Dict]:
    """
    Extract institutions with fuzzy Wikidata matches (85-99% confidence).

    Returns a list of review records with institution and match metadata,
    sorted by ascending match score (most uncertain first). The
    validation_status / validation_notes fields are left blank for the
    manual reviewer.
    """
    fuzzy_matches = []

    for inst in institutions:
        for enrichment in inst.get('enrichment_history', []):
            match_score = enrichment.get('match_score')

            # Fuzzy match: 85-99% confidence. Exact (100) matches and
            # low-confidence (<85) candidates need no manual review here.
            if not (match_score and 85 <= match_score < 100):
                continue

            # Institution-level fields do not depend on the enrichment
            # entry, but are looked up here only for qualifying matches.
            wikidata_qid = _identifier_value(inst, 'Wikidata')
            isil_code = _identifier_value(inst, 'ISIL')
            city = _first_city(inst)

            fuzzy_matches.append({
                'institution_name': inst.get('name', ''),
                'institution_type': inst.get('institution_type', ''),
                'city': city or '',
                'isil_code': isil_code or '',
                'ghcid': inst.get('ghcid', ''),
                'wikidata_qid': wikidata_qid or '',
                'wikidata_label': enrichment.get('matched_label', ''),
                'match_score': match_score,
                'wikidata_url': f"https://www.wikidata.org/wiki/{wikidata_qid}" if wikidata_qid else '',
                'institution_id': inst.get('id', ''),
                'validation_status': '',  # For manual review
                'validation_notes': ''  # For manual review
            })

    # Sort by match score (lowest first = most uncertain)
    fuzzy_matches.sort(key=lambda x: x['match_score'])

    return fuzzy_matches


def _priority_for_score(score: float) -> int:
    """Map a fuzzy-match score to a review priority (1 = most uncertain, 5 = mostly confident)."""
    if score < 87:
        return 1  # Very uncertain
    if score < 90:
        return 2  # Uncertain
    if score < 93:
        return 3  # Moderate
    if score < 96:
        return 4  # Fairly confident
    return 5  # Mostly confident


def generate_csv_report(fuzzy_matches: List[Dict], output_path: Path):
    """Write *fuzzy_matches* to *output_path* as a CSV for manual review.

    Each row is the match record plus a computed 'priority' column
    (1 = review first, 5 = review last). Raises KeyError if a match
    record lacks one of the expected fields.
    """
    fieldnames = [
        'priority',
        'match_score',
        'institution_name',
        'wikidata_label',
        'city',
        'institution_type',
        'isil_code',
        'ghcid',
        'wikidata_qid',
        'wikidata_url',
        'validation_status',
        'validation_notes',
        'institution_id'
    ]

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for match in fuzzy_matches:
            # Every column except 'priority' comes straight off the match
            # record; selecting by fieldname tolerates extra keys.
            row = {key: match[key] for key in fieldnames if key != 'priority'}
            row['priority'] = _priority_for_score(match['match_score'])
            writer.writerow(row)


def generate_statistics(fuzzy_matches: List[Dict]) -> Dict:
    """Calculate summary statistics for fuzzy matches in a single pass.

    Returns a dict with: 'total' count, 'by_priority' (1 = most uncertain
    .. 5 = mostly confident), 'by_type' (institution_type counts), and
    'by_score_range' (half-open buckets, e.g. '87-90' counts
    87 <= score < 90; the '96-99' bucket counts 96 <= score < 100).
    """
    # (bucket label, inclusive lower bound, exclusive upper bound)
    score_buckets = [
        ('85-87', 85, 87),
        ('87-90', 87, 90),
        ('90-93', 90, 93),
        ('93-96', 93, 96),
        ('96-99', 96, 100),
    ]

    stats = {
        'total': len(fuzzy_matches),
        'by_priority': {},
        'by_type': {},
        'by_score_range': {label: 0 for label, _, _ in score_buckets}
    }

    for match in fuzzy_matches:
        score = match['match_score']

        # Priority thresholds mirror the CSV report (generate_csv_report).
        if score < 87:
            priority = 1
        elif score < 90:
            priority = 2
        elif score < 93:
            priority = 3
        elif score < 96:
            priority = 4
        else:
            priority = 5
        stats['by_priority'][priority] = stats['by_priority'].get(priority, 0) + 1

        # Count by institution type
        inst_type = match['institution_type']
        stats['by_type'][inst_type] = stats['by_type'].get(inst_type, 0) + 1

        # Count by score range (scores outside every bucket are ignored,
        # matching the original elif chain).
        for label, lower, upper in score_buckets:
            if lower <= score < upper:
                stats['by_score_range'][label] += 1
                break

    return stats


def main():
    """Command-line entry point.

    Loads the enriched Denmark dataset, extracts fuzzy Wikidata matches,
    writes a prioritized CSV review report, and prints summary statistics
    together with step-by-step instructions for the manual reviewer.

    NOTE(review): input/output paths are hard-coded relative to the
    current working directory — run from the repository root.
    """
    print("=" * 70)
    print("Wikidata Fuzzy Match Review Report Generator")
    print("=" * 70)

    # Load enriched dataset (raises FileNotFoundError if missing).
    input_path = Path('data/instances/denmark_complete_enriched.json')
    print(f"\nLoading enriched dataset: {input_path}")

    with open(input_path, 'r', encoding='utf-8') as f:
        institutions = json.load(f)

    print(f"  ✅ Loaded {len(institutions)} institutions")

    # Extract fuzzy matches (85 <= score < 100), sorted most-uncertain first.
    print("\nExtracting fuzzy matches (85-99% confidence)...")
    fuzzy_matches = extract_fuzzy_matches(institutions)
    print(f"  ✅ Found {len(fuzzy_matches)} fuzzy matches")

    # Generate statistics
    stats = generate_statistics(fuzzy_matches)

    # Generate CSV report; create the review directory if it does not exist.
    output_csv = Path('data/review/denmark_wikidata_fuzzy_matches.csv')
    output_csv.parent.mkdir(parents=True, exist_ok=True)

    print(f"\nGenerating CSV report: {output_csv}")
    generate_csv_report(fuzzy_matches, output_csv)
    print(f"  ✅ CSV report generated ({len(fuzzy_matches)} rows)")

    # Print statistics
    print("\n" + "=" * 70)
    print("Fuzzy Match Statistics")
    print("=" * 70)
    print(f"\nTotal fuzzy matches: {stats['total']}")

    print("\nBy Priority (1=most uncertain, 5=fairly confident):")
    for priority in sorted(stats['by_priority'].keys()):
        count = stats['by_priority'][priority]
        print(f"  Priority {priority}: {count:3d} matches")

    # Only print non-empty score ranges.
    print("\nBy Match Score Range:")
    for score_range, count in stats['by_score_range'].items():
        if count > 0:
            print(f"  {score_range}%: {count:3d} matches")

    print("\nBy Institution Type:")
    for inst_type, count in sorted(stats['by_type'].items()):
        print(f"  {inst_type}: {count:3d} matches")

    # Reviewer instructions and a legend for the CSV columns.
    print("\n" + "=" * 70)
    print("Next Steps for Manual Review")
    print("=" * 70)
    print(f"""
1. Open: {output_csv}
2. Start with Priority 1 (most uncertain) matches
3. For each row:
   a. Check institution_name vs wikidata_label
   b. Visit wikidata_url to verify match
   c. Check city, institution_type, ISIL code
   d. Set validation_status: CORRECT | INCORRECT | UNCERTAIN
   e. Add validation_notes if needed
4. Run update script to apply validated changes
5. Re-export RDF with corrected Wikidata links

CSV columns:
- priority: 1 (review first) to 5 (review last)
- match_score: Fuzzy match confidence (85-99%)
- institution_name: Our dataset name
- wikidata_label: Wikidata entity label
- city: Institution location
- institution_type: LIBRARY | ARCHIVE
- isil_code: ISIL identifier (if available)
- ghcid: Global Heritage Custodian ID
- wikidata_qid: Wikidata Q-number
- wikidata_url: Direct link to Wikidata entity
- validation_status: Fill in: CORRECT | INCORRECT | UNCERTAIN
- validation_notes: Your comments
- institution_id: W3ID URI (for reference)
""")

    print("=" * 70)
    print("✅ Review Report Generation Complete")
    print("=" * 70)


if __name__ == '__main__':
    main()
|