# glam/data/instances/all/DATASET_STATISTICS.yaml
# 2025-11-19 23:25:22 +01:00
#
# 168 lines
# 3.7 KiB
# YAML

# Snapshot metadata for this statistics file.
# `generated` stays quoted so loaders keep it as a string, not a timestamp.
generated: '2025-11-11T16:45:00.000000+00:00'
project: GLAM Data Extraction
schema_version: v0.2.1
# Free-text summary of the latest enrichment run; folded into one line on load.
last_update: >-
  Phase 2 Brazil enrichment complete: +40 Wikidata IDs
  (13.7% → 32.5%, +18.9pp).
  Overall dataset: 7,619/13,502 (56.4%)
# Dataset-wide totals. Each coverage entry pairs an absolute count with its
# share of total_institutions, in percent.
unified_dataset:
  total_institutions: 13502
  countries_covered: 18
  wikidata_coverage:
    count: 7619
    percentage: 56.4
  # NOTE(review): the per-country geocoding counts under by_country add up to
  # 8218, not 8229 — confirm which figure is current.
  geocoding_coverage:
    count: 8229
    percentage: 60.9
# Outstanding enrichment work, as record counts per missing attribute.
# needs_wikidata equals total_institutions minus the Wikidata count
# (13502 - 7619 = 5883).
# NOTE(review): needs_coordinates (5324) does not equal 13502 - 8229 = 5273;
# presumably it is derived from total_candidates or another rule — confirm.
enrichment_needs:
  total_candidates: 13461
  needs_wikidata: 5883
  needs_coordinates: 5324
  needs_website: 2089
  needs_description: 13012
# Per-country breakdown, keyed by two-letter country code (looks like
# ISO 3166-1 alpha-2 — confirm). Each entry holds the institution total plus
# Wikidata and geocoding coverage as count and percentage of that total.
# Percentages are stored with one or two decimals depending on the entry.
# The 18 totals sum to 13502 and the Wikidata counts sum to 7619.
# NOTE(review): the geocoding counts sum to 8218, while
# unified_dataset.geocoding_coverage.count is 8229 — confirm which is current.
# Quote any future key that YAML 1.1 loaders read as a boolean (e.g. 'NO').
by_country:
  AR:
    total: 2
    wikidata_coverage:
      count: 1
      percentage: 50.0
    geocoding_coverage:
      count: 2
      percentage: 100.0
  BE:
    total: 7
    wikidata_coverage:
      count: 7
      percentage: 100.0
    geocoding_coverage:
      count: 7
      percentage: 100.0
    note: 'Manual enrichment Nov 2025: All 7 EU institutions enriched with Wikidata and VIAF'
  BR:
    total: 212
    wikidata_coverage:
      count: 69
      percentage: 32.5
    geocoding_coverage:
      count: 97
      percentage: 45.75
    note: >-
      Phase 2 enrichment (2025-11-11): SPARQL batch query added 40 Wikidata IDs
      (+18.9pp improvement). 10/10 top matches were 100% confidence (perfect
      name matches: Museu Nacional, MASP, Instituto Moreira Salles). 70% fuzzy
      threshold with Portuguese normalization.
  CL:
    total: 180
    wikidata_coverage:
      count: 97
      percentage: 53.89
    geocoding_coverage:
      count: 168
      percentage: 93.33
  DK:
    total: 1
    wikidata_coverage:
      count: 1
      percentage: 100.0
    geocoding_coverage:
      count: 1
      percentage: 100.0
  DZ:
    total: 19
    wikidata_coverage:
      count: 13
      percentage: 68.42
    geocoding_coverage:
      count: 0
      percentage: 0.0
  GB:
    total: 4
    wikidata_coverage:
      count: 1
      percentage: 25.0
    geocoding_coverage:
      count: 0
      percentage: 0.0
  GE:
    total: 14
    wikidata_coverage:
      count: 12
      percentage: 85.7
    geocoding_coverage:
      count: 0
      percentage: 0.0
    # NOTE(review): the note mentions 13 institutions, but
    # wikidata_coverage.count is 12 — confirm which number is right.
    note: 'Enriched in Task 6: 13 institutions updated with Wikidata'
  IT:
    total: 3
    wikidata_coverage:
      count: 1
      percentage: 33.33
    geocoding_coverage:
      count: 3
      percentage: 100.0
  JP:
    total: 12065
    wikidata_coverage:
      count: 7091
      percentage: 58.77
    geocoding_coverage:
      count: 7091
      percentage: 58.77
  LU:
    total: 1
    wikidata_coverage:
      count: 1
      percentage: 100.0
    geocoding_coverage:
      count: 1
      percentage: 100.0
    note: >-
      Manual enrichment Nov 2025: Court of Justice of the European Union
      enriched with Wikidata Q4951 and VIAF 124913422
  LY:
    total: 48
    wikidata_coverage:
      count: 37
      percentage: 77.08
    geocoding_coverage:
      count: 0
      percentage: 0.0
  MX:
    total: 226
    wikidata_coverage:
      count: 34
      percentage: 15.04
    geocoding_coverage:
      count: 167
      percentage: 73.89
  NL:
    total: 622
    wikidata_coverage:
      count: 193
      percentage: 31.03
    geocoding_coverage:
      count: 621
      percentage: 99.84
  RU:
    total: 1
    wikidata_coverage:
      count: 1
      percentage: 100.0
    geocoding_coverage:
      count: 1
      percentage: 100.0
  TN:
    total: 69
    wikidata_coverage:
      count: 52
      percentage: 75.4
    geocoding_coverage:
      count: 52
      percentage: 75.4
  US:
    total: 7
    wikidata_coverage:
      count: 0
      percentage: 0.0
    geocoding_coverage:
      count: 7
      percentage: 100.0
  VN:
    total: 21
    wikidata_coverage:
      count: 8
      percentage: 38.1
    geocoding_coverage:
      count: 0
      percentage: 0.0