From 4f6ca92084ca92d7347014af7bacd7518cf8393e Mon Sep 17 00:00:00 2001 From: kempersc Date: Tue, 23 Dec 2025 21:37:10 +0100 Subject: [PATCH] enrich: logo enrichment progress (JP: 1500, CZ: 40 started) --- .../.logo_enrichment_crawl4ai_checkpoint.json | 44 ++++++++- data/custodian/CZ-10-BRE-A-LAPNP.yaml | 75 +++++++++++----- data/custodian/CZ-10-CES-A-ACNB.yaml | 89 ++++++++++++++----- data/custodian/CZ-10-LIT-A-AHMP.yaml | 64 +++++++++---- data/custodian/CZ-10-PET-A-ANM.yaml | 45 ++++++++-- data/custodian/CZ-10-PET-A-ANTM.yaml | 39 ++++++-- data/custodian/CZ-10-PET-A-MUAAVCRVV.yaml | 25 ++++++ data/custodian/CZ-10-PET-A-NFA.yaml | 34 ++++++- data/custodian/CZ-10-PNM-A-ACR.yaml | 60 ++++++++++--- data/custodian/CZ-10-PRA-A-AKPR.yaml | 41 +++++++-- data/custodian/CZ-10-PRA-A-AMVC.yaml | 65 +++++++++++--- data/custodian/CZ-10-PRA-A-ANBU.yaml | 53 ++++++++--- data/custodian/CZ-10-PRA-A-ANBUS.yaml | 78 ++++++++++++---- data/custodian/CZ-10-PRA-A-APCR.yaml | 78 +++++++++++----- data/custodian/CZ-10-PRA-A-APH.yaml | 54 ++++++++--- data/custodian/CZ-10-PRA-A-APS.yaml | 40 +++++++-- data/custodian/CZ-10-PRA-A-ASMVCR.yaml | 62 ++++++++++--- data/custodian/CZ-10-PRA-A-AUPZSI.yaml | 36 ++++++-- data/custodian/CZ-10-PRA-A-AZMVP.yaml | 50 ++++++++--- data/custodian/CZ-10-PRA-A-BAKPR.yaml | 61 +++++++++---- data/custodian/CZ-10-PRA-A-BAVZ.yaml | 50 ++++++++--- data/custodian/JP-10-MAE-L-LITCGU.yaml | 19 ++++ ...-ML-maebashikokadaigakufuzoku_library.yaml | 27 ++++++ data/custodian/JP-10-MAE-L-MLF.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLH.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLJ.yaml | 25 ++++++ ...aebashishiritsu_library_kaigayabunkan.yaml | 25 ++++++ ...ebashishiritsu_library_kasukawabunkan.yaml | 25 ++++++ ...ebashishiritsu_library_kiyosatobunkan.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLK.yaml | 25 ++++++ ...maebashishiritsu_library_miyagibunkan.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLM.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLN.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLO.yaml | 25 ++++++ ...iritsu_library_sogokyoikupurazabunkan.yaml | 25 ++++++ ...S-maebashishiritsu_library_sojabunkan.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-MLS.yaml | 25 ++++++ data/custodian/JP-10-MAE-L-NITGCL.yaml | 27 ++++++ 38 files changed, 1314 insertions(+), 252 deletions(-) diff --git a/data/custodian/.logo_enrichment_crawl4ai_checkpoint.json b/data/custodian/.logo_enrichment_crawl4ai_checkpoint.json index 99ffdf8750..a94cb3f9b3 100644 --- a/data/custodian/.logo_enrichment_crawl4ai_checkpoint.json +++ b/data/custodian/.logo_enrichment_crawl4ai_checkpoint.json @@ -7016,7 +7016,47 @@ "JP-10-KIT-M-MESM.yaml", "JP-10-MAE-A-GPA.yaml", "JP-10-MAE-L-GLI.yaml", - "JP-10-MAE-L-GLM.yaml" + "JP-10-MAE-L-GLM.yaml", + "CZ-10-BRA-L-BSS.yaml", + "CZ-10-BRE-A-LAPNP.yaml", + "CZ-10-BRN-A-NPUB.yaml", + "CZ-10-BRN-L-MZKSTK.yaml", + "CZ-10-CEL-L-SSTK.yaml", + "CZ-10-CES-A-ACNB.yaml", + "CZ-10-CES-A-ACSSD.yaml", + "CZ-10-CES-A-ACT.yaml", + "CZ-10-DOL-L-VUVSR.yaml", + "CZ-10-DOM-A-ANK.yaml", + "CZ-10-HUS-L-URSK.yaml", + "CZ-10-JIC-L-SDS.yaml", + "CZ-10-JIL-L-BVUBVLS.yaml", + "CZ-10-LIB-L-SLSR.yaml", + "CZ-10-LIT-A-AHMP.yaml", + "CZ-10-PAN-L-VPBSOISK.yaml", + "CZ-10-PET-A-ANM.yaml", + "CZ-10-PET-A-ANTM.yaml", + "CZ-10-PET-A-MUAAVCRVV.yaml", + "CZ-10-PET-A-NFA.yaml", + "CZ-10-PNM-A-ACR.yaml", + "CZ-10-PRA-A-ABIS.yaml", + "CZ-10-PRA-A-AKPR.yaml", + "CZ-10-PRA-A-AMVC.yaml", + "CZ-10-PRA-A-ANBU.yaml", + "CZ-10-PRA-A-ANBUS.yaml", + "CZ-10-PRA-A-ANG.yaml", + "CZ-10-PRA-A-APCR.yaml", + "CZ-10-PRA-A-APH.yaml", + "CZ-10-PRA-A-APS.yaml", + "CZ-10-PRA-A-ASMVCR.yaml", + "CZ-10-PRA-A-AUACVV.yaml", + "CZ-10-PRA-A-AUMAVENA.yaml", + "CZ-10-PRA-A-AUMAVESPR-archivalie_ulozene_mimo_archivy_v_evidenci_soa_pra.yaml", + "CZ-10-PRA-A-AUMAVESPR.yaml", + "CZ-10-PRA-A-AUMAVESPV.yaml", + "CZ-10-PRA-A-AUMAVESPZ.yaml", + "CZ-10-PRA-A-AUPZSI.yaml", + "CZ-10-PRA-A-AZMVP.yaml", + "CZ-10-PRA-A-BAKPR.yaml" ], - "last_index": 59 + "last_index": 39 } \ No newline at end of file diff --git a/data/custodian/CZ-10-BRE-A-LAPNP.yaml b/data/custodian/CZ-10-BRE-A-LAPNP.yaml index 0f4b3a3ff5..b472bc2df9 100644 --- a/data/custodian/CZ-10-BRE-A-LAPNP.yaml +++ b/data/custodian/CZ-10-BRE-A-LAPNP.yaml @@ -80,9 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:39Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:00:20Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:35:14Z: XXX->BRE via Wikidata Q52679463 coords (50.0860,14.3893) -> Brevnov (GeoNames:3078748)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:44:46Z: Maps: Muzeum literatury - Památník národního písemnictví (conf: - 0.95); YouTube: not found' + - 'City resolved 2025-12-07T00:35:14Z: XXX->BRE via Wikidata Q52679463 coords (50.0860,14.3893) + -> Brevnov (GeoNames:3078748)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:44:46Z: Maps: Muzeum literatury + - Památník národního písemnictví (conf: 0.95); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:03Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -248,24 +249,29 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/111854463313628190722/reviews rating: 5 relative_time_description: a year ago - text: Great exposition of the history of Czech literature. Several interactive installations. A great section about oppression - and censorship of the communiat period (just keep in mind that Czechslovakia experienced a "mild" version of communist - occupation; in Ukraine, for example, it was way way worse). We came 1 hour before closing time and it was not enough, - I would recommend to allocate at least 2 hours for your visit. + text: Great exposition of the history of Czech literature. Several interactive + installations. A great section about oppression and censorship of the communiat + period (just keep in mind that Czechslovakia experienced a "mild" version of + communist occupation; in Ukraine, for example, it was way way worse). We came + 1 hour before closing time and it was not enough, I would recommend to allocate + at least 2 hours for your visit. publish_time: '2024-04-20T20:58:03.274534Z' - author_name: Evgeniia author_uri: https://www.google.com/maps/contrib/115318432713391005078/reviews rating: 4 relative_time_description: 6 months ago - text: Exposition about the Czech literature. Free entry. Cafe inside. WC inside as well + text: Exposition about the Czech literature. Free entry. Cafe inside. WC inside + as well publish_time: '2025-06-07T13:48:13.315370Z' - author_name: Julka Borghouts author_uri: https://www.google.com/maps/contrib/106169637951493716953/reviews rating: 4 relative_time_description: 2 years ago - text: It was interesting but I think it was because our guide who was the former director of the mudeum could say a lot - of interesting facts. It's about a 2-3 hours visit to see everything after that you can have a nice walk/rest in the - park. Would recommend ( idk for children because it was not interactive at all but there were cool things to see). + text: It was interesting but I think it was because our guide who was the former + director of the mudeum could say a lot of interesting facts. It's about a 2-3 + hours visit to see everything after that you can have a nice walk/rest in the + park. Would recommend ( idk for children because it was not interactive at all + but there were cool things to see). publish_time: '2023-08-25T16:47:39.617735Z' - author_name: Tetiana Yazlovetska author_uri: https://www.google.com/maps/contrib/111299840988431556455/reviews @@ -279,10 +285,12 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/102002753176059146376/reviews rating: 3 relative_time_description: 3 years ago - text: I went there with my 2 years old toddler. So start was very unpleasant I was unable to take the potty to my small - backpack (just in case) and water too. Which is big No No. But there wasn't a single person so my daughter was really - good time. I wasn't able to read everything but somehow I had the feeling of chaotic curators work. So who knows?! There - was everything and nothing special. But the price is super 100 CZK so perfect ❤️ and it's stroller friendly entry. + text: I went there with my 2 years old toddler. So start was very unpleasant I + was unable to take the potty to my small backpack (just in case) and water too. + Which is big No No. But there wasn't a single person so my daughter was really + good time. I wasn't able to read everything but somehow I had the feeling of + chaotic curators work. So who knows?! There was everything and nothing special. + But the price is super 100 CZK so perfect ❤️ and it's stroller friendly entry. publish_time: '2022-12-03T19:36:43.940881Z' opening_hours: open_now: false @@ -316,11 +324,13 @@ google_maps_enrichment: is_match: true confidence: 0.95 entity_type: GRP.HER - reasoning: The Google Maps candidate 'Muzeum literatury - Památník národního písemnictví' is a direct and descriptive - name for the source institution 'Literární archiv Památníku národního písemnictví', both referencing the 'Památník národního - písemnictví' (Museum of National Literature). The location in Prague, Czechia, matches the expected location for the - national institution. The Google Place type 'museum' is a perfect match for the expected GRP.HER entity type. The business - is operational and its website confirms the match. + reasoning: The Google Maps candidate 'Muzeum literatury - Památník národního písemnictví' + is a direct and descriptive name for the source institution 'Literární archiv + Památníku národního písemnictví', both referencing the 'Památník národního písemnictví' + (Museum of National Literature). The location in Prague, Czechia, matches the + expected location for the national institution. The Google Place type 'museum' + is a perfect match for the expected GRP.HER entity type. The business is operational + and its website confirms the match. agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -343,3 +353,28 @@ location: formatted_address: Pelléova 44/22, 160 00 Praha 6-Bubeneč, Czechia geonames_id: 3078748 normalization_timestamp: '2025-12-09T06:49:27.270377+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:12.679007+00:00' + source_url: http://www.pamatniknarodnihopisemnictvi.cz/o-literarnim-archivu + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.pamatniknarodnihopisemnictvi.cz/build/images/favicons/apple-touch-icon.4ae46d90.png + source_url: http://www.pamatniknarodnihopisemnictvi.cz/o-literarnim-archivu + css_selector: '[document] > html > head > link:nth-of-type(8)' + retrieved_on: '2025-12-23T20:18:12.679007+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: 180x180 + - claim_type: og_image_url + claim_value: https://img.pamatniknarodnihopisemnictvi.cz/userimages/og_image_scheme/1/e9ec123bd4b4af5b99049f8d9309363c_large.png + source_url: http://www.pamatniknarodnihopisemnictvi.cz/o-literarnim-archivu + css_selector: '[document] > html > head > meta:nth-of-type(12)' + retrieved_on: '2025-12-23T20:18:12.679007+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 4 diff --git a/data/custodian/CZ-10-CES-A-ACNB.yaml b/data/custodian/CZ-10-CES-A-ACNB.yaml index d044abc37f..404f24f647 100644 --- a/data/custodian/CZ-10-CES-A-ACNB.yaml +++ b/data/custodian/CZ-10-CES-A-ACNB.yaml @@ -43,10 +43,11 @@ ghcid: latitude: 48.97447 longitude: 14.47434 ghcid_history: - - previous_ghcid_component: "CB" - new_ghcid_component: "CES" - change_date: "2025-12-20T19:55:24Z" - reason: "Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: Ceske Budejovice" + - previous_ghcid_component: CB + new_ghcid_component: CES + change_date: '2025-12-20T19:55:24Z' + reason: 'Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: + Ceske Budejovice' - ghcid: CZ-10-CB-A-ACNB ghcid_numeric: 14327918484041920941 valid_from: '2025-12-08T11:21:41.224062+00:00' @@ -64,7 +65,8 @@ ghcid: reason: 'Region resolved via Wikidata P131: XX->10 (CZ-10)' - ghcid: CZ-10-CB-A-AČNB valid_from: '2025-12-07T00:19:32.878169+00:00' - reason: 'Location resolved via CH-Annotator TOP.SET extraction: České -> Ceske Budejovice (GeoNames:3077916)' + reason: 'Location resolved via CH-Annotator TOP.SET extraction: České -> Ceske + Budejovice (GeoNames:3077916)' custodian_name: claim_type: custodian_name claim_value: Archiv České národní banky @@ -93,10 +95,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:04:46Z: XX->10 via Wikidata P131 (CZ-10)' - - 'Location resolved 2025-12-07T00:19:32Z: CH-Annotator TOP.SET extraction ''České'' -> Ceske Budejovice (GeoNames:3077916, - Region:31)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:44:56Z: Maps: State Regional Archive Ceske Budejovice (conf: 0.90); YouTube: - not found' + - 'Location resolved 2025-12-07T00:19:32Z: CH-Annotator TOP.SET extraction ''České'' + -> Ceske Budejovice (GeoNames:3077916, Region:31)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:44:56Z: Maps: State Regional Archive + Ceske Budejovice (conf: 0.90); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:03Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -216,7 +218,8 @@ wikidata_enrichment: headquarters_location: id: Q973974 label: Prague 1 - description: administrative district, municipal district and municipal part of Prague + description: administrative district, municipal district and municipal part + of Prague country: &id006 id: Q213 label: Czech Republic @@ -240,7 +243,8 @@ google_maps_enrichment: coordinates: latitude: 48.9764677 longitude: 14.4845555 - formatted_address: 40, Rudolfovská tř. 70, 370 01 České Budějovice-České Budějovice 1, Czechia + formatted_address: 40, Rudolfovská tř. 70, 370 01 České Budějovice-České Budějovice + 1, Czechia short_address: Rudolfovská tř. 70, České Budějovice 1 phone_local: 386 701 214 phone_international: +420 386 701 214 @@ -256,17 +260,19 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/106223614438373881734/reviews rating: 5 relative_time_description: 9 months ago - text: Hello, we attended Eleonore's lecture yesterday and it was great. Beautiful. I have never experienced such a beautiful - lecture. + text: Hello, we attended Eleonore's lecture yesterday and it was great. Beautiful. + I have never experienced such a beautiful lecture. publish_time: '2025-02-26T09:05:14.246458Z' - author_name: Jakub Bouček (Opravdové příběhy) author_uri: https://www.google.com/maps/contrib/109262906392895391758/reviews rating: 5 relative_time_description: 7 years ago - text: 'State archives are one of the places where you can find real information about the history of a particular region, - and the České Budějovice archive is a good example of this. + text: 'State archives are one of the places where you can find real information + about the history of a particular region, and the České Budějovice archive is + a good example of this. - So if you want to find out details about the history of the city and its surroundings, the archive is the right place.' + So if you want to find out details about the history of the city and its surroundings, + the archive is the right place.' publish_time: '2018-03-31T13:23:16.449Z' - author_name: Hana Havlova author_uri: https://www.google.com/maps/contrib/109085353083285723508/reviews @@ -315,12 +321,15 @@ google_maps_enrichment: is_match: true confidence: 0.9 entity_type: GRP.HER - reasoning: '1. NAME MATCH: Partial but strong match. The source is ''Archiv České národní banky'' (Archive of the Czech - National Bank), while the candidate is ''State Regional Archive Ceske Budejovice''. Although the names differ, both - are archives, and the source Wikidata item points to ''Státní oblastní archiv v Českých Budějovicích'', which translates - directly to the candidate''s name. 2. LOCATION MATCH: Perfect match. Both are in České Budějovice, Czechia. 3. TYPE - MATCH: The Google Place types (''point_of_interest'', ''establishment'') are generic, but the name ''Archive'' and the - website (ceskearchivy.cz) confirm it is an archive, which is a valid heritage type. 4. ENTITY TYPE: The institution + reasoning: '1. NAME MATCH: Partial but strong match. The source is ''Archiv České + národní banky'' (Archive of the Czech National Bank), while the candidate is + ''State Regional Archive Ceske Budejovice''. Although the names differ, both + are archives, and the source Wikidata item points to ''Státní oblastní archiv + v Českých Budějovicích'', which translates directly to the candidate''s name. + 2. LOCATION MATCH: Perfect match. Both are in České Budějovice, Czechia. 3. + TYPE MATCH: The Google Place types (''point_of_interest'', ''establishment'') + are generic, but the name ''Archive'' and the website (ceskearchivy.cz) confirm + it is an archive, which is a valid heritage type. 4. ENTITY TYPE: The institution is an archive, which falls under the definition of a heritage institution (GRP.HER).' agent: glm-4.6 verified: true @@ -342,7 +351,41 @@ location: region_code: '10' country: CZ street_address: Rudolfovská tř. 70, České Budějovice 1 - formatted_address: 40, Rudolfovská tř. 70, 370 01 České Budějovice-České Budějovice 1, Czechia + formatted_address: 40, Rudolfovská tř. 70, 370 01 České Budějovice-České Budějovice + 1, Czechia geonames_id: 3077916 feature_code: PPLA normalization_timestamp: '2025-12-09T06:49:27.380425+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:24.032330+00:00' + source_url: https://www.ceskearchivy.cz/statni-okresni-archivy/ceske-budejovice/soka-cb-uvod + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.ceskearchivy.cz/images/INST_logo.png + source_url: https://www.ceskearchivy.cz/statni-okresni-archivy/ceske-budejovice/soka-cb-uvod + css_selector: '#mod-custom206 > p > a > img' + retrieved_on: '2025-12-23T20:18:24.032330+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: '' + - claim_type: favicon_url + claim_value: https://www.ceskearchivy.cz/favicon.ico + source_url: https://www.ceskearchivy.cz/statni-okresni-archivy/ceske-budejovice/soka-cb-uvod + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:18:24.032330+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/vnd.microsoft.icon + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://www.ceskearchivy.cz/administrator/cache/preview/80b931d60f4be56fcd0c341aab8b9bc2.jpg + source_url: https://www.ceskearchivy.cz/statni-okresni-archivy/ceske-budejovice/soka-cb-uvod + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:18:24.032330+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 3 + has_primary_logo: true + has_favicon: true + has_og_image: true + favicon_count: 1 diff --git a/data/custodian/CZ-10-LIT-A-AHMP.yaml b/data/custodian/CZ-10-LIT-A-AHMP.yaml index 49eabf5403..c994ea9107 100644 --- a/data/custodian/CZ-10-LIT-A-AHMP.yaml +++ b/data/custodian/CZ-10-LIT-A-AHMP.yaml @@ -80,8 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:01:20Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:28:15Z: XXX->LIT via Wikidata Q19672898 coords (50.0400,14.4949) -> Litochleby (GeoNames:3071686)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:45:16Z: Maps: Prague City Archives (conf: 1.00); YouTube: not found' + - 'City resolved 2025-12-07T00:28:15Z: XXX->LIT via Wikidata Q19672898 coords (50.0400,14.4949) + -> Litochleby (GeoNames:3071686)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:45:16Z: Maps: Prague City Archives + (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:03Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -203,7 +205,8 @@ wikidata_enrichment: description: accumulation of historical records of a town or city - id: Q2085381 label: publishing company - description: company that prints and distributes pressed goods or electronic media + description: company that prints and distributes pressed goods or electronic + media wikidata_instance_of: *id005 wikidata_location: country: &id006 @@ -262,7 +265,8 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/109561705096115465097/reviews rating: 5 relative_time_description: a year ago - text: The building dates back to 1995, it's nice and very interesting. I recommend visiting the archive. + text: The building dates back to 1995, it's nice and very interesting. I recommend + visiting the archive. publish_time: '2024-04-21T15:15:20.793882Z' - author_name: Libor Šedivý author_uri: https://www.google.com/maps/contrib/112581391891260052369/reviews @@ -274,20 +278,25 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/102671748185462032746/reviews rating: 3 relative_time_description: 6 years ago - text: You need to communicate at least two weeks before the planned visit so that you don't lose out unnecessarily. You - won't get anything at first... In addition, you need to take into account that they will present you with 5 archival - units in one day, i.e. not 5 cartons, but only 5 folders, regardless of whether it is a bookmark with one sheet or a - package with hundreds of documents... The ambition to complete more extensive research requires much more time than - we are used to in other archives. So much luck, patience and research happiness! 🍀 + text: You need to communicate at least two weeks before the planned visit so that + you don't lose out unnecessarily. You won't get anything at first... In addition, + you need to take into account that they will present you with 5 archival units + in one day, i.e. not 5 cartons, but only 5 folders, regardless of whether it + is a bookmark with one sheet or a package with hundreds of documents... The + ambition to complete more extensive research requires much more time than we + are used to in other archives. So much luck, patience and research happiness! + 🍀 publish_time: '2019-12-01T07:46:56.111948Z' - author_name: Miroslav Havel author_uri: https://www.google.com/maps/contrib/109030248799737237070/reviews rating: 5 relative_time_description: 7 years ago - text: A modern archive building built in the 1990s, I was lucky enough to see the facilities. Willing and quirky workers, - the profession of archivist probably requires a certain amount of perspective, a sense of humor and self-irony. If you - come to a professional course, you can look forward not only to insightful information, but also to great comments and - glosses on current events. It's worth it. 😉 + text: A modern archive building built in the 1990s, I was lucky enough to see + the facilities. Willing and quirky workers, the profession of archivist probably + requires a certain amount of perspective, a sense of humor and self-irony. If + you come to a professional course, you can look forward not only to insightful + information, but also to great comments and glosses on current events. It's + worth it. 😉 publish_time: '2018-03-17T09:34:30.117Z' opening_hours: open_now: false @@ -321,10 +330,12 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: Strong match confirmed by name translation, identical official website, and correct location. The name 'Archiv - hlavního města Prahy' translates to 'Prague City Archives'. The source city is Prague, and the candidate address is - in Praha 4 (Prague 4), Czechia. The provided website 'http://www.ahmp.cz/' matches the institution's official domain. - While Google Place types are generic, the context and website confirm it is an archive, fitting the GRP.HER entity type. + reasoning: Strong match confirmed by name translation, identical official website, + and correct location. The name 'Archiv hlavního města Prahy' translates to 'Prague + City Archives'. The source city is Prague, and the candidate address is in Praha + 4 (Prague 4), Czechia. The provided website 'http://www.ahmp.cz/' matches the + institution's official domain. While Google Place types are generic, the context + and website confirm it is an archive, fitting the GRP.HER entity type. agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -347,3 +358,22 @@ location: formatted_address: Archivní 1280/6, 149 00 Praha 4-Chodov, Czechia geonames_id: 3071686 normalization_timestamp: '2025-12-09T06:49:27.606055+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:48.316223+00:00' + source_url: http://www.ahmp.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.ahmp.cz/img/ahmp_favicon.ico + source_url: http://www.ahmp.cz + css_selector: '[document] > html > head > link' + retrieved_on: '2025-12-23T20:18:48.316223+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PET-A-ANM.yaml b/data/custodian/CZ-10-PET-A-ANM.yaml index b39c9ed21a..4233404240 100644 --- a/data/custodian/CZ-10-PET-A-ANM.yaml +++ b/data/custodian/CZ-10-PET-A-ANM.yaml @@ -39,10 +39,11 @@ ghcid: city_label: Pelc Tyrolka geonames_id: 3068455 ghcid_history: - - previous_ghcid_component: "PT" - new_ghcid_component: "PET" - change_date: "2025-12-20T19:55:24Z" - reason: "Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: Pelc Tyrolka" + - previous_ghcid_component: PT + new_ghcid_component: PET + change_date: '2025-12-20T19:55:24Z' + reason: 'Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: + Pelc Tyrolka' - ghcid: XX-XX-XXX-A-ANM ghcid_numeric: 18249419148031109659 valid_from: '2025-12-06T23:37:44.753389+00:00' @@ -84,7 +85,8 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-06T23:59:55Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:31:35Z: XXX->PT via Wikidata Q25228907 coords (50.1100,14.4347) -> Pelc Tyrolka (GeoNames:3068455)' + - 'City resolved 2025-12-07T00:31:35Z: XXX->PT via Wikidata Q25228907 coords (50.1100,14.4347) + -> Pelc Tyrolka (GeoNames:3068455)' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:08Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:31Z - 'YouTube/Google Maps enrichment 2025-12-09T09:29:38Z: YouTube: not found' @@ -115,8 +117,8 @@ ch_annotator: annotation_metadata: confidence_score: 0.85 verified: false - verification_date: - verified_by: + verification_date: null + verified_by: null entity_claims: - claim_type: full_name claim_value: Archiv Národního muzea @@ -189,8 +191,8 @@ wikidata_enrichment: instance_of: &id005 - id: Q53566456 label: museum archive - description: archive established by a museum to collect, organize, preserve, and provide access to its organizational - records + description: archive established by a museum to collect, organize, preserve, + and provide access to its organizational records - id: Q101470010 label: specialized archives description: type of archives in Czechia @@ -244,3 +246,28 @@ location: youtube_status: NOT_FOUND youtube_search_query: Archiv Národního muzea official youtube_search_timestamp: '2025-12-09T09:29:38.113936+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:56.138622+00:00' + source_url: http://www.nm.cz/Studovny-a-badatelny/Archiv-Narodniho-muzea-studovna + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.nm.cz/file/ad1e3d8b659d4c5536c61a5d693fed81/4/favicon/nmicon.png + source_url: http://www.nm.cz/Studovny-a-badatelny/Archiv-Narodniho-muzea-studovna + css_selector: '[document] > html > head > link:nth-of-type(5)' + retrieved_on: '2025-12-23T20:18:56.138622+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/png + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://www.nm.cz/file/0df9b1550064a10e390fd23d2a739d27/3293/Archiv_foto_terezim.jpg.jpg + source_url: http://www.nm.cz/Studovny-a-badatelny/Archiv-Narodniho-muzea-studovna + css_selector: '[document] > html > head > meta:nth-of-type(4)' + retrieved_on: '2025-12-23T20:18:56.138622+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 1 diff --git a/data/custodian/CZ-10-PET-A-ANTM.yaml b/data/custodian/CZ-10-PET-A-ANTM.yaml index 10936091e2..2281a6643b 100644 --- a/data/custodian/CZ-10-PET-A-ANTM.yaml +++ b/data/custodian/CZ-10-PET-A-ANTM.yaml @@ -39,10 +39,11 @@ ghcid: city_label: Pelc Tyrolka geonames_id: 3068455 ghcid_history: - - previous_ghcid_component: "PT" - new_ghcid_component: "PET" - change_date: "2025-12-20T19:55:24Z" - reason: "Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: Pelc Tyrolka" + - previous_ghcid_component: PT + new_ghcid_component: PET + change_date: '2025-12-20T19:55:24Z' + reason: 'Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: + Pelc Tyrolka' - ghcid: XX-XX-XXX-A-ANTM ghcid_numeric: 9067919020428215504 valid_from: '2025-12-06T23:37:44.282267+00:00' @@ -84,7 +85,8 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:38Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:04:25Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:32:04Z: XXX->PT via Wikidata Q101474199 coords (50.1100,14.4347) -> Pelc Tyrolka (GeoNames:3068455)' + - 'City resolved 2025-12-07T00:32:04Z: XXX->PT via Wikidata Q101474199 coords (50.1100,14.4347) + -> Pelc Tyrolka (GeoNames:3068455)' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:08Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:31Z - 'YouTube/Google Maps enrichment 2025-12-09T09:29:38Z: YouTube: not found' @@ -115,8 +117,8 @@ ch_annotator: annotation_metadata: confidence_score: 0.85 verified: false - verification_date: - verified_by: + verification_date: null + verified_by: null entity_claims: - claim_type: full_name claim_value: Archiv Národního technického muzea @@ -181,8 +183,8 @@ wikidata_enrichment: description: type of archives in Czechia - id: Q53566456 label: museum archive - description: archive established by a museum to collect, organize, preserve, and provide access to its organizational - records + description: archive established by a museum to collect, organize, preserve, + and provide access to its organizational records wikidata_instance_of: *id005 wikidata_location: headquarters_location: @@ -222,3 +224,22 @@ location: youtube_status: NOT_FOUND youtube_search_query: Archiv Národního technického muzea official youtube_search_timestamp: '2025-12-09T09:29:38.777578+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:02.872590+00:00' + source_url: https://www.ntm.cz/archiv-knihovna/archiv-ntm + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.ntm.cz/file/30dc8e5fefba6ceba5d690d796c861ec/2220/favicon/NTM%20EN%20%C4%8Derven%C3%A1%20negativ.png + source_url: https://www.ntm.cz/archiv-knihovna/archiv-ntm + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:02.872590+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/png + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PET-A-MUAAVCRVV.yaml b/data/custodian/CZ-10-PET-A-MUAAVCRVV.yaml index d52246932a..bdf6d07f81 100644 --- a/data/custodian/CZ-10-PET-A-MUAAVCRVV.yaml +++ b/data/custodian/CZ-10-PET-A-MUAAVCRVV.yaml @@ -279,3 +279,28 @@ youtube_status: NOT_FOUND youtube_search_query: Masarykův ústav a Archiv Akademie věd České republiky v.v.i. official youtube_search_timestamp: '2025-12-09T09:29:39.442991+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:08.054373+00:00' + source_url: http://www.mua.cas.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.mua.cas.cz/build/favicon/safari-pinned-tab.svg + source_url: http://www.mua.cas.cz + css_selector: '[document] > html > head > link:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:08.054373+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://mua.greendot.cz/build/img/hp-hero.jpg + source_url: http://www.mua.cas.cz + css_selector: '[document] > html > head > meta:nth-of-type(10)' + retrieved_on: '2025-12-23T20:19:08.054373+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 4 diff --git a/data/custodian/CZ-10-PET-A-NFA.yaml b/data/custodian/CZ-10-PET-A-NFA.yaml index 2d654f75f8..b3775f4119 100644 --- a/data/custodian/CZ-10-PET-A-NFA.yaml +++ b/data/custodian/CZ-10-PET-A-NFA.yaml @@ -39,10 +39,11 @@ ghcid: city_label: Pelc Tyrolka geonames_id: 3068455 ghcid_history: - - previous_ghcid_component: "PT" - new_ghcid_component: "PET" - change_date: "2025-12-20T19:57:18Z" - reason: "Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: Pelc Tyrolka" + - previous_ghcid_component: PT + new_ghcid_component: PET + change_date: '2025-12-20T19:57:18Z' + reason: 'Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: + Pelc Tyrolka' - ghcid: XX-XX-XXX-A-NFA ghcid_numeric: 15166324295331575978 valid_from: '2025-12-06T23:37:43.718883+00:00' @@ -295,3 +296,28 @@ location: youtube_status: NOT_FOUND youtube_search_query: Národní filmový archiv official youtube_search_timestamp: '2025-12-09T09:29:40.114231+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:13.185638+00:00' + source_url: https://nfa.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://nfa.cz/safari-pinned-tab.svg + source_url: https://nfa.cz + css_selector: '[document] > html.no-js.show--consent > head > link:nth-of-type(6)' + retrieved_on: '2025-12-23T20:19:13.185638+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://nfa.cz/dokumenty/74703/image-thumb__74703__OGImage/zastupny-obrazek-open-graph.jpg + source_url: https://nfa.cz + css_selector: '[document] > html.no-js.show--consent > head > meta:nth-of-type(11)' + retrieved_on: '2025-12-23T20:19:13.185638+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 5 diff --git a/data/custodian/CZ-10-PNM-A-ACR.yaml b/data/custodian/CZ-10-PNM-A-ACR.yaml index 3e643b9e65..7fb2a80aa7 100644 --- a/data/custodian/CZ-10-PNM-A-ACR.yaml +++ b/data/custodian/CZ-10-PNM-A-ACR.yaml @@ -35,10 +35,11 @@ ghcid: city_label: Praha-Nove Mesto geonames_id: 11839017 ghcid_history: - - previous_ghcid_component: "PM" - new_ghcid_component: "PNM" - change_date: "2025-12-20T19:55:24Z" - reason: "Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: Praha-Nove Mesto" + - previous_ghcid_component: PM + new_ghcid_component: PNM + change_date: '2025-12-20T19:55:24Z' + reason: 'Fixed 2-letter city code to proper 3-letter code per AGENTS.md. City: + Praha-Nove Mesto' - ghcid: CZ-10-PM-A-ACR ghcid_numeric: 5300371129343583721 valid_from: '2025-12-08T11:21:33.063067+00:00' @@ -56,7 +57,8 @@ ghcid: reason: 'Region resolved via Wikidata P131: XX->10 (CZ-10)' - ghcid: CZ-10-PM-A-AČR valid_from: '2025-12-07T00:27:25.913910+00:00' - reason: 'City resolved via Wikidata Q28563975 coordinates: XXX->PM (Praha-Nove Mesto)' + reason: 'City resolved via Wikidata Q28563975 coordinates: XXX->PM (Praha-Nove + Mesto)' custodian_name: claim_type: custodian_name claim_value: Archiv Českého rozhlasu @@ -84,8 +86,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:39Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:00:58Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:27:25Z: XXX->PM via Wikidata Q28563975 coords (50.0742,14.4428) -> Praha-Nove Mesto (GeoNames:11839017)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:45:47Z: Maps: rejected by LLM; YouTube: not found' + - 'City resolved 2025-12-07T00:27:25Z: XXX->PM via Wikidata Q28563975 coords (50.0742,14.4428) + -> Praha-Nove Mesto (GeoNames:11839017)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:45:47Z: Maps: rejected by LLM; YouTube: + not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:03Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -115,8 +119,8 @@ ch_annotator: annotation_metadata: confidence_score: 0.85 verified: false - verification_date: - verified_by: + verification_date: null + verified_by: null entity_claims: - claim_type: full_name claim_value: Archiv Českého rozhlasu @@ -195,7 +199,8 @@ wikidata_enrichment: headquarters_location: id: Q2444636 label: Prague 2 - description: administrative district, municipal district and municipal part of Prague + description: administrative district, municipal district and municipal part + of Prague wikidata_country: *id005 wikidata_located_in: *id006 wikidata_organization: @@ -214,10 +219,12 @@ wikidata_enrichment: google_maps_status: NO_MATCH google_maps_rejected: candidate_name: Český rozhlas - rejection_reason: The Google Maps candidate 'Český rozhlas' is the main broadcasting corporation (Czech Radio), not the - specific 'Archiv Českého rozhlasu' (Czech Radio Archives). Although the archive is part of this organization and located - at the same address, they are distinct entities. The Google entry's types ('point_of_interest', 'establishment') do not - specify an archive or other heritage institution, leading to a mismatch with the target entity. + rejection_reason: The Google Maps candidate 'Český rozhlas' is the main broadcasting + corporation (Czech Radio), not the specific 'Archiv Českého rozhlasu' (Czech Radio + Archives). Although the archive is part of this organization and located at the + same address, they are distinct entities. The Google entry's types ('point_of_interest', + 'establishment') do not specify an archive or other heritage institution, leading + to a mismatch with the target entity. timestamp: '2025-12-08T19:45:46.803336+00:00' youtube_status: NOT_FOUND youtube_search_query: Archiv Českého rozhlasu official @@ -237,3 +244,28 @@ location: original_timestamp: '2025-12-09T15:34:38.871222+00:00' geonames_name: Praha-Nové Město feature_code: PPL +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:18.778759+00:00' + source_url: https://informace.rozhlas.cz/sluzby-archivu-7965113 + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://informace.rozhlas.cz/sites/all/themes/custom/e7/apple-touch-icon-precomposed-144x144.png + source_url: https://informace.rozhlas.cz/sluzby-archivu-7965113 + css_selector: '[document] > html.js.show--consent > head > link:nth-of-type(4)' + retrieved_on: '2025-12-23T20:19:18.778759+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: 144x144 + - claim_type: og_image_url + claim_value: https://portal.rozhlas.cz/sites/default/files/images/02923537.jpeg + source_url: https://informace.rozhlas.cz/sluzby-archivu-7965113 + css_selector: '[document] > html.js.show--consent > head > meta:nth-of-type(15)' + retrieved_on: '2025-12-23T20:19:18.778759+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 5 diff --git a/data/custodian/CZ-10-PRA-A-AKPR.yaml b/data/custodian/CZ-10-PRA-A-AKPR.yaml index 54513dc72e..e9eaca7930 100644 --- a/data/custodian/CZ-10-PRA-A-AKPR.yaml +++ b/data/custodian/CZ-10-PRA-A-AKPR.yaml @@ -75,9 +75,12 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:39Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:00:05Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:25:25Z: XXX->PRA via Wikidata Q46996293 coords (50.0913,14.4037) -> Prague (GeoNames:3067696)' - - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.447081+00:00. Re-enrichment required with proper matching. - - 'YouTube/Google Maps enrichment 2025-12-08T19:45:54Z: Maps: rejected by LLM; YouTube: not found' + - 'City resolved 2025-12-07T00:25:25Z: XXX->PRA via Wikidata Q46996293 coords (50.0913,14.4037) + -> Prague (GeoNames:3067696)' + - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.447081+00:00. Re-enrichment + required with proper matching. + - 'YouTube/Google Maps enrichment 2025-12-08T19:45:54Z: Maps: rejected by LLM; YouTube: + not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -107,8 +110,8 @@ ch_annotator: annotation_metadata: confidence_score: 0.85 verified: false - verification_date: - verified_by: + verification_date: null + verified_by: null entity_claims: - claim_type: full_name claim_value: Archiv Kanceláře prezidenta republiky @@ -210,10 +213,11 @@ wikidata_enrichment: google_maps_status: NO_MATCH google_maps_rejected: candidate_name: Archiv Pražského hradu - rejection_reason: The source institution 'Archiv Kanceláře prezidenta republiky' (Archive of the Office of the President - of the Republic) and the Google Maps candidate 'Archiv Pražského hradu' (Prague Castle Archive) are two distinct entities. - The former is the presidential archive, while the latter is the archive of Prague Castle. While both are archives located - in Prague, they are not the same institution. + rejection_reason: The source institution 'Archiv Kanceláře prezidenta republiky' + (Archive of the Office of the President of the Republic) and the Google Maps candidate + 'Archiv Pražského hradu' (Prague Castle Archive) are two distinct entities. The + former is the presidential archive, while the latter is the archive of Prague + Castle. While both are archives located in Prague, they are not the same institution. timestamp: '2025-12-08T19:45:54.624391+00:00' youtube_status: NOT_FOUND youtube_search_query: Archiv Kanceláře prezidenta republiky official @@ -233,3 +237,22 @@ location: original_timestamp: '2025-12-09T15:34:38.958575+00:00' geonames_name: Prague feature_code: PPLC +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:26.299080+00:00' + source_url: http://www.prazskyhradarchiv.cz/archivKPR/cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.prazskyhradarchiv.cz/img/safari-pinned-tab.svg + source_url: http://www.prazskyhradarchiv.cz/archivKPR/cz + css_selector: '[document] > html.js > head > link:nth-of-type(5)' + retrieved_on: '2025-12-23T20:19:26.299080+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 4 diff --git a/data/custodian/CZ-10-PRA-A-AMVC.yaml b/data/custodian/CZ-10-PRA-A-AMVC.yaml index e18b834e4b..e1631a0b9a 100644 --- a/data/custodian/CZ-10-PRA-A-AMVC.yaml +++ b/data/custodian/CZ-10-PRA-A-AMVC.yaml @@ -41,7 +41,8 @@ ghcid: reason: 'Country resolved via Wikidata P17: XX→CZ' - ghcid: CZ-10-PRA-A-AMVČ valid_from: '2025-12-07T12:39:42.482491+00:00' - reason: 'Location resolved from institution name pattern: ''Prague'' → region 10, city PRA' + reason: 'Location resolved from institution name pattern: ''Prague'' → region + 10, city PRA' custodian_name: claim_type: custodian_name claim_value: Archiv Ministerstva vnitra ČR @@ -67,7 +68,8 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:56:11Z: XX→CZ via Wikidata P17' - - 'YouTube/Google Maps enrichment 2025-12-08T19:45:59Z: Maps: National Archive (conf: 0.80); YouTube: not found' + - 'YouTube/Google Maps enrichment 2025-12-08T19:45:59Z: Maps: National Archive (conf: + 0.80); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:11:19Z - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z @@ -150,15 +152,16 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/107425710254765644916/reviews rating: 5 relative_time_description: 10 months ago - text: The staff are always very friendly. There's even a lady who speaks English very well. Even without the language, - you can communicate. Everyone is helpful and extremely customer-friendly. I enjoy coming here. + text: The staff are always very friendly. There's even a lady who speaks English + very well. Even without the language, you can communicate. Everyone is helpful + and extremely customer-friendly. I enjoy coming here. publish_time: '2025-02-06T21:43:14.918876Z' - author_name: Barunka G. author_uri: https://www.google.com/maps/contrib/102003756317069132015/reviews rating: 4 relative_time_description: 5 years ago - text: In the research room of the 1st department (Milady Horákové Street), they are very helpful and willing to help you - search for archival materials. + text: In the research room of the 1st department (Milady Horákové Street), they + are very helpful and willing to help you search for archival materials. publish_time: '2020-08-12T16:42:30.619293Z' - author_name: Kamila Svobodova author_uri: https://www.google.com/maps/contrib/109299302074939194601/reviews @@ -210,12 +213,15 @@ google_maps_enrichment: is_match: true confidence: 0.8 entity_type: GRP.HER - reasoning: 'NAME MATCH: The candidate name ''National Archive'' is a general but plausible English translation for the - source ''Archiv Ministerstva vnitra ČR''. The website nacr.cz confirms it is the National Archives of the Czech Republic, - which aligns with the source being a ministry archive. LOCATION MATCH: The address is in Praha (Prague), Czechia, matching - the expected country ''CZ''. TYPE MATCH: The Google Place type ''library'' is an acceptable heritage type. The website - confirms it is an archive institution. ENTITY TYPE: It is a heritage institution (archive). Confidence is not 1.0 due - to the generic English name and lack of a more direct name translation, but the evidence strongly supports a match.' + reasoning: 'NAME MATCH: The candidate name ''National Archive'' is a general but + plausible English translation for the source ''Archiv Ministerstva vnitra ČR''. + The website nacr.cz confirms it is the National Archives of the Czech Republic, + which aligns with the source being a ministry archive. LOCATION MATCH: The address + is in Praha (Prague), Czechia, matching the expected country ''CZ''. TYPE MATCH: + The Google Place type ''library'' is an acceptable heritage type. The website + confirms it is an archive institution. ENTITY TYPE: It is a heritage institution + (archive). Confidence is not 1.0 due to the generic English name and lack of + a more direct name translation, but the evidence strongly supports a match.' agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -236,3 +242,38 @@ location: street_address: M. Horákové 5, Praha 6-Hradčany formatted_address: 133, M. Horákové 5, 160 00 Praha 6-Hradčany, Czechia normalization_timestamp: '2025-12-09T06:49:27.868521+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:31.631386+00:00' + source_url: http://www.nacr.cz + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.nacr.cz/wp-content/themes/narodni_archiv/img/logo_na_en.png + source_url: http://www.nacr.cz + css_selector: '#wrapper-navbar > header.header > div.header__main:nth-of-type(2) + > nav.navbar.navbar-expand-lg > div.navbar-mobile-top > div.navbar-brand > a + > img' + retrieved_on: '2025-12-23T20:19:31.631386+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: National Archives + - claim_type: favicon_url + claim_value: https://www.nacr.cz/wp-content/themes/narodni_archiv/img/favicon/apple-icon-180x180.png + source_url: http://www.nacr.cz + css_selector: '[document] > html > head > link:nth-of-type(9)' + retrieved_on: '2025-12-23T20:19:31.631386+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: 180x180 + - claim_type: og_image_url + claim_value: https://www.nacr.cz/wp-content/uploads/2020/04/podatelna2_22-1.jpg + source_url: http://www.nacr.cz + css_selector: '[document] > html > head > meta:nth-of-type(17)' + retrieved_on: '2025-12-23T20:19:31.631386+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 3 + has_primary_logo: true + has_favicon: true + has_og_image: true + favicon_count: 13 diff --git a/data/custodian/CZ-10-PRA-A-ANBU.yaml b/data/custodian/CZ-10-PRA-A-ANBU.yaml index d573cd23bb..b5a3704905 100644 --- a/data/custodian/CZ-10-PRA-A-ANBU.yaml +++ b/data/custodian/CZ-10-PRA-A-ANBU.yaml @@ -30,7 +30,8 @@ ghcid: city_code: PRA method: WIKIDATA_LOCATION_RESEARCH resolution_timestamp: '2025-12-06T23:54:40.395769+00:00' - resolution_notes: National Security Authority Archive in Prague (national security office) + resolution_notes: National Security Authority Archive in Prague (national security + office) ghcid_history: - ghcid: CZ-10-PRA-A-ANBU ghcid_numeric: 16528725694186928927 @@ -73,8 +74,10 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.454454+00:00. Re-enrichment required with proper matching. - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:08Z: Maps: National Security Authority (conf: 1.00); YouTube: not found' + - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.454454+00:00. Re-enrichment + required with proper matching. + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:08Z: Maps: National Security + Authority (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z ch_annotator: @@ -217,10 +220,12 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/105428557309487306330/reviews rating: 4 relative_time_description: 6 years ago - text: Beer was already being brewed here at the end of the 18th century, and the Košířský brewery was successful. At the - end of the 19th century, its annual output was 40,000 hl. After the First World War, things got worse and worse, and - competition grew stronger. The end came in 1934... The buildings themselves survived the change of owners and are in - good condition today. They represent a nice example of industrial buildings of the time, even with a preserved factory + text: Beer was already being brewed here at the end of the 18th century, and the + Košířský brewery was successful. At the end of the 19th century, its annual + output was 40,000 hl. After the First World War, things got worse and worse, + and competition grew stronger. The end came in 1934... The buildings themselves + survived the change of owners and are in good condition today. They represent + a nice example of industrial buildings of the time, even with a preserved factory chimney. publish_time: '2019-10-17T15:05:18.213430Z' - author_name: Ivo Novotný @@ -273,12 +278,15 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: 'The Google Maps place is a match. 1. NAME MATCH: ''National Security Authority'' is the English translation - of ''Národní bezpečnostního úřadu''. The source name, ''Archiv Národního bezpečnostního úřadu'', identifies this place - as the archive of that authority. The website ''nbu.cz'' confirms the identity. 2. LOCATION MATCH: The address is in - Praha, Czechia, matching the source country (CZ). 3. TYPE MATCH: While the Google Place type is ''local_government_office'', - this is not grounds for rejection. The source explicitly identifies the institution as an ''Archiv'', which falls under - the GRP.HER definition. 4. ENTITY TYPE: The institution is an archive, a type of heritage custodian.' + reasoning: 'The Google Maps place is a match. 1. NAME MATCH: ''National Security + Authority'' is the English translation of ''Národní bezpečnostního úřadu''. + The source name, ''Archiv Národního bezpečnostního úřadu'', identifies this + place as the archive of that authority. The website ''nbu.cz'' confirms the + identity. 2. LOCATION MATCH: The address is in Praha, Czechia, matching the + source country (CZ). 3. TYPE MATCH: While the Google Place type is ''local_government_office'', + this is not grounds for rejection. The source explicitly identifies the institution + as an ''Archiv'', which falls under the GRP.HER definition. 4. ENTITY TYPE: + The institution is an archive, a type of heritage custodian.' agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -299,3 +307,22 @@ location: street_address: Na Popelce 16/2, Praha 5 formatted_address: Na Popelce 16/2, 150 06 Praha 5, Czechia normalization_timestamp: '2025-12-09T06:49:27.913673+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:36.697627+00:00' + source_url: http://www.nbu.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.nbu.cz/templates/nbu/favicon.ico + source_url: http://www.nbu.cz + css_selector: '[document] > html > head > link:nth-of-type(2)' + retrieved_on: '2025-12-23T20:19:36.697627+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/vnd.microsoft.icon + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-ANBUS.yaml b/data/custodian/CZ-10-PRA-A-ANBUS.yaml index 9a419e09a4..c83c79375b 100644 --- a/data/custodian/CZ-10-PRA-A-ANBUS.yaml +++ b/data/custodian/CZ-10-PRA-A-ANBUS.yaml @@ -41,7 +41,8 @@ ghcid: reason: 'Country resolved via Wikidata P17: XX→CZ' - ghcid: CZ-10-PRA-A-ANBÚS valid_from: '2025-12-07T12:39:42.484117+00:00' - reason: 'Location resolved from institution name pattern: ''Prague'' → region 10, city PRA' + reason: 'Location resolved from institution name pattern: ''Prague'' → region + 10, city PRA' custodian_name: claim_type: custodian_name claim_value: Archiv Národního bezpečnostního úřadu - specializovaný @@ -67,8 +68,9 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:56:12Z: XX→CZ via Wikidata P17' - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:12Z: Maps: Ústav pro studium totalitních režimů - Archiv bezpečnostních - složek - Badatelna Na Struze (conf: 0.95); YouTube: not found' + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:12Z: Maps: Ústav pro studium + totalitních režimů - Archiv bezpečnostních složek - Badatelna Na Struze (conf: + 0.95); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:11:19Z - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:27Z @@ -128,7 +130,8 @@ ch_annotator: creation_method: create_custodian_from_ch_annotator.py google_maps_enrichment: place_id: ChIJz-raT_CUC0cRzgWQGob4kwc - name: Ústav pro studium totalitních režimů - Archiv bezpečnostních složek - Badatelna Na Struze + name: Ústav pro studium totalitních režimů - Archiv bezpečnostních složek - Badatelna + Na Struze fetch_timestamp: '2025-12-08T19:46:09.193225+00:00' api_status: OK coordinates: @@ -150,36 +153,41 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/103022670456700368685/reviews rating: 5 relative_time_description: 6 years ago - text: An office where people with passion for the cause work. Experts who can advise and help with research as much as - possible. Safety rules are bearable and understandable at the given place. An agreed meeting at a specific time in which - a specialist in the given issue will also be dedicated to you. This is how every office should work. + text: An office where people with passion for the cause work. Experts who can + advise and help with research as much as possible. Safety rules are bearable + and understandable at the given place. An agreed meeting at a specific time + in which a specialist in the given issue will also be dedicated to you. This + is how every office should work. publish_time: '2018-12-11T20:04:25.713896388Z' - author_name: Salziger Reiter author_uri: https://www.google.com/maps/contrib/109315899389057527570/reviews rating: 5 relative_time_description: a year ago - text: Very helpful and accommodating employees. The request was processed quite quickly and the handover took place as - agreed. + text: Very helpful and accommodating employees. The request was processed quite + quickly and the handover took place as agreed. publish_time: '2024-08-03T15:09:18.204573Z' - author_name: Michala Pickova author_uri: https://www.google.com/maps/contrib/105350789437390454077/reviews rating: 5 relative_time_description: 7 years ago - text: An official miracle. Pleasant people from the gatekeeper to the research room, extremely helpful and willing, the - ladies in the research room were very nice. More such miracles ☺ + text: An official miracle. Pleasant people from the gatekeeper to the research + room, extremely helpful and willing, the ladies in the research room were very + nice. More such miracles ☺ publish_time: '2018-07-17T12:48:37.042Z' - author_name: Michaela Blaháková author_uri: https://www.google.com/maps/contrib/103600310747938101580/reviews rating: 5 relative_time_description: 2 years ago - text: Communication was perfect, everyone was very nice and helpful, nothing was a problem and on the contrary they advised - and helped with everything. Thank you very much. + text: Communication was perfect, everyone was very nice and helpful, nothing was + a problem and on the contrary they advised and helped with everything. Thank + you very much. publish_time: '2022-12-09T21:22:32.106182Z' - author_name: Vasil “Ben Lee Meier” Mohorita author_uri: https://www.google.com/maps/contrib/108576684499626840750/reviews rating: 5 relative_time_description: 5 years ago - text: Thank you very much for your help in finding and subsequently publishing information and facts about Operation Benjamin... + text: Thank you very much for your help in finding and subsequently publishing + information and facts about Operation Benjamin... publish_time: '2020-10-02T21:21:44.310941Z' opening_hours: open_now: false @@ -213,11 +221,14 @@ google_maps_enrichment: is_match: true confidence: 0.95 entity_type: GRP.HER - reasoning: The source name is 'Archiv Národního bezpečnostního úřadu' (Archive of the National Security Office). The candidate - name, 'Ústav pro studium totalitních režimů - Archiv bezpečnostních složek - Badatelna Na Struze', refers to the successor - institution, which took over the archival responsibilities. 'Archiv bezpečnostních složek' (Archive of the Security - Forces) is the modern name for the same collection. The location is correct (Prague, Czechia). While the generic Google - types lack 'archive', the website and detailed name confirm it is an archive. The entity type is a correct match. + reasoning: The source name is 'Archiv Národního bezpečnostního úřadu' (Archive + of the National Security Office). The candidate name, 'Ústav pro studium totalitních + režimů - Archiv bezpečnostních složek - Badatelna Na Struze', refers to the + successor institution, which took over the archival responsibilities. 'Archiv + bezpečnostních složek' (Archive of the Security Forces) is the modern name for + the same collection. The location is correct (Prague, Czechia). While the generic + Google types lack 'archive', the website and detailed name confirm it is an + archive. The entity type is a correct match. agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -238,3 +249,32 @@ location: street_address: Na Struze 229, Nové Město formatted_address: 3, Na Struze 229, Nové Město, 110 00 Praha-Praha 1, Czechia normalization_timestamp: '2025-12-09T06:49:27.953513+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:42.765327+00:00' + source_url: http://www.abscr.cz/cs/provoz-badatelen#struha + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.abscr.cz/wp-content/themes/ustrcr/img/logo.png + source_url: http://www.abscr.cz/cs/provoz-badatelen#struha + css_selector: '[document] > html.no-js > body.wp-singular.page-template-default + > div.container:nth-of-type(2) > header.header.cf > div.header__between.cf:nth-of-type(2) + > p.header__logo > a > img' + retrieved_on: '2025-12-23T20:19:42.765327+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: Archiv bezpečnostních složek + - claim_type: favicon_url + claim_value: https://www.abscr.cz/wp-content/themes/ustrcr/img/favicon.ico + source_url: http://www.abscr.cz/cs/provoz-badatelen#struha + css_selector: '[document] > html.no-js > head > link' + retrieved_on: '2025-12-23T20:19:42.765327+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 2 + has_primary_logo: true + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-APCR.yaml b/data/custodian/CZ-10-PRA-A-APCR.yaml index 9708e9930c..fee4884ad0 100644 --- a/data/custodian/CZ-10-PRA-A-APCR.yaml +++ b/data/custodian/CZ-10-PRA-A-APCR.yaml @@ -80,8 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:04:43Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:27:41Z: XXX->PRA via Wikidata Q101475944 coords (50.0875,14.4214) -> Prague (GeoNames:3067696)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:20Z: Maps: Czech Police Museum (conf: 1.00); YouTube: not found' + - 'City resolved 2025-12-07T00:27:41Z: XXX->PRA via Wikidata Q101475944 coords (50.0875,14.4214) + -> Prague (GeoNames:3067696)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:20Z: Maps: Czech Police Museum + (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -208,17 +210,20 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/110876883927358051472/reviews rating: 3 relative_time_description: 2 months ago - text: '1. I will start with the only & the biggest drawback in the Museum. There are NO English audio guides, NO information - display in English. It become completely impossible to translate each and every wonderful thing kept in the museum. + text: '1. I will start with the only & the biggest drawback in the Museum. There + are NO English audio guides, NO information display in English. It become completely + impossible to translate each and every wonderful thing kept in the museum. - 2. However, I do compliment the efforts which have gone behind in collecting & setting up each and every artefact with - such care & diligence. The aspects of homicide, forensics, uniforms, weapons, equipment’s used & being presently used - by the forces & shown for display is just commendable. + 2. However, I do compliment the efforts which have gone behind in collecting + & setting up each and every artefact with such care & diligence. The aspects + of homicide, forensics, uniforms, weapons, equipment’s used & being presently + used by the forces & shown for display is just commendable. - 3. If the museum is visited with due interest, then 2 hours are required, ticket per person is very minimal, paid parking - is available, washroom is available within the facility. + 3. If the museum is visited with due interest, then 2 hours are required, ticket + per person is very minimal, paid parking is available, washroom is available + within the facility. A must visit place.' @@ -227,31 +232,36 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/107353925454189365150/reviews rating: 5 relative_time_description: 3 months ago - text: Large museum, all possible aspects of police activities and history from the end of 18th century are covered. There - are some interactive parts (including musical instruments, Identikit computer tool, etc), although most of the exposition - are old-fashioned glass showcases. The building itself is also interesting, it's former Augustines monastery. + text: Large museum, all possible aspects of police activities and history from + the end of 18th century are covered. There are some interactive parts (including + musical instruments, Identikit computer tool, etc), although most of the exposition + are old-fashioned glass showcases. The building itself is also interesting, + it's former Augustines monastery. publish_time: '2025-08-14T13:30:50.341602510Z' - author_name: Francis author_uri: https://www.google.com/maps/contrib/112502021501125017989/reviews rating: 5 relative_time_description: 5 months ago - text: Interesting museum. The museum is huge and it will take you around an hour to get through. It’s also cheap to access. - The museum covers a substantial amount of history. The only disappointment is that it’s not friendly to English speakers + text: Interesting museum. The museum is huge and it will take you around an hour + to get through. It’s also cheap to access. The museum covers a substantial amount + of history. The only disappointment is that it’s not friendly to English speakers as mostly everything is in Czech only. Still worth a visit though. publish_time: '2025-07-03T08:19:55.748706435Z' - author_name: grace frances author_uri: https://www.google.com/maps/contrib/102980876546458200665/reviews rating: 5 relative_time_description: 4 months ago - text: This was amazing! There was so so much to see and there were interactive areas of the museum. Me and my friends - had a really fun time and i would highly recommend. + text: This was amazing! There was so so much to see and there were interactive + areas of the museum. Me and my friends had a really fun time and i would highly + recommend. publish_time: '2025-08-05T12:35:18.729156215Z' - author_name: Gordon Crawford author_uri: https://www.google.com/maps/contrib/117476350871498178843/reviews rating: 5 relative_time_description: 3 months ago - text: Really big museum so to do it justice give yourself plenty of time. Well laid out. Lots of motorcycles, uniforms - and all aspects of the history of the police. + text: Really big museum so to do it justice give yourself plenty of time. Well + laid out. Lots of motorcycles, uniforms and all aspects of the history of the + police. publish_time: '2025-08-21T18:57:16.423398324Z' opening_hours: open_now: false @@ -285,11 +295,13 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: 'NAME MATCH: The Google Place name ''Czech Police Museum'' is a direct English translation of the source name - ''Archiv Policie České republiky'' (Archive of the Police of the Czech Republic), as confirmed by the museum''s website - which states it is part of the police archives. TYPE MATCH: The Google Place type ''museum'' is in the list of expected - types for GRP.HER. LOCATION MATCH: The address is in Praha (Prague), Czechia, matching the country ''CZ'' of the source. - ENTITY TYPE: It is a museum, a type of heritage custodian.' + reasoning: 'NAME MATCH: The Google Place name ''Czech Police Museum'' is a direct + English translation of the source name ''Archiv Policie České republiky'' (Archive + of the Police of the Czech Republic), as confirmed by the museum''s website + which states it is part of the police archives. TYPE MATCH: The Google Place + type ''museum'' is in the list of expected types for GRP.HER. LOCATION MATCH: + The address is in Praha (Prague), Czechia, matching the country ''CZ'' of the + source. ENTITY TYPE: It is a museum, a type of heritage custodian.' agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -312,3 +324,23 @@ location: formatted_address: Ke Karlovu 453/1, Nové Město, 120 00 Praha-Praha 2, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.022072+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:50.538752+00:00' + source_url: https://www.muzeumpolicie.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.muzeumpolicie.cz/img/SERVER_logo.png + source_url: https://www.muzeumpolicie.cz + css_selector: '[document] > html.fontawesome-i2svg-active.fontawesome-i2svg-complete + > head > link:nth-of-type(6)' + retrieved_on: '2025-12-23T20:19:50.538752+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/x-icon + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-APH.yaml b/data/custodian/CZ-10-PRA-A-APH.yaml index 11b4b91db4..b3bd2d4efc 100644 --- a/data/custodian/CZ-10-PRA-A-APH.yaml +++ b/data/custodian/CZ-10-PRA-A-APH.yaml @@ -79,9 +79,11 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:54:39Z: XX→CZ via Wikidata P17' - - 'City resolved 2025-12-07T00:35:11Z: XXX->PRA via Wikidata Q46996155 coords (50.0902,14.3987) -> Prague (GeoNames:3067696)' + - 'City resolved 2025-12-07T00:35:11Z: XXX->PRA via Wikidata Q46996155 coords (50.0902,14.3987) + -> Prague (GeoNames:3067696)' - 'Region resolved 2025-12-07T11:29:19Z: XX->10 via Wikidata P131 (CZ-10)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:23Z: Maps: Archiv Pražského hradu (conf: 1.00); YouTube: not found' + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:23Z: Maps: Archiv Pražského hradu + (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -256,20 +258,22 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/105730745460531884055/reviews rating: 4 relative_time_description: 5 years ago - text: 'The archives of prague castle with all documents are preserved here. The artifacts and scrolls were found in the - 1920 and preserved. + text: 'The archives of prague castle with all documents are preserved here. The + artifacts and scrolls were found in the 1920 and preserved. All the repairs , restoration about the place is maintained and organised well. - The tour is very interesting with the guide explaining all details about this place.' + The tour is very interesting with the guide explaining all details about this + place.' publish_time: '2020-08-14T16:07:35.186666Z' - author_name: Andy Stewart author_uri: https://www.google.com/maps/contrib/109939887611010111342/reviews rating: 5 relative_time_description: 7 years ago - text: Keep saying this, but so many beautifully kept historic buildings in one area, another one well worth visiting. + text: Keep saying this, but so many beautifully kept historic buildings in one + area, another one well worth visiting. publish_time: '2018-10-17T19:56:38.574233136Z' - author_name: Kevin Lu author_uri: https://www.google.com/maps/contrib/111795433458884676995/reviews @@ -281,15 +285,15 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/117295849426126661636/reviews rating: 5 relative_time_description: 2 years ago - text: The castle's documents are kept in the archives. The exterior is breathtaking. It is elegantly decorated. The entrance - overlooks the square. + text: The castle's documents are kept in the archives. The exterior is breathtaking. + It is elegantly decorated. The entrance overlooks the square. publish_time: '2023-01-03T21:16:17.193792Z' - author_name: Davide Nardoni author_uri: https://www.google.com/maps/contrib/111964551332611513527/reviews rating: 5 relative_time_description: 2 years ago - text: The Prague Castle archives are kept here. A modern building with no architectural impact, located to the side of - the famous Prague Cathedral. + text: The Prague Castle archives are kept here. A modern building with no architectural + impact, located to the side of the famous Prague Cathedral. publish_time: '2022-12-31T08:30:56.561469Z' photo_count: 10 photos_metadata: @@ -313,8 +317,9 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: Excellent match. The names are identical, the location is in the expected city and country (Czechia), and the - Google Place type 'library' is a perfect match for the expected heritage institution type. + reasoning: Excellent match. The names are identical, the location is in the expected + city and country (Czechia), and the Google Place type 'library' is a perfect + match for the expected heritage institution type. agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -337,3 +342,28 @@ location: formatted_address: III. nádvoří 119 08, 119 00 Praha 1-Hrad, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.061633+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:55.330200+00:00' + source_url: https://www.prazskyhradarchiv.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.prazskyhradarchiv.cz/img/safari-pinned-tab.svg + source_url: https://www.prazskyhradarchiv.cz + css_selector: '[document] > html.js > head > link:nth-of-type(5)' + retrieved_on: '2025-12-23T20:19:55.330200+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://www.prazskyhradarchiv.cz/img/social-media-logo.png + source_url: https://www.prazskyhradarchiv.cz + css_selector: '[document] > html.js > head > meta:nth-of-type(14)' + retrieved_on: '2025-12-23T20:19:55.330200+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 4 diff --git a/data/custodian/CZ-10-PRA-A-APS.yaml b/data/custodian/CZ-10-PRA-A-APS.yaml index 8e80005b9b..d135ca772a 100644 --- a/data/custodian/CZ-10-PRA-A-APS.yaml +++ b/data/custodian/CZ-10-PRA-A-APS.yaml @@ -80,9 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:00:23Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:31:16Z: XXX->PRA via Wikidata Q55025755 coords (50.0875,14.4214) -> Prague (GeoNames:3067696)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:31Z: Maps: Parlamentní knihovna a Archiv Poslanecké sněmovny (conf: 1.00); - YouTube: not found' + - 'City resolved 2025-12-07T00:31:16Z: XXX->PRA via Wikidata Q55025755 coords (50.0875,14.4214) + -> Prague (GeoNames:3067696)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:31Z: Maps: Parlamentní knihovna + a Archiv Poslanecké sněmovny (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -261,12 +262,14 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: 'NAME MATCH: The Google Maps name ''Parlamentní knihovna a Archiv Poslanecké sněmovny'' (Parliamentary Library - and Archive of the Chamber of Deputies) explicitly contains the source institution''s name ''Archiv Poslanecké sněmovny''. - LOCATION MATCH: Both are in the Czech Republic (CZ), with the Google Maps address specifying Prague. TYPE MATCH: Although - Google''s types (''government_office'') are not in the expected list, the name confirms it is an ''archiv'' (archive) - and ''knihovna'' (library), which are heritage institution types. ENTITY TYPE: The entity is an archive, which is a - type of Heritage Custodian (GRP.HER).' + reasoning: 'NAME MATCH: The Google Maps name ''Parlamentní knihovna a Archiv Poslanecké + sněmovny'' (Parliamentary Library and Archive of the Chamber of Deputies) explicitly + contains the source institution''s name ''Archiv Poslanecké sněmovny''. LOCATION + MATCH: Both are in the Czech Republic (CZ), with the Google Maps address specifying + Prague. TYPE MATCH: Although Google''s types (''government_office'') are not + in the expected list, the name confirms it is an ''archiv'' (archive) and ''knihovna'' + (library), which are heritage institution types. ENTITY TYPE: The entity is + an archive, which is a type of Heritage Custodian (GRP.HER).' agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -289,3 +292,22 @@ location: formatted_address: Komunardů 1634/44, 170 00 Praha 7-Holešovice, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.101039+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:00.532753+00:00' + source_url: https://www.psp.cz/sqw/hp.sqw + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.psp.cz/favicon.ico + source_url: https://www.psp.cz/sqw/hp.sqw + css_selector: '[document] > html > head > link' + retrieved_on: '2025-12-23T20:20:00.532753+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/x-icon + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-ASMVCR.yaml b/data/custodian/CZ-10-PRA-A-ASMVCR.yaml index 744a2e5bff..4d1637c7bf 100644 --- a/data/custodian/CZ-10-PRA-A-ASMVCR.yaml +++ b/data/custodian/CZ-10-PRA-A-ASMVCR.yaml @@ -73,8 +73,10 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:54:38Z: XX→CZ via Wikidata P17' - - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.461153+00:00. Re-enrichment required with proper matching. - - 'YouTube/Google Maps enrichment 2025-12-08T19:46:35Z: Maps: National Archive (conf: 0.90); YouTube: not found' + - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.461153+00:00. Re-enrichment + required with proper matching. + - 'YouTube/Google Maps enrichment 2025-12-08T19:46:35Z: Maps: National Archive (conf: + 0.90); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -197,15 +199,16 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/107425710254765644916/reviews rating: 5 relative_time_description: 10 months ago - text: The staff are always very friendly. There's even a lady who speaks English very well. Even without the language, - you can communicate. Everyone is helpful and extremely customer-friendly. I enjoy coming here. + text: The staff are always very friendly. There's even a lady who speaks English + very well. Even without the language, you can communicate. Everyone is helpful + and extremely customer-friendly. I enjoy coming here. publish_time: '2025-02-06T21:43:14.918876Z' - author_name: Barunka G. author_uri: https://www.google.com/maps/contrib/102003756317069132015/reviews rating: 4 relative_time_description: 5 years ago - text: In the research room of the 1st department (Milady Horákové Street), they are very helpful and willing to help you - search for archival materials. + text: In the research room of the 1st department (Milady Horákové Street), they + are very helpful and willing to help you search for archival materials. publish_time: '2020-08-12T16:42:30.619293Z' - author_name: Kamila Svobodova author_uri: https://www.google.com/maps/contrib/109299302074939194601/reviews @@ -257,11 +260,13 @@ google_maps_enrichment: is_match: true confidence: 0.9 entity_type: GRP.HER - reasoning: 'High confidence match. The names refer to the same institution: ''Achivní správa Ministerstva vnitra České - republiky'' (Archive Administration of the Ministry of the Interior of the Czech Republic) is commonly translated as - ''National Archive'', as confirmed by its website. The location is in Prague, Czechia, matching the source country. - The Google place type ''library'' is a strong proxy for an archive and fits the expected heritage institution types. - The source is an archive, and archives are a core heritage custodian type (glam:HeritageCustodian).' + reasoning: 'High confidence match. The names refer to the same institution: ''Achivní + správa Ministerstva vnitra České republiky'' (Archive Administration of the + Ministry of the Interior of the Czech Republic) is commonly translated as ''National + Archive'', as confirmed by its website. The location is in Prague, Czechia, + matching the source country. The Google place type ''library'' is a strong proxy + for an archive and fits the expected heritage institution types. The source + is an archive, and archives are a core heritage custodian type (glam:HeritageCustodian).' agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -282,3 +287,38 @@ location: street_address: M. Horákové 5, Praha 6-Hradčany formatted_address: 133, M. Horákové 5, 160 00 Praha 6-Hradčany, Czechia normalization_timestamp: '2025-12-09T06:49:28.148618+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:05.833479+00:00' + source_url: http://www.nacr.cz + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.nacr.cz/wp-content/themes/narodni_archiv/img/logo_na_en.png + source_url: http://www.nacr.cz + css_selector: '#wrapper-navbar > header.header > div.header__main:nth-of-type(2) + > nav.navbar.navbar-expand-lg > div.navbar-mobile-top > div.navbar-brand > a + > img' + retrieved_on: '2025-12-23T20:20:05.833479+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: National Archives + - claim_type: favicon_url + claim_value: https://www.nacr.cz/wp-content/themes/narodni_archiv/img/favicon/apple-icon-180x180.png + source_url: http://www.nacr.cz + css_selector: '[document] > html > head > link:nth-of-type(9)' + retrieved_on: '2025-12-23T20:20:05.833479+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: 180x180 + - claim_type: og_image_url + claim_value: https://www.nacr.cz/wp-content/uploads/2020/04/podatelna2_22-1.jpg + source_url: http://www.nacr.cz + css_selector: '[document] > html > head > meta:nth-of-type(17)' + retrieved_on: '2025-12-23T20:20:05.833479+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 3 + has_primary_logo: true + has_favicon: true + has_og_image: true + favicon_count: 13 diff --git a/data/custodian/CZ-10-PRA-A-AUPZSI.yaml b/data/custodian/CZ-10-PRA-A-AUPZSI.yaml index 43ce425384..fbb05fdf77 100644 --- a/data/custodian/CZ-10-PRA-A-AUPZSI.yaml +++ b/data/custodian/CZ-10-PRA-A-AUPZSI.yaml @@ -80,9 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:01:05Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:37:04Z: XXX->PRA via Wikidata Q101475934 coords (50.0875,14.4214) -> Prague (GeoNames:3067696)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:47:00Z: Maps: Office for Foreign Relations and Information (conf: 0.90); - YouTube: not found' + - 'City resolved 2025-12-07T00:37:04Z: XXX->PRA via Wikidata Q101475934 coords (50.0875,14.4214) + -> Prague (GeoNames:3067696)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:47:00Z: Maps: Office for Foreign + Relations and Information (conf: 0.90); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -223,10 +224,12 @@ google_maps_enrichment: is_match: true confidence: 0.9 entity_type: GRP.HER - reasoning: The name is a direct translation (Archiv Úřadu pro zahraniční styky a informace -> Office for Foreign Relations - and Information). The location is a perfect match, being in the same district of Prague, Czech Republic. The website - (uzsi.cz) confirms the identity. The name 'Archiv...' explicitly states it is an archive, and the Wikidata ID Q101475934 - corresponds to the National Security Archive. While the Google Place types are generic, the contextual evidence strongly + reasoning: The name is a direct translation (Archiv Úřadu pro zahraniční styky + a informace -> Office for Foreign Relations and Information). The location is + a perfect match, being in the same district of Prague, Czech Republic. The website + (uzsi.cz) confirms the identity. The name 'Archiv...' explicitly states it is + an archive, and the Wikidata ID Q101475934 corresponds to the National Security + Archive. While the Google Place types are generic, the contextual evidence strongly supports this being an archive, which is a type of heritage institution. agent: glm-4.6 verified: true @@ -250,3 +253,22 @@ location: formatted_address: Střelničná 1673/10, 182 00 Praha 8-Kobylisy, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.326287+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:29.242351+00:00' + source_url: https://www.uzsi.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.uzsi.cz/favicon.png + source_url: https://www.uzsi.cz + css_selector: '[document] > html > head > link' + retrieved_on: '2025-12-23T20:20:29.242351+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-AZMVP.yaml b/data/custodian/CZ-10-PRA-A-AZMVP.yaml index c918ec140a..0f485bdda0 100644 --- a/data/custodian/CZ-10-PRA-A-AZMVP.yaml +++ b/data/custodian/CZ-10-PRA-A-AZMVP.yaml @@ -80,9 +80,10 @@ provenance: notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - 'Region resolved 2025-12-07T00:02:00Z: XX->10 via Wikidata P131 (CZ-10)' - - 'City resolved 2025-12-07T00:31:18Z: XXX->PRA via Wikidata Q101493927 coords (50.0875,14.4214) -> Prague (GeoNames:3067696)' - - 'YouTube/Google Maps enrichment 2025-12-08T19:47:03Z: Maps: Archiv Židovského muzea v Praze (conf: 1.00); YouTube: not - found' + - 'City resolved 2025-12-07T00:31:18Z: XXX->PRA via Wikidata Q101493927 coords (50.0875,14.4214) + -> Prague (GeoNames:3067696)' + - 'YouTube/Google Maps enrichment 2025-12-08T19:47:03Z: Maps: Archiv Židovského + muzea v Praze (conf: 1.00); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -182,14 +183,16 @@ wikidata_enrichment: instance_of: &id004 - id: Q12161242 label: private archive - description: archival collection or institution that is not accessible to the public + description: archival collection or institution that is not accessible to the + public - id: Q53566456 label: museum archive - description: archive established by a museum to collect, organize, preserve, and provide access to its organizational - records + description: archive established by a museum to collect, organize, preserve, + and provide access to its organizational records - id: Q1307560 label: Jewish museum - description: type of museum that documents the history and culture of the Jewish people + description: type of museum that documents the history and culture of the Jewish + people wikidata_instance_of: *id004 wikidata_location: headquarters_location: @@ -242,7 +245,8 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/115657411803981337484/reviews rating: 5 relative_time_description: 7 years ago - text: Peace and prosperity, to those far and near (inscription on the synagogue wall) + text: Peace and prosperity, to those far and near (inscription on the synagogue + wall) publish_time: '2018-08-21T16:08:34.751790Z' - author_name: Stanka Černáková author_uri: https://www.google.com/maps/contrib/113668783615557513346/reviews @@ -290,8 +294,9 @@ google_maps_enrichment: is_match: true confidence: 1.0 entity_type: GRP.HER - reasoning: The candidate's name exactly matches the source institution. The location is in Prague, Czechia, matching the - expected country. The Google Place type 'museum' is an expected type for a heritage institution. The website confirms + reasoning: The candidate's name exactly matches the source institution. The location + is in Prague, Czechia, matching the expected country. The Google Place type + 'museum' is an expected type for a heritage institution. The website confirms this is the archive of the Jewish Museum in Prague, solidifying the match. agent: glm-4.6 verified: true @@ -315,3 +320,28 @@ location: formatted_address: 32, Stroupežnického 290, Anděl, 150 00 Praha-Praha 5, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.367270+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:34.817680+00:00' + source_url: https://www.jewishmuseum.cz/sbirky-a-vyzkum/sbirky-a-fondy/archiv-sbirky-a-fondy + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://c.jewishmuseum.cz/images/design/favicon.ico + source_url: https://www.jewishmuseum.cz/sbirky-a-vyzkum/sbirky-a-fondy/archiv-sbirky-a-fondy + css_selector: '[document] > html.show--consent > head > link' + retrieved_on: '2025-12-23T20:20:34.817680+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: https://c.jewishmuseum.cz/images/design/2013/zmp-logo-fb.png + source_url: https://www.jewishmuseum.cz/sbirky-a-vyzkum/sbirky-a-fondy/archiv-sbirky-a-fondy + css_selector: '[document] > html.show--consent > head > meta:nth-of-type(9)' + retrieved_on: '2025-12-23T20:20:34.817680+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 1 diff --git a/data/custodian/CZ-10-PRA-A-BAKPR.yaml b/data/custodian/CZ-10-PRA-A-BAKPR.yaml index 0bb0501f6f..d5dc937a23 100644 --- a/data/custodian/CZ-10-PRA-A-BAKPR.yaml +++ b/data/custodian/CZ-10-PRA-A-BAKPR.yaml @@ -74,10 +74,13 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:54:40Z: XX→CZ via Wikidata P17' - - 'City resolved 2025-12-07T00:36:54Z: XXX->PRA via Wikidata Q46996293 coords (50.0913,14.4037) -> Prague (GeoNames:3067696)' + - 'City resolved 2025-12-07T00:36:54Z: XXX->PRA via Wikidata Q46996293 coords (50.0913,14.4037) + -> Prague (GeoNames:3067696)' - 'Region resolved 2025-12-07T11:30:40Z: XX->10 via Wikidata P131 (CZ-10)' - - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.468069+00:00. Re-enrichment required with proper matching. - - 'YouTube/Google Maps enrichment 2025-12-08T19:47:07Z: Maps: Archiv Pražského hradu (conf: 0.95); YouTube: not found' + - Removed incorrect wikidata_enrichment on 2025-12-08T08:18:45.468069+00:00. Re-enrichment + required with proper matching. + - 'YouTube/Google Maps enrichment 2025-12-08T19:47:07Z: Maps: Archiv Pražského hradu + (conf: 0.95); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z ch_annotator: @@ -234,20 +237,22 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/105730745460531884055/reviews rating: 4 relative_time_description: 5 years ago - text: 'The archives of prague castle with all documents are preserved here. The artifacts and scrolls were found in the - 1920 and preserved. + text: 'The archives of prague castle with all documents are preserved here. The + artifacts and scrolls were found in the 1920 and preserved. All the repairs , restoration about the place is maintained and organised well. - The tour is very interesting with the guide explaining all details about this place.' + The tour is very interesting with the guide explaining all details about this + place.' publish_time: '2020-08-14T16:07:35.186666Z' - author_name: Andy Stewart author_uri: https://www.google.com/maps/contrib/109939887611010111342/reviews rating: 5 relative_time_description: 7 years ago - text: Keep saying this, but so many beautifully kept historic buildings in one area, another one well worth visiting. + text: Keep saying this, but so many beautifully kept historic buildings in one + area, another one well worth visiting. publish_time: '2018-10-17T19:56:38.574233136Z' - author_name: Kevin Lu author_uri: https://www.google.com/maps/contrib/111795433458884676995/reviews @@ -259,15 +264,15 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/117295849426126661636/reviews rating: 5 relative_time_description: 2 years ago - text: The castle's documents are kept in the archives. The exterior is breathtaking. It is elegantly decorated. The entrance - overlooks the square. + text: The castle's documents are kept in the archives. The exterior is breathtaking. + It is elegantly decorated. The entrance overlooks the square. publish_time: '2023-01-03T21:16:17.193792Z' - author_name: Davide Nardoni author_uri: https://www.google.com/maps/contrib/111964551332611513527/reviews rating: 5 relative_time_description: 2 years ago - text: The Prague Castle archives are kept here. A modern building with no architectural impact, located to the side of - the famous Prague Cathedral. + text: The Prague Castle archives are kept here. A modern building with no architectural + impact, located to the side of the famous Prague Cathedral. publish_time: '2022-12-31T08:30:56.561469Z' photo_count: 10 photos_metadata: @@ -291,12 +296,15 @@ google_maps_enrichment: is_match: true confidence: 0.95 entity_type: GRP.HER - reasoning: The names refer to the same institution. The source name is 'Bezpečnostní archiv Kanceláře prezidenta republiky' - (Security Archive of the Office of the President of the Republic) and the Google Maps name is 'Archiv Pražského hradu' - (Archive of Prague Castle). Research confirms that the Bezpečnostní archiv is the legal successor to the Archive of - Prague Castle and is located within the Prague Castle complex. The location is a correct match (Praha 1, Czechia). The - Google Place type 'library' is consistent with the expected heritage institution types. The entity is a heritage custodian - institution. Therefore, it is a high-confidence match. + reasoning: The names refer to the same institution. The source name is 'Bezpečnostní + archiv Kanceláře prezidenta republiky' (Security Archive of the Office of the + President of the Republic) and the Google Maps name is 'Archiv Pražského hradu' + (Archive of Prague Castle). Research confirms that the Bezpečnostní archiv is + the legal successor to the Archive of Prague Castle and is located within the + Prague Castle complex. The location is a correct match (Praha 1, Czechia). The + Google Place type 'library' is consistent with the expected heritage institution + types. The entity is a heritage custodian institution. Therefore, it is a high-confidence + match. agent: glm-4.6 verified: true ch_annotator_version: ch_annotator-v1_7_0 @@ -319,3 +327,22 @@ location: formatted_address: III. nádvoří 119 08, 119 00 Praha 1-Hrad, Czechia geonames_id: 3067696 normalization_timestamp: '2025-12-09T06:49:28.406971+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:39.662727+00:00' + source_url: http://www.prazskyhradarchiv.cz/archivKPR/cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.prazskyhradarchiv.cz/img/safari-pinned-tab.svg + source_url: http://www.prazskyhradarchiv.cz/archivKPR/cz + css_selector: '[document] > html.js > head > link:nth-of-type(5)' + retrieved_on: '2025-12-23T20:20:39.662727+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 4 diff --git a/data/custodian/CZ-10-PRA-A-BAVZ.yaml b/data/custodian/CZ-10-PRA-A-BAVZ.yaml index 1b7a89ac81..1a184bda1a 100644 --- a/data/custodian/CZ-10-PRA-A-BAVZ.yaml +++ b/data/custodian/CZ-10-PRA-A-BAVZ.yaml @@ -36,7 +36,8 @@ ghcid: reason: 'Country resolved via Wikidata P17: XX→CZ' - ghcid: CZ-10-PRA-A-BAVZ valid_from: '2025-12-07T12:39:42.485462+00:00' - reason: 'Location resolved from institution name pattern: ''Prague'' → region 10, city PRA' + reason: 'Location resolved from institution name pattern: ''Prague'' → region + 10, city PRA' custodian_name: claim_type: custodian_name claim_value: Bezpečnostní archiv Vojenského zpravodajství @@ -62,7 +63,8 @@ provenance: confidence_score: 0.85 notes: - 'Country resolved 2025-12-06T23:56:12Z: XX→CZ via Wikidata P17' - - 'YouTube/Google Maps enrichment 2025-12-08T19:47:15Z: Maps: Central Military Archives (conf: 0.80); YouTube: not found' + - 'YouTube/Google Maps enrichment 2025-12-08T19:47:15Z: Maps: Central Military Archives + (conf: 0.80); YouTube: not found' - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:11:19Z - Canonical location added via normalize_custodian_files.py on 2025-12-08T23:48:04Z - Canonical location added via normalize_custodian_files.py on 2025-12-09T06:49:28Z @@ -156,22 +158,25 @@ google_maps_enrichment: author_uri: https://www.google.com/maps/contrib/110616501951342355701/reviews rating: 5 relative_time_description: 6 years ago - text: I visited my former colleagues at the reception desk here. The archive has beautiful plant decorations in front - of the entrance and in the lobby. It is peaceful and quiet. Many people come here to do research. + text: I visited my former colleagues at the reception desk here. The archive has + beautiful plant decorations in front of the entrance and in the lobby. It is + peaceful and quiet. Many people come here to do research. publish_time: '2019-10-04T22:45:28.959462Z' - author_name: Vasil “Ben Lee Meier” Mohorita author_uri: https://www.google.com/maps/contrib/108576684499626840750/reviews rating: 5 relative_time_description: 3 years ago - text: The first time I was in that area was for my cousin's swearing-in ceremony. Around 1967. Then I went there on the - occasion of the commemorations of November 17, 1939. Today I go there to the army archives. + text: The first time I was in that area was for my cousin's swearing-in ceremony. + Around 1967. Then I went there on the occasion of the commemorations of November + 17, 1939. Today I go there to the army archives. publish_time: '2022-01-11T21:48:03.858557Z' - author_name: Diiinka author_uri: https://www.google.com/maps/contrib/101993642396958778679/reviews rating: 2 relative_time_description: 3 years ago - text: There is no coffee machine in the archive, nor any facilities outside the research room. The reception staff is - reluctant and uninformed. The archival materials in the cardboard boxes were numbered, but they were very confusingly + text: There is no coffee machine in the archive, nor any facilities outside the + research room. The reception staff is reluctant and uninformed. The archival + materials in the cardboard boxes were numbered, but they were very confusingly shuffled. publish_time: '2022-02-05T13:49:32.848868Z' photo_count: 10 @@ -196,10 +201,12 @@ google_maps_enrichment: is_match: true confidence: 0.8 entity_type: GRP.HER - reasoning: The Google Maps candidate 'Central Military Archives' is a likely translation or official English name for - the source institution 'Bezpečnostní archiv Vojenského zpravodajství'. The location in Praha (CZ) is a match. The website - (vuapraha.cz) corresponds to the Vojenský ústřední archiv (Central Military Archive), confirming the institutional identity. - Although Google lacks a specific 'archive' type, the candidate is a military archive, which is a type of heritage institution + reasoning: The Google Maps candidate 'Central Military Archives' is a likely translation + or official English name for the source institution 'Bezpečnostní archiv Vojenského + zpravodajství'. The location in Praha (CZ) is a match. The website (vuapraha.cz) + corresponds to the Vojenský ústřední archiv (Central Military Archive), confirming + the institutional identity. Although Google lacks a specific 'archive' type, + the candidate is a military archive, which is a type of heritage institution (GRP.HER). agent: glm-4.6 verified: true @@ -221,3 +228,22 @@ location: street_address: Pilotů 217/12, Praha 6 formatted_address: Pilotů 217/12, 161 00 Praha 6, Czechia normalization_timestamp: '2025-12-09T06:49:28.469190+00:00' +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:47.483767+00:00' + source_url: https://www.vuapraha.cz + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.vuapraha.cz/wp-content/themes/iq-theme/dist/img/safari-pinned-tab.svg + source_url: https://www.vuapraha.cz + css_selector: '[document] > html > head > link:nth-of-type(4)' + retrieved_on: '2025-12-23T20:20:47.483767+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 7 diff --git a/data/custodian/JP-10-MAE-L-LITCGU.yaml b/data/custodian/JP-10-MAE-L-LITCGU.yaml index 13ff4b47ff..dfb50a109f 100644 --- a/data/custodian/JP-10-MAE-L-LITCGU.yaml +++ b/data/custodian/JP-10-MAE-L-LITCGU.yaml @@ -211,3 +211,22 @@ location: geonames_id: 1857843 geonames_name: Maebashi feature_code: PPLA +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:17:47.555064+00:00' + source_url: http://www.media.gunma-u.ac.jp + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: https://www.media.gunma-u.ac.jp/assets/templates/media/favicon.ico + source_url: http://www.media.gunma-u.ac.jp + css_selector: '[document] > html > head > link:nth-of-type(2)' + retrieved_on: '2025-12-23T20:17:47.555064+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + summary: + total_claims: 1 + has_primary_logo: false + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/JP-10-MAE-L-ML-maebashikokadaigakufuzoku_library.yaml b/data/custodian/JP-10-MAE-L-ML-maebashikokadaigakufuzoku_library.yaml index 7c254741a3..d23e69de4a 100644 --- a/data/custodian/JP-10-MAE-L-ML-maebashikokadaigakufuzoku_library.yaml +++ b/data/custodian/JP-10-MAE-L-ML-maebashikokadaigakufuzoku_library.yaml @@ -207,3 +207,30 @@ location: geonames_id: 1857843 geonames_name: Maebashi feature_code: PPLA +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:16.281140+00:00' + source_url: http://www.maebashi-it.ac.jp/library + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.maebashi-it.ac.jp/images/logo.gif + source_url: http://www.maebashi-it.ac.jp/library + css_selector: '#h_logo > a > img' + retrieved_on: '2025-12-23T20:18:16.281140+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: 前橋工科大学 + - claim_type: favicon_url + claim_value: https://www.maebashi-it.ac.jp/favicon.ico + source_url: http://www.maebashi-it.ac.jp/library + css_selector: '[document] > html > head > link' + retrieved_on: '2025-12-23T20:18:16.281140+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: image/vnd.microsoft.icon + favicon_sizes: '' + summary: + total_claims: 2 + has_primary_logo: true + has_favicon: true + has_og_image: false + favicon_count: 1 diff --git a/data/custodian/JP-10-MAE-L-MLF.yaml b/data/custodian/JP-10-MAE-L-MLF.yaml index 0786a532a5..94111ef996 100644 --- a/data/custodian/JP-10-MAE-L-MLF.yaml +++ b/data/custodian/JP-10-MAE-L-MLF.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007096.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007096.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:33.146679+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007096.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007096.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:18:33.146679+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007096.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:18:33.146679+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLH.yaml b/data/custodian/JP-10-MAE-L-MLH.yaml index efd4bcb47a..d18f023eb4 100644 --- a/data/custodian/JP-10-MAE-L-MLH.yaml +++ b/data/custodian/JP-10-MAE-L-MLH.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007087.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007087.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:42.071653+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007087.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007087.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:18:42.071653+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007087.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:18:42.071653+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLJ.yaml b/data/custodian/JP-10-MAE-L-MLJ.yaml index 514ed59e06..df0698e3ce 100644 --- a/data/custodian/JP-10-MAE-L-MLJ.yaml +++ b/data/custodian/JP-10-MAE-L-MLJ.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007092.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007092.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:18:51.243654+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007092.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007092.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:18:51.243654+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007092.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:18:51.243654+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kaigayabunkan.yaml b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kaigayabunkan.yaml index 543e58366a..6798e52809 100644 --- a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kaigayabunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kaigayabunkan.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007088.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007088.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:00.185807+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007088.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007088.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:00.185807+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007088.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:00.185807+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kasukawabunkan.yaml b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kasukawabunkan.yaml index 45eacf076f..e4ed83d696 100644 --- a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kasukawabunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kasukawabunkan.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007095.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007095.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:08.942928+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007095.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007095.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:08.942928+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007095.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:08.942928+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kiyosatobunkan.yaml b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kiyosatobunkan.yaml index 03f8b8db82..ac167f2767 100644 --- a/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kiyosatobunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLK-maebashishiritsu_library_kiyosatobunkan.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007091.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007091.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:17.937119+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007091.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007091.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:17.937119+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007091.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:17.937119+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLK.yaml b/data/custodian/JP-10-MAE-L-MLK.yaml index 0b8ed4c357..0c54f77dc6 100644 --- a/data/custodian/JP-10-MAE-L-MLK.yaml +++ b/data/custodian/JP-10-MAE-L-MLK.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007085.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007085.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:26.935729+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007085.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007085.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:26.935729+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007085.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:26.935729+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLM-maebashishiritsu_library_miyagibunkan.yaml b/data/custodian/JP-10-MAE-L-MLM-maebashishiritsu_library_miyagibunkan.yaml index fca6187b2a..79f29d9dbd 100644 --- a/data/custodian/JP-10-MAE-L-MLM-maebashishiritsu_library_miyagibunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLM-maebashishiritsu_library_miyagibunkan.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007094.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007094.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:35.422766+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007094.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007094.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:35.422766+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007094.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:35.422766+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLM.yaml b/data/custodian/JP-10-MAE-L-MLM.yaml index 69df3ff582..7b775a371b 100644 --- a/data/custodian/JP-10-MAE-L-MLM.yaml +++ b/data/custodian/JP-10-MAE-L-MLM.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007089.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007089.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:44.147868+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007089.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007089.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:44.147868+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007089.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:44.147868+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLN.yaml b/data/custodian/JP-10-MAE-L-MLN.yaml index a96207cff9..eaf9b7b606 100644 --- a/data/custodian/JP-10-MAE-L-MLN.yaml +++ b/data/custodian/JP-10-MAE-L-MLN.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007090.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007090.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:19:52.929296+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007090.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007090.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:19:52.929296+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007090.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:19:52.929296+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLO.yaml b/data/custodian/JP-10-MAE-L-MLO.yaml index 14a205dbfe..e6061f549d 100644 --- a/data/custodian/JP-10-MAE-L-MLO.yaml +++ b/data/custodian/JP-10-MAE-L-MLO.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007093.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007093.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:01.788226+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007093.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007093.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:20:01.788226+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007093.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:20:01.788226+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sogokyoikupurazabunkan.yaml b/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sogokyoikupurazabunkan.yaml index ef494fc0cb..89588aa787 100644 --- a/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sogokyoikupurazabunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sogokyoikupurazabunkan.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007097.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007097.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:10.619515+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007097.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007097.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:20:10.619515+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007097.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:20:10.619515+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sojabunkan.yaml b/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sojabunkan.yaml index d1f16f8a76..1cb0b236f5 100644 --- a/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sojabunkan.yaml +++ b/data/custodian/JP-10-MAE-L-MLS-maebashishiritsu_library_sojabunkan.yaml @@ -205,3 +205,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007750.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007750.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:19.504053+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007750.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007750.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:20:19.504053+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007750.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:20:19.504053+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-MLS.yaml b/data/custodian/JP-10-MAE-L-MLS.yaml index 0e737595d0..0efb39930f 100644 --- a/data/custodian/JP-10-MAE-L-MLS.yaml +++ b/data/custodian/JP-10-MAE-L-MLS.yaml @@ -204,3 +204,28 @@ wikidata_enrichment: wikidata_web: official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007086.html wikidata_official_website: http://www.city.maebashi.gunma.jp/shisetsu/425/p007086.html +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:28.730915+00:00' + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007086.html + extraction_method: crawl4ai + claims: + - claim_type: favicon_url + claim_value: http://www.city.maebashi.gunma.jp/smartphone.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007086.html + css_selector: '[document] > html > head > link:nth-of-type(3)' + retrieved_on: '2025-12-23T20:20:28.730915+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: '' + - claim_type: og_image_url + claim_value: http://www.city.maebashi.gunma.jp/material/images/group/10/banner.png + source_url: http://www.city.maebashi.gunma.jp/shisetsu/425/p007086.html + css_selector: '[document] > html > head > meta:nth-of-type(7)' + retrieved_on: '2025-12-23T20:20:28.730915+00:00' + extraction_method: crawl4ai_meta_og + summary: + total_claims: 2 + has_primary_logo: false + has_favicon: true + has_og_image: true + favicon_count: 2 diff --git a/data/custodian/JP-10-MAE-L-NITGCL.yaml b/data/custodian/JP-10-MAE-L-NITGCL.yaml index 7d9278c2bc..4d67ca0d3a 100644 --- a/data/custodian/JP-10-MAE-L-NITGCL.yaml +++ b/data/custodian/JP-10-MAE-L-NITGCL.yaml @@ -217,3 +217,30 @@ location: geonames_id: 1857843 geonames_name: Maebashi feature_code: PPLA +logo_enrichment: + enrichment_timestamp: '2025-12-23T20:20:40.682320+00:00' + source_url: https://www.lib.gunma-ct.ac.jp/index.htm + extraction_method: crawl4ai + claims: + - claim_type: logo_url + claim_value: https://www.gunma-ct.ac.jp/cms/wp-content/themes/kosen/images/common/header_logo.svg + source_url: https://www.lib.gunma-ct.ac.jp/index.htm + css_selector: '#mainmenu_sm_logo > a > img' + retrieved_on: '2025-12-23T20:20:40.682320+00:00' + extraction_method: crawl4ai_header_logo + detection_confidence: high + alt_text: 群馬工業高等専門学校 + - claim_type: favicon_url + claim_value: https://www.gunma-ct.ac.jp/cms/wp-content/uploads/2024/08/favicon.png + source_url: https://www.lib.gunma-ct.ac.jp/index.htm + css_selector: '[document] > html > head > link:nth-of-type(18)' + retrieved_on: '2025-12-23T20:20:40.682320+00:00' + extraction_method: crawl4ai_link_rel + favicon_type: '' + favicon_sizes: 192x192 + summary: + total_claims: 2 + has_primary_logo: true + has_favicon: true + has_og_image: false + favicon_count: 2