diff --git a/backend/ducklake/main.py b/backend/ducklake/main.py index 0af5e7e038..5a82333521 100644 --- a/backend/ducklake/main.py +++ b/backend/ducklake/main.py @@ -49,6 +49,12 @@ class Settings(BaseModel): # Database alias in DuckDB db_alias: str = os.getenv("DUCKLAKE_DB_ALIAS", "heritage") + # Web archives database path + web_archives_path: str = os.getenv( + "DUCKLAKE_WEB_ARCHIVES_PATH", + "/var/lib/glam/ducklake/web_archives.duckdb" + ) + # Server settings host: str = os.getenv("DUCKLAKE_HOST", "0.0.0.0") port: int = int(os.getenv("DUCKLAKE_PORT", "8765")) @@ -174,6 +180,28 @@ def get_connection() -> duckdb.DuckDBPyConnection: _conn.execute(attach_sql) print(f"DuckLake attached: {settings.db_alias} -> {settings.data_path}") + # Attach web archives database (read-only) + if Path(settings.web_archives_path).exists(): + try: + _conn.execute(f""" + ATTACH '{settings.web_archives_path}' AS web_archives (READ_ONLY) + """) + # Create views in heritage schema for seamless access + _conn.execute(""" + CREATE OR REPLACE VIEW heritage.web_archives AS SELECT * FROM web_archives.web_archives + """) + _conn.execute(""" + CREATE OR REPLACE VIEW heritage.web_pages AS SELECT * FROM web_archives.web_pages + """) + _conn.execute(""" + CREATE OR REPLACE VIEW heritage.web_claims AS SELECT * FROM web_archives.web_claims + """) + print(f"Web archives attached: {settings.web_archives_path}") + except Exception as wa_err: + print(f"Warning: Could not attach web archives: {wa_err}") + else: + print(f"Web archives not found: {settings.web_archives_path}") + except Exception as e: print(f"DuckLake extension not available: {e}") print("Falling back to standard DuckDB mode") diff --git a/data/custodian/BR-MI-BHO-E-UTL-ufmg_tainacan_lab.yaml b/data/custodian/BR-MI-BHO-E-UTL-ufmg_tainacan_lab.yaml index 83b0edb7ec..0c38bcdb32 100644 --- a/data/custodian/BR-MI-BHO-E-UTL-ufmg_tainacan_lab.yaml +++ b/data/custodian/BR-MI-BHO-E-UTL-ufmg_tainacan_lab.yaml @@ -83,17 +83,19 @@ ghcid: city_name: Belo Horizonte country_code: BR geonames_id: 3470127 - latitude: -19.9191 - longitude: -43.9386 + google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA + latitude: -19.8697 + longitude: -43.9637 method: MANUAL_RESEARCH - notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte + notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da + Informação) region_code: MG region_name: Minas Gerais resolution_date: '2025-12-07T16:44:07.061598+00:00' record_id: 167ba1b7-a62d-42d6-92cd-d91ff4ce72a9 identifiers: - identifier_scheme: GHCID - identifier_value: BR-MI-XXX-E-UTL-ufmg_tainacan_lab + identifier_value: BR-MG-BHO-E-UTL-ufmg_tainacan_lab - identifier_scheme: GHCID_UUID identifier_value: 9dcee694-81b2-5309-a27a-628488d0205e - identifier_scheme: GHCID_UUID_SHA256 @@ -108,6 +110,9 @@ identifiers: - &id002 identifier_scheme: OLD_ID identifier_value: 12840343882751256357 +- identifier_scheme: Website + identifier_url: https://tainacan.eci.ufmg.br/ + identifier_value: https://tainacan.eci.ufmg.br/ original_entry: identifiers: - identifier_scheme: GHCID diff --git a/data/custodian/BR-MI-BHO-E-UTL.yaml b/data/custodian/BR-MI-BHO-E-UTL.yaml index db97106153..c6161805f7 100644 --- a/data/custodian/BR-MI-BHO-E-UTL.yaml +++ b/data/custodian/BR-MI-BHO-E-UTL.yaml @@ -83,17 +83,19 @@ ghcid: city_name: Belo Horizonte country_code: BR geonames_id: 3470127 - latitude: -19.9191 - longitude: -43.9386 + google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA + latitude: -19.8697 + longitude: -43.9637 method: MANUAL_RESEARCH - notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte + notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da + Informação) region_code: MG region_name: Minas Gerais resolution_date: '2025-12-07T16:44:07.052938+00:00' record_id: 3c8e1c49-716c-40ea-a283-a208686138b7 identifiers: - identifier_scheme: GHCID - identifier_value: BR-MI-XXX-E-UTL + identifier_value: BR-MG-BHO-E-UTL - identifier_scheme: GHCID_UUID identifier_value: 562718ae-1d5c-57d7-9829-db40b4242ad1 - identifier_scheme: GHCID_UUID_SHA256 @@ -108,6 +110,9 @@ identifiers: - &id002 identifier_scheme: OLD_ID identifier_value: 12840343882751256357 +- identifier_scheme: Website + identifier_url: https://tainacan.eci.ufmg.br/ + identifier_value: https://tainacan.eci.ufmg.br/ original_entry: identifiers: - identifier_scheme: GHCID diff --git a/frontend/public/schemas/20251121/linkml/manifest.json b/frontend/public/schemas/20251121/linkml/manifest.json index 13808ec44e..317083d99e 100644 --- a/frontend/public/schemas/20251121/linkml/manifest.json +++ b/frontend/public/schemas/20251121/linkml/manifest.json @@ -1,5 +1,5 @@ { - "generated": "2025-12-07T13:27:28.747Z", + "generated": "2025-12-07T16:47:16.823Z", "version": "1.0.0", "categories": [ { diff --git a/frontend/src/components/database/DuckLakePanel.tsx b/frontend/src/components/database/DuckLakePanel.tsx index c4c46b9f06..38bd706879 100644 --- a/frontend/src/components/database/DuckLakePanel.tsx +++ b/frontend/src/components/database/DuckLakePanel.tsx @@ -1252,7 +1252,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
{/* Archived Pages - Wayback style */}
-
📄 Archived Pages ({webArchiveData.pages.length})
+
+
📄 Archived Pages ({webArchiveData.pages.length})
+ {selectedWebPage && ( + + )} +
{webArchiveData.pages.map((page, idx) => (
-
🏷️ Extracted Claims ({webArchiveData.claims.length})
+
+
+ 🏷️ Extracted Claims ({webArchiveData.claims.length}) + {selectedWebPage && ( + — from main page + )} +
+
+
+ Claims are extracted from the main page (index.html). + Per-page extraction coming soon. +
{webArchiveData.claims.map((claim, idx) => (
diff --git a/frontend/src/pages/Database.css b/frontend/src/pages/Database.css index b06ea3a21d..1cef38d543 100644 --- a/frontend/src/pages/Database.css +++ b/frontend/src/pages/Database.css @@ -3452,4 +3452,91 @@ body.resizing-row * { .claim-content { color: #ddd; } + + .claims-note { + background: #1a1a2e; + color: #888; + } +} + +/* Section header row with filter button */ +.section-header-row { + display: flex; + align-items: center; + justify-content: space-between; + padding: 0.6rem 0.75rem; + background: #f5f5f5; + border-bottom: 1px solid #e8e8e8; +} + +.section-header-row h6 { + margin: 0; + padding: 0; + background: transparent; + border: none; + font-size: 0.85rem; + color: #444; + display: flex; + align-items: center; + gap: 0.5rem; +} + +.filter-indicator { + font-weight: normal; + font-size: 0.75rem; + color: #888; +} + +.clear-filter-btn { + font-size: 0.7rem; + padding: 0.2rem 0.5rem; + border: 1px solid #ddd; + border-radius: 4px; + background: white; + color: #666; + cursor: pointer; + transition: all 0.15s; +} + +.clear-filter-btn:hover { + background: #f0f0f0; + border-color: #bbb; +} + +.claims-note { + font-size: 0.75rem; + color: #888; + font-style: italic; + padding: 0.5rem 0.75rem; + background: #fafafa; + border-bottom: 1px solid #f0f0f0; +} + +/* Dark mode for new elements */ +@media (prefers-color-scheme: dark) { + .section-header-row { + background: #2a2a40; + border-color: #333; + } + + .section-header-row h6 { + color: #ccc; + } + + .clear-filter-btn { + background: #333; + border-color: #444; + color: #aaa; + } + + .clear-filter-btn:hover { + background: #404050; + border-color: #555; + } + + .claims-note { + background: #1a1a2e; + border-color: #333; + color: #777; + } } diff --git a/scripts/enrich_cities_google.py b/scripts/enrich_cities_google.py index a62c81fd4d..3b9985f4db 100755 --- a/scripts/enrich_cities_google.py +++ b/scripts/enrich_cities_google.py @@ -68,6 +68,10 @@ COUNTRY_NAMES = { 'DE': 'Germany', 'FR': 'France', 'GB': 'United Kingdom', + 'EE': 'Estonia', + 'PH': 'Philippines', + 'CL': 'Chile', + 'CH': 'Switzerland', } @@ -184,11 +188,142 @@ def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, count return None +# Brazil: GeoNames admin1_code → ISO 3166-2:BR state code +BRAZIL_STATE_CODES = { + '01': 'AC', # Acre + '02': 'AL', # Alagoas + '03': 'AP', # Amapá + '04': 'AM', # Amazonas + '05': 'BA', # Bahia + '06': 'CE', # Ceará + '07': 'DF', # Federal District (Distrito Federal) + '08': 'ES', # Espírito Santo + '11': 'MS', # Mato Grosso do Sul + '13': 'MA', # Maranhão + '14': 'MT', # Mato Grosso + '15': 'MG', # Minas Gerais + '16': 'PA', # Pará + '17': 'PB', # Paraíba + '18': 'PR', # Paraná + '20': 'PI', # Piauí + '21': 'RJ', # Rio de Janeiro + '22': 'RN', # Rio Grande do Norte + '23': 'RS', # Rio Grande do Sul + '24': 'RO', # Rondônia + '25': 'RR', # Roraima + '26': 'SC', # Santa Catarina + '27': 'SP', # São Paulo + '28': 'SE', # Sergipe + '29': 'GO', # Goiás + '30': 'PE', # Pernambuco + '31': 'TO', # Tocantins +} + +# Switzerland: GeoNames admin1_code → ISO 3166-2:CH canton code +SWITZERLAND_CANTON_CODES = { + 'AG': 'AG', # Aargau + 'AI': 'AI', # Appenzell Innerrhoden + 'AR': 'AR', # Appenzell Ausserrhoden + 'BE': 'BE', # Bern + 'BL': 'BL', # Basel-Landschaft + 'BS': 'BS', # Basel-Stadt + 'FR': 'FR', # Fribourg + 'GE': 'GE', # Geneva + 'GL': 'GL', # Glarus + 'GR': 'GR', # Graubünden + 'JU': 'JU', # Jura + 'LU': 'LU', # Lucerne + 'NE': 'NE', # Neuchâtel + 'NW': 'NW', # Nidwalden + 'OW': 'OW', # Obwalden + 'SG': 'SG', # St. Gallen + 'SH': 'SH', # Schaffhausen + 'SO': 'SO', # Solothurn + 'SZ': 'SZ', # Schwyz + 'TG': 'TG', # Thurgau + 'TI': 'TI', # Ticino + 'UR': 'UR', # Uri + 'VD': 'VD', # Vaud + 'VS': 'VS', # Valais + 'ZG': 'ZG', # Zug + 'ZH': 'ZH', # Zürich +} + +# Mexico: GeoNames admin1_code → ISO 3166-2:MX state code +MEXICO_STATE_CODES = { + '01': 'AGU', # Aguascalientes + '02': 'BCN', # Baja California + '03': 'BCS', # Baja California Sur + '04': 'CAM', # Campeche + '05': 'COA', # Coahuila + '06': 'COL', # Colima + '07': 'CHP', # Chiapas + '08': 'CHH', # Chihuahua + '09': 'CMX', # Ciudad de México (CDMX) + '10': 'DUR', # Durango + '11': 'GUA', # Guanajuato + '12': 'GRO', # Guerrero + '13': 'HID', # Hidalgo + '14': 'JAL', # Jalisco + '15': 'MEX', # México (State of Mexico) + '16': 'MIC', # Michoacán + '17': 'MOR', # Morelos + '18': 'NAY', # Nayarit + '19': 'NLE', # Nuevo León + '20': 'OAX', # Oaxaca + '21': 'PUE', # Puebla + '22': 'QUE', # Querétaro + '23': 'ROO', # Quintana Roo + '24': 'SLP', # San Luis Potosí + '25': 'SIN', # Sinaloa + '26': 'SON', # Sonora + '27': 'TAB', # Tabasco + '28': 'TAM', # Tamaulipas + '29': 'TLA', # Tlaxcala + '30': 'VER', # Veracruz + '31': 'YUC', # Yucatán + '32': 'ZAC', # Zacatecas +} + +# Chile: GeoNames admin1_code → ISO 3166-2:CL region code +CHILE_REGION_CODES = { + '01': 'TA', # Tarapacá + '02': 'AN', # Antofagasta + '03': 'AT', # Atacama + '04': 'CO', # Coquimbo + '05': 'VS', # Valparaíso + '06': 'LI', # Libertador General Bernardo O'Higgins + '07': 'ML', # Maule + '08': 'BI', # Biobío + '09': 'AR', # La Araucanía + '10': 'LL', # Los Lagos + '11': 'AI', # Aisén del General Carlos Ibáñez del Campo + '12': 'MA', # Magallanes y de la Antártica Chilena + '13': 'RM', # Región Metropolitana de Santiago + '14': 'LR', # Los Ríos + '15': 'AP', # Arica y Parinacota + '16': 'NB', # Ñuble +} + + def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str: """Get ISO-style region code from GeoNames admin1_code.""" if not admin1_code: return 'XX' + # Country-specific mappings + if country_code == 'BR' and admin1_code in BRAZIL_STATE_CODES: + return BRAZIL_STATE_CODES[admin1_code] + + if country_code == 'CH' and admin1_code in SWITZERLAND_CANTON_CODES: + return SWITZERLAND_CANTON_CODES[admin1_code] + + if country_code == 'MX' and admin1_code in MEXICO_STATE_CODES: + return MEXICO_STATE_CODES[admin1_code] + + if country_code == 'CL' and admin1_code in CHILE_REGION_CODES: + return CHILE_REGION_CODES[admin1_code] + # For most countries, use first 2-3 characters of admin1_code or name if len(admin1_code) <= 3: return admin1_code.upper() @@ -234,10 +369,11 @@ def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str, old_ghcid = ghcid_data.get('ghcid_current', '') result['old_ghcid'] = old_ghcid - # Match both patterns: - # 1. {country}-XX-XXX-... (no region, no city) - # 2. {country}-{region}-XXX-... (has region, no city) - xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-') + # Match patterns with XXX city code: + # - {country}-XX-XXX-... (2-letter region like XX, BE, GE) + # - {country}-10-XXX-... (2-digit region like 10, 52, 37) + # - {country}-UKM-XXX-... (3-letter region like UKM, IDF, CMX) + xxx_pattern = re.compile(rf'^{country_code}-[A-Z0-9]{{2,3}}-XXX-') if not xxx_pattern.match(old_ghcid): result['status'] = 'skipped' result['error'] = f'Not a {country_code}-*-XXX file' diff --git a/scripts/migrate_web_archives.py b/scripts/migrate_web_archives.py index 2dd5987362..50702ee126 100644 --- a/scripts/migrate_web_archives.py +++ b/scripts/migrate_web_archives.py @@ -279,7 +279,27 @@ def build_ducklake_database(mapping: Dict[int, str]): continue # Insert pages - for page in metadata.get('pages', []): + pages = metadata.get('pages', []) + + # Handle single-page archives (older format with 'files' key) + if not pages and 'files' in metadata: + # Create a synthetic page entry from the single-page fetch + files = metadata.get('files', {}) + rendered_html = files.get('rendered_html') + if rendered_html: + pages = [{ + 'title': domain_folder.name, # Use domain as title + 'source_path': 'index.html', + 'archived_file': rendered_html, + 'extractions_count': 0 + }] + # Update the archive's total_pages count + con.execute(""" + UPDATE web_archives SET total_pages = 1, processed_pages = 1 + WHERE ghcid = ? + """, [ghcid]) + + for page in pages: page_id += 1 try: con.execute("""