feat: add web archives integration and improve enrichment scripts

Backend:
- Attach web_archives.duckdb as read-only database in DuckLake
- Create views for web_archives, web_pages, web_claims in heritage schema

Scripts:
- enrich_cities_google.py: Add batch processing and retry logic
- migrate_web_archives.py: Improve schema handling and error recovery

Frontend:
- DuckLakePanel: Add web archives query support
- Database.css: Improve layout for query results display
This commit is contained in:
kempersc 2025-12-07 17:48:09 +01:00
parent 1e01639c56
commit d9325c0bb5
8 changed files with 319 additions and 16 deletions

View file

@ -49,6 +49,12 @@ class Settings(BaseModel):
# Database alias in DuckDB
db_alias: str = os.getenv("DUCKLAKE_DB_ALIAS", "heritage")
# Web archives database path
web_archives_path: str = os.getenv(
"DUCKLAKE_WEB_ARCHIVES_PATH",
"/var/lib/glam/ducklake/web_archives.duckdb"
)
# Server settings
host: str = os.getenv("DUCKLAKE_HOST", "0.0.0.0")
port: int = int(os.getenv("DUCKLAKE_PORT", "8765"))
@ -174,6 +180,28 @@ def get_connection() -> duckdb.DuckDBPyConnection:
_conn.execute(attach_sql)
print(f"DuckLake attached: {settings.db_alias} -> {settings.data_path}")
# Attach web archives database (read-only)
if Path(settings.web_archives_path).exists():
try:
_conn.execute(f"""
ATTACH '{settings.web_archives_path}' AS web_archives (READ_ONLY)
""")
# Create views in heritage schema for seamless access
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_archives AS SELECT * FROM web_archives.web_archives
""")
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_pages AS SELECT * FROM web_archives.web_pages
""")
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_claims AS SELECT * FROM web_archives.web_claims
""")
print(f"Web archives attached: {settings.web_archives_path}")
except Exception as wa_err:
print(f"Warning: Could not attach web archives: {wa_err}")
else:
print(f"Web archives not found: {settings.web_archives_path}")
except Exception as e:
print(f"DuckLake extension not available: {e}")
print("Falling back to standard DuckDB mode")

View file

@ -83,17 +83,19 @@ ghcid:
city_name: Belo Horizonte
country_code: BR
geonames_id: 3470127
latitude: -19.9191
longitude: -43.9386
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
latitude: -19.8697
longitude: -43.9637
method: MANUAL_RESEARCH
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
Informação)
region_code: MG
region_name: Minas Gerais
resolution_date: '2025-12-07T16:44:07.061598+00:00'
record_id: 167ba1b7-a62d-42d6-92cd-d91ff4ce72a9
identifiers:
- identifier_scheme: GHCID
identifier_value: BR-MI-XXX-E-UTL-ufmg_tainacan_lab
identifier_value: BR-MG-BHO-E-UTL-ufmg_tainacan_lab
- identifier_scheme: GHCID_UUID
identifier_value: 9dcee694-81b2-5309-a27a-628488d0205e
- identifier_scheme: GHCID_UUID_SHA256
@ -108,6 +110,9 @@ identifiers:
- &id002
identifier_scheme: OLD_ID
identifier_value: 12840343882751256357
- identifier_scheme: Website
identifier_url: https://tainacan.eci.ufmg.br/
identifier_value: https://tainacan.eci.ufmg.br/
original_entry:
identifiers:
- identifier_scheme: GHCID

View file

@ -83,17 +83,19 @@ ghcid:
city_name: Belo Horizonte
country_code: BR
geonames_id: 3470127
latitude: -19.9191
longitude: -43.9386
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
latitude: -19.8697
longitude: -43.9637
method: MANUAL_RESEARCH
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
Informação)
region_code: MG
region_name: Minas Gerais
resolution_date: '2025-12-07T16:44:07.052938+00:00'
record_id: 3c8e1c49-716c-40ea-a283-a208686138b7
identifiers:
- identifier_scheme: GHCID
identifier_value: BR-MI-XXX-E-UTL
identifier_value: BR-MG-BHO-E-UTL
- identifier_scheme: GHCID_UUID
identifier_value: 562718ae-1d5c-57d7-9829-db40b4242ad1
- identifier_scheme: GHCID_UUID_SHA256
@ -108,6 +110,9 @@ identifiers:
- &id002
identifier_scheme: OLD_ID
identifier_value: 12840343882751256357
- identifier_scheme: Website
identifier_url: https://tainacan.eci.ufmg.br/
identifier_value: https://tainacan.eci.ufmg.br/
original_entry:
identifiers:
- identifier_scheme: GHCID

View file

@ -1,5 +1,5 @@
{
"generated": "2025-12-07T13:27:28.747Z",
"generated": "2025-12-07T16:47:16.823Z",
"version": "1.0.0",
"categories": [
{

View file

@ -1252,7 +1252,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
<div className="web-archive-content">
{/* Archived Pages - Wayback style */}
<div className="archive-section pages-section">
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
<div className="section-header-row">
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
{selectedWebPage && (
<button
className="clear-filter-btn"
onClick={() => setSelectedWebPage(null)}
title="Show all pages"
>
Clear
</button>
)}
</div>
<div className="pages-list">
{webArchiveData.pages.map((page, idx) => (
<div
@ -1274,7 +1285,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
{/* Extracted Claims */}
<div className="archive-section claims-section">
<h6>🏷 Extracted Claims ({webArchiveData.claims.length})</h6>
<div className="section-header-row">
<h6>
🏷 Extracted Claims ({webArchiveData.claims.length})
{selectedWebPage && (
<span className="filter-indicator"> from main page</span>
)}
</h6>
</div>
<div className="claims-note">
Claims are extracted from the main page (index.html).
Per-page extraction coming soon.
</div>
<div className="claims-list">
{webArchiveData.claims.map((claim, idx) => (
<div key={idx} className="claim-item">

View file

@ -3452,4 +3452,91 @@ body.resizing-row * {
.claim-content {
color: #ddd;
}
.claims-note {
background: #1a1a2e;
color: #888;
}
}
/* Section header row with filter button */
/* Flex row wrapping a section's <h6> heading with an optional right-aligned
   "Clear" filter button (rendered by DuckLakePanel when a page is selected). */
.section-header-row {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.6rem 0.75rem;
background: #f5f5f5;
border-bottom: 1px solid #e8e8e8;
}
/* Reset the heading inside the row: the row now owns padding/background,
   so the <h6> is stripped back to inline-flex content. */
.section-header-row h6 {
margin: 0;
padding: 0;
background: transparent;
border: none;
font-size: 0.85rem;
color: #444;
display: flex;
align-items: center;
gap: 0.5rem;
}
/* Small de-emphasized suffix inside a heading (e.g. "from main page"). */
.filter-indicator {
font-weight: normal;
font-size: 0.75rem;
color: #888;
}
/* "Clear" button that resets the selected-page filter. */
.clear-filter-btn {
font-size: 0.7rem;
padding: 0.2rem 0.5rem;
border: 1px solid #ddd;
border-radius: 4px;
background: white;
color: #666;
cursor: pointer;
transition: all 0.15s;
}
.clear-filter-btn:hover {
background: #f0f0f0;
border-color: #bbb;
}
/* Informational strip under the Claims heading. NOTE: another .claims-note
   rule exists earlier in this file; with equal specificity the cascade
   resolves by source order, so keep this rule's position. */
.claims-note {
font-size: 0.75rem;
color: #888;
font-style: italic;
padding: 0.5rem 0.75rem;
background: #fafafa;
border-bottom: 1px solid #f0f0f0;
}
/* Dark mode for new elements */
/* Same selectors as above at equal specificity: these win only because
   they appear later in the file — do not reorder. */
@media (prefers-color-scheme: dark) {
.section-header-row {
background: #2a2a40;
border-color: #333;
}
.section-header-row h6 {
color: #ccc;
}
.clear-filter-btn {
background: #333;
border-color: #444;
color: #aaa;
}
.clear-filter-btn:hover {
background: #404050;
border-color: #555;
}
.claims-note {
background: #1a1a2e;
border-color: #333;
color: #777;
}
}

View file

@ -68,6 +68,10 @@ COUNTRY_NAMES = {
'DE': 'Germany',
'FR': 'France',
'GB': 'United Kingdom',
'EE': 'Estonia',
'PH': 'Philippines',
'CL': 'Chile',
'CH': 'Switzerland',
}
@ -184,11 +188,142 @@ def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, count
return None
# Brazil: GeoNames admin1_code → ISO 3166-2:BR state code
# Keys are GeoNames' historical, non-contiguous numbering for BR admin1
# units ('01'-'31' with gaps: 09, 10, 12 and 19 are unassigned), which is
# why e.g. Goiás='29' and Pernambuco='30' appear out of alphabetical order.
# Codes not present here fall through to the generic handling below
# (a 2-char numeric code would be returned uppercased as-is).
BRAZIL_STATE_CODES = {
'01': 'AC', # Acre
'02': 'AL', # Alagoas
'03': 'AP', # Amapá
'04': 'AM', # Amazonas
'05': 'BA', # Bahia
'06': 'CE', # Ceará
'07': 'DF', # Federal District (Distrito Federal)
'08': 'ES', # Espírito Santo
'11': 'MS', # Mato Grosso do Sul
'13': 'MA', # Maranhão
'14': 'MT', # Mato Grosso
'15': 'MG', # Minas Gerais
'16': 'PA', # Pará
'17': 'PB', # Paraíba
'18': 'PR', # Paraná
'20': 'PI', # Piauí
'21': 'RJ', # Rio de Janeiro
'22': 'RN', # Rio Grande do Norte
'23': 'RS', # Rio Grande do Sul
'24': 'RO', # Rondônia
'25': 'RR', # Roraima
'26': 'SC', # Santa Catarina
'27': 'SP', # São Paulo
'28': 'SE', # Sergipe
'29': 'GO', # Goiás
'30': 'PE', # Pernambuco
'31': 'TO', # Tocantins
}
# Switzerland: GeoNames admin1_code → ISO 3166-2:CH canton code
# Identity mapping: assumes the GeoNames data in use already carries the
# two-letter canton abbreviations as admin1_code for CH.
# NOTE(review): some GeoNames exports use numeric admin1 codes for CH —
# confirm against the geonames table actually loaded; numeric codes would
# currently fall through to the generic path instead of matching here.
SWITZERLAND_CANTON_CODES = {
'AG': 'AG', # Aargau
'AI': 'AI', # Appenzell Innerrhoden
'AR': 'AR', # Appenzell Ausserrhoden
'BE': 'BE', # Bern
'BL': 'BL', # Basel-Landschaft
'BS': 'BS', # Basel-Stadt
'FR': 'FR', # Fribourg
'GE': 'GE', # Geneva
'GL': 'GL', # Glarus
'GR': 'GR', # Graubünden
'JU': 'JU', # Jura
'LU': 'LU', # Lucerne
'NE': 'NE', # Neuchâtel
'NW': 'NW', # Nidwalden
'OW': 'OW', # Obwalden
'SG': 'SG', # St. Gallen
'SH': 'SH', # Schaffhausen
'SO': 'SO', # Solothurn
'SZ': 'SZ', # Schwyz
'TG': 'TG', # Thurgau
'TI': 'TI', # Ticino
'UR': 'UR', # Uri
'VD': 'VD', # Vaud
'VS': 'VS', # Valais
'ZG': 'ZG', # Zug
'ZH': 'ZH', # Zürich
}
# Mexico: GeoNames admin1_code → ISO 3166-2:MX state code
# Covers all 32 federative entities ('01'-'32').
# NOTE(review): the numbering below follows INEGI's Spanish-alphabetical
# order (Coahuila='05', Colima='06', Chiapas='07', Chihuahua='08').
# GeoNames' admin1 numbering for MX is commonly listed with Chiapas='05',
# Chihuahua='06', Coahuila='07', Colima='08' — verify which scheme the
# loaded geonames table uses before trusting codes 05-08.
MEXICO_STATE_CODES = {
'01': 'AGU', # Aguascalientes
'02': 'BCN', # Baja California
'03': 'BCS', # Baja California Sur
'04': 'CAM', # Campeche
'05': 'COA', # Coahuila
'06': 'COL', # Colima
'07': 'CHP', # Chiapas
'08': 'CHH', # Chihuahua
'09': 'CMX', # Ciudad de México (CDMX)
'10': 'DUR', # Durango
'11': 'GUA', # Guanajuato
'12': 'GRO', # Guerrero
'13': 'HID', # Hidalgo
'14': 'JAL', # Jalisco
'15': 'MEX', # México (State of Mexico)
'16': 'MIC', # Michoacán
'17': 'MOR', # Morelos
'18': 'NAY', # Nayarit
'19': 'NLE', # Nuevo León
'20': 'OAX', # Oaxaca
'21': 'PUE', # Puebla
'22': 'QUE', # Querétaro
'23': 'ROO', # Quintana Roo
'24': 'SLP', # San Luis Potosí
'25': 'SIN', # Sinaloa
'26': 'SON', # Sonora
'27': 'TAB', # Tabasco
'28': 'TAM', # Tamaulipas
'29': 'TLA', # Tlaxcala
'30': 'VER', # Veracruz
'31': 'YUC', # Yucatán
'32': 'ZAC', # Zacatecas
}
# Chile: GeoNames admin1_code → ISO 3166-2:CL region code
# Covers all 16 regions, keyed by the official region number ('01'-'16'),
# including the post-2018 additions Ñuble ('16'/NB) and
# Arica y Parinacota ('15'/AP).
CHILE_REGION_CODES = {
'01': 'TA', # Tarapacá
'02': 'AN', # Antofagasta
'03': 'AT', # Atacama
'04': 'CO', # Coquimbo
'05': 'VS', # Valparaíso
'06': 'LI', # Libertador General Bernardo O'Higgins
'07': 'ML', # Maule
'08': 'BI', # Biobío
'09': 'AR', # La Araucanía
'10': 'LL', # Los Lagos
'11': 'AI', # Aisén del General Carlos Ibáñez del Campo
'12': 'MA', # Magallanes y de la Antártica Chilena
'13': 'RM', # Región Metropolitana de Santiago
'14': 'LR', # Los Ríos
'15': 'AP', # Arica y Parinacota
'16': 'NB', # Ñuble
}
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
"""Get ISO-style region code from GeoNames admin1_code."""
if not admin1_code:
return 'XX'
# Country-specific mappings
if country_code == 'BR' and admin1_code in BRAZIL_STATE_CODES:
return BRAZIL_STATE_CODES[admin1_code]
if country_code == 'CH' and admin1_code in SWITZERLAND_CANTON_CODES:
return SWITZERLAND_CANTON_CODES[admin1_code]
if country_code == 'MX' and admin1_code in MEXICO_STATE_CODES:
return MEXICO_STATE_CODES[admin1_code]
if country_code == 'CL' and admin1_code in CHILE_REGION_CODES:
return CHILE_REGION_CODES[admin1_code]
# For most countries, use first 2-3 characters of admin1_code or name
if len(admin1_code) <= 3:
return admin1_code.upper()
@ -234,10 +369,11 @@ def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
old_ghcid = ghcid_data.get('ghcid_current', '')
result['old_ghcid'] = old_ghcid
# Match both patterns:
# 1. {country}-XX-XXX-... (no region, no city)
# 2. {country}-{region}-XXX-... (has region, no city)
xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
# Match patterns with XXX city code:
# - {country}-XX-XXX-... (2-letter region like XX, BE, GE)
# - {country}-10-XXX-... (2-digit region like 10, 52, 37)
# - {country}-UKM-XXX-... (3-letter region like UKM, IDF, CMX)
xxx_pattern = re.compile(rf'^{country_code}-[A-Z0-9]{{2,3}}-XXX-')
if not xxx_pattern.match(old_ghcid):
result['status'] = 'skipped'
result['error'] = f'Not a {country_code}-*-XXX file'

View file

@ -279,7 +279,27 @@ def build_ducklake_database(mapping: Dict[int, str]):
continue
# Insert pages
for page in metadata.get('pages', []):
pages = metadata.get('pages', [])
# Handle single-page archives (older format with 'files' key)
if not pages and 'files' in metadata:
# Create a synthetic page entry from the single-page fetch
files = metadata.get('files', {})
rendered_html = files.get('rendered_html')
if rendered_html:
pages = [{
'title': domain_folder.name, # Use domain as title
'source_path': 'index.html',
'archived_file': rendered_html,
'extractions_count': 0
}]
# Update the archive's total_pages count
con.execute("""
UPDATE web_archives SET total_pages = 1, processed_pages = 1
WHERE ghcid = ?
""", [ghcid])
for page in pages:
page_id += 1
try:
con.execute("""