feat: add web archives integration and improve enrichment scripts

Backend:
- Attach web_archives.duckdb as read-only database in DuckLake
- Create views for web_archives, web_pages, web_claims in heritage schema

Scripts:
- enrich_cities_google.py: Add batch processing and retry logic
- migrate_web_archives.py: Improve schema handling and error recovery

Frontend:
- DuckLakePanel: Add web archives query support
- Database.css: Improve layout for query results display
This commit is contained in:
kempersc 2025-12-07 17:48:09 +01:00
parent 1e01639c56
commit d9325c0bb5
8 changed files with 319 additions and 16 deletions

View file

@ -49,6 +49,12 @@ class Settings(BaseModel):
# Database alias in DuckDB
db_alias: str = os.getenv("DUCKLAKE_DB_ALIAS", "heritage")
# Web archives database path
web_archives_path: str = os.getenv(
"DUCKLAKE_WEB_ARCHIVES_PATH",
"/var/lib/glam/ducklake/web_archives.duckdb"
)
# Server settings
host: str = os.getenv("DUCKLAKE_HOST", "0.0.0.0")
port: int = int(os.getenv("DUCKLAKE_PORT", "8765"))
@ -174,6 +180,28 @@ def get_connection() -> duckdb.DuckDBPyConnection:
_conn.execute(attach_sql)
print(f"DuckLake attached: {settings.db_alias} -> {settings.data_path}")
# Attach web archives database (read-only)
if Path(settings.web_archives_path).exists():
try:
_conn.execute(f"""
ATTACH '{settings.web_archives_path}' AS web_archives (READ_ONLY)
""")
# Create views in heritage schema for seamless access
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_archives AS SELECT * FROM web_archives.web_archives
""")
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_pages AS SELECT * FROM web_archives.web_pages
""")
_conn.execute("""
CREATE OR REPLACE VIEW heritage.web_claims AS SELECT * FROM web_archives.web_claims
""")
print(f"Web archives attached: {settings.web_archives_path}")
except Exception as wa_err:
print(f"Warning: Could not attach web archives: {wa_err}")
else:
print(f"Web archives not found: {settings.web_archives_path}")
except Exception as e:
print(f"DuckLake extension not available: {e}")
print("Falling back to standard DuckDB mode")

View file

@ -83,17 +83,19 @@ ghcid:
city_name: Belo Horizonte
country_code: BR
geonames_id: 3470127
latitude: -19.9191
longitude: -43.9386
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
latitude: -19.8697
longitude: -43.9637
method: MANUAL_RESEARCH
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
Informação)
region_code: MG
region_name: Minas Gerais
resolution_date: '2025-12-07T16:44:07.061598+00:00'
record_id: 167ba1b7-a62d-42d6-92cd-d91ff4ce72a9
identifiers:
- identifier_scheme: GHCID
identifier_value: BR-MI-XXX-E-UTL-ufmg_tainacan_lab
identifier_value: BR-MG-BHO-E-UTL-ufmg_tainacan_lab
- identifier_scheme: GHCID_UUID
identifier_value: 9dcee694-81b2-5309-a27a-628488d0205e
- identifier_scheme: GHCID_UUID_SHA256
@ -108,6 +110,9 @@ identifiers:
- &id002
identifier_scheme: OLD_ID
identifier_value: 12840343882751256357
- identifier_scheme: Website
identifier_url: https://tainacan.eci.ufmg.br/
identifier_value: https://tainacan.eci.ufmg.br/
original_entry:
identifiers:
- identifier_scheme: GHCID

View file

@ -83,17 +83,19 @@ ghcid:
city_name: Belo Horizonte
country_code: BR
geonames_id: 3470127
latitude: -19.9191
longitude: -43.9386
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
latitude: -19.8697
longitude: -43.9637
method: MANUAL_RESEARCH
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
Informação)
region_code: MG
region_name: Minas Gerais
resolution_date: '2025-12-07T16:44:07.052938+00:00'
record_id: 3c8e1c49-716c-40ea-a283-a208686138b7
identifiers:
- identifier_scheme: GHCID
identifier_value: BR-MI-XXX-E-UTL
identifier_value: BR-MG-BHO-E-UTL
- identifier_scheme: GHCID_UUID
identifier_value: 562718ae-1d5c-57d7-9829-db40b4242ad1
- identifier_scheme: GHCID_UUID_SHA256
@ -108,6 +110,9 @@ identifiers:
- &id002
identifier_scheme: OLD_ID
identifier_value: 12840343882751256357
- identifier_scheme: Website
identifier_url: https://tainacan.eci.ufmg.br/
identifier_value: https://tainacan.eci.ufmg.br/
original_entry:
identifiers:
- identifier_scheme: GHCID

View file

@ -1,5 +1,5 @@
{
"generated": "2025-12-07T13:27:28.747Z",
"generated": "2025-12-07T16:47:16.823Z",
"version": "1.0.0",
"categories": [
{

View file

@ -1252,7 +1252,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
<div className="web-archive-content">
{/* Archived Pages - Wayback style */}
<div className="archive-section pages-section">
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
<div className="section-header-row">
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
{selectedWebPage && (
<button
className="clear-filter-btn"
onClick={() => setSelectedWebPage(null)}
title="Show all pages"
>
Clear
</button>
)}
</div>
<div className="pages-list">
{webArchiveData.pages.map((page, idx) => (
<div
@ -1274,7 +1285,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
{/* Extracted Claims */}
<div className="archive-section claims-section">
<h6>🏷 Extracted Claims ({webArchiveData.claims.length})</h6>
<div className="section-header-row">
<h6>
🏷 Extracted Claims ({webArchiveData.claims.length})
{selectedWebPage && (
<span className="filter-indicator"> from main page</span>
)}
</h6>
</div>
<div className="claims-note">
Claims are extracted from the main page (index.html).
Per-page extraction coming soon.
</div>
<div className="claims-list">
{webArchiveData.claims.map((claim, idx) => (
<div key={idx} className="claim-item">

View file

@ -3452,4 +3452,91 @@ body.resizing-row * {
.claim-content {
color: #ddd;
}
.claims-note {
background: #1a1a2e;
color: #888;
}
}
/* Section header row with filter button */
/* Flex row wrapping a section's <h6> heading with an optional right-aligned
   "Clear" filter button (rendered by DuckLakePanel when a page is selected). */
.section-header-row {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.6rem 0.75rem;
background: #f5f5f5;
border-bottom: 1px solid #e8e8e8;
}
/* Reset the heading inside the row: the row now owns padding/background,
   so the <h6> is stripped back to inline-flex content. */
.section-header-row h6 {
margin: 0;
padding: 0;
background: transparent;
border: none;
font-size: 0.85rem;
color: #444;
display: flex;
align-items: center;
gap: 0.5rem;
}
/* Small de-emphasized suffix inside a heading (e.g. "from main page"). */
.filter-indicator {
font-weight: normal;
font-size: 0.75rem;
color: #888;
}
/* "Clear" button that resets the selected-page filter. */
.clear-filter-btn {
font-size: 0.7rem;
padding: 0.2rem 0.5rem;
border: 1px solid #ddd;
border-radius: 4px;
background: white;
color: #666;
cursor: pointer;
transition: all 0.15s;
}
.clear-filter-btn:hover {
background: #f0f0f0;
border-color: #bbb;
}
/* Informational strip under the Claims heading. NOTE: another .claims-note
   rule exists earlier in this file; with equal specificity the cascade
   resolves by source order, so keep this rule's position. */
.claims-note {
font-size: 0.75rem;
color: #888;
font-style: italic;
padding: 0.5rem 0.75rem;
background: #fafafa;
border-bottom: 1px solid #f0f0f0;
}
/* Dark mode for new elements */
/* Same selectors as above at equal specificity: these win only because
   they appear later in the file — do not reorder. */
@media (prefers-color-scheme: dark) {
.section-header-row {
background: #2a2a40;
border-color: #333;
}
.section-header-row h6 {
color: #ccc;
}
.clear-filter-btn {
background: #333;
border-color: #444;
color: #aaa;
}
.clear-filter-btn:hover {
background: #404050;
border-color: #555;
}
.claims-note {
background: #1a1a2e;
border-color: #333;
color: #777;
}
}

View file

@ -68,6 +68,10 @@ COUNTRY_NAMES = {
'DE': 'Germany',
'FR': 'France',
'GB': 'United Kingdom',
'EE': 'Estonia',
'PH': 'Philippines',
'CL': 'Chile',
'CH': 'Switzerland',
}
@ -184,11 +188,142 @@ def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, count
return None
# Brazil: GeoNames admin1_code → ISO 3166-2:BR state code
# Keys are GeoNames' historical, non-contiguous numbering for BR admin1
# units ('01'-'31' with gaps: 09, 10, 12 and 19 are unassigned), which is
# why e.g. Goiás='29' and Pernambuco='30' appear out of alphabetical order.
# Codes not present here fall through to the generic handling below
# (a 2-char numeric code would be returned uppercased as-is).
BRAZIL_STATE_CODES = {
'01': 'AC', # Acre
'02': 'AL', # Alagoas
'03': 'AP', # Amapá
'04': 'AM', # Amazonas
'05': 'BA', # Bahia
'06': 'CE', # Ceará
'07': 'DF', # Federal District (Distrito Federal)
'08': 'ES', # Espírito Santo
'11': 'MS', # Mato Grosso do Sul
'13': 'MA', # Maranhão
'14': 'MT', # Mato Grosso
'15': 'MG', # Minas Gerais
'16': 'PA', # Pará
'17': 'PB', # Paraíba
'18': 'PR', # Paraná
'20': 'PI', # Piauí
'21': 'RJ', # Rio de Janeiro
'22': 'RN', # Rio Grande do Norte
'23': 'RS', # Rio Grande do Sul
'24': 'RO', # Rondônia
'25': 'RR', # Roraima
'26': 'SC', # Santa Catarina
'27': 'SP', # São Paulo
'28': 'SE', # Sergipe
'29': 'GO', # Goiás
'30': 'PE', # Pernambuco
'31': 'TO', # Tocantins
}
# Switzerland: GeoNames admin1_code → ISO 3166-2:CH canton code
# Identity mapping: assumes the GeoNames data in use already carries the
# two-letter canton abbreviations as admin1_code for CH.
# NOTE(review): some GeoNames exports use numeric admin1 codes for CH —
# confirm against the geonames table actually loaded; numeric codes would
# currently fall through to the generic path instead of matching here.
SWITZERLAND_CANTON_CODES = {
'AG': 'AG', # Aargau
'AI': 'AI', # Appenzell Innerrhoden
'AR': 'AR', # Appenzell Ausserrhoden
'BE': 'BE', # Bern
'BL': 'BL', # Basel-Landschaft
'BS': 'BS', # Basel-Stadt
'FR': 'FR', # Fribourg
'GE': 'GE', # Geneva
'GL': 'GL', # Glarus
'GR': 'GR', # Graubünden
'JU': 'JU', # Jura
'LU': 'LU', # Lucerne
'NE': 'NE', # Neuchâtel
'NW': 'NW', # Nidwalden
'OW': 'OW', # Obwalden
'SG': 'SG', # St. Gallen
'SH': 'SH', # Schaffhausen
'SO': 'SO', # Solothurn
'SZ': 'SZ', # Schwyz
'TG': 'TG', # Thurgau
'TI': 'TI', # Ticino
'UR': 'UR', # Uri
'VD': 'VD', # Vaud
'VS': 'VS', # Valais
'ZG': 'ZG', # Zug
'ZH': 'ZH', # Zürich
}
# Mexico: GeoNames admin1_code → ISO 3166-2:MX state code
# Covers all 32 federative entities ('01'-'32').
# NOTE(review): the numbering below follows INEGI's Spanish-alphabetical
# order (Coahuila='05', Colima='06', Chiapas='07', Chihuahua='08').
# GeoNames' admin1 numbering for MX is commonly listed with Chiapas='05',
# Chihuahua='06', Coahuila='07', Colima='08' — verify which scheme the
# loaded geonames table uses before trusting codes 05-08.
MEXICO_STATE_CODES = {
'01': 'AGU', # Aguascalientes
'02': 'BCN', # Baja California
'03': 'BCS', # Baja California Sur
'04': 'CAM', # Campeche
'05': 'COA', # Coahuila
'06': 'COL', # Colima
'07': 'CHP', # Chiapas
'08': 'CHH', # Chihuahua
'09': 'CMX', # Ciudad de México (CDMX)
'10': 'DUR', # Durango
'11': 'GUA', # Guanajuato
'12': 'GRO', # Guerrero
'13': 'HID', # Hidalgo
'14': 'JAL', # Jalisco
'15': 'MEX', # México (State of Mexico)
'16': 'MIC', # Michoacán
'17': 'MOR', # Morelos
'18': 'NAY', # Nayarit
'19': 'NLE', # Nuevo León
'20': 'OAX', # Oaxaca
'21': 'PUE', # Puebla
'22': 'QUE', # Querétaro
'23': 'ROO', # Quintana Roo
'24': 'SLP', # San Luis Potosí
'25': 'SIN', # Sinaloa
'26': 'SON', # Sonora
'27': 'TAB', # Tabasco
'28': 'TAM', # Tamaulipas
'29': 'TLA', # Tlaxcala
'30': 'VER', # Veracruz
'31': 'YUC', # Yucatán
'32': 'ZAC', # Zacatecas
}
# Chile: GeoNames admin1_code → ISO 3166-2:CL region code
# Covers all 16 regions, keyed by the official region number ('01'-'16'),
# including the post-2018 additions Ñuble ('16'/NB) and
# Arica y Parinacota ('15'/AP).
CHILE_REGION_CODES = {
'01': 'TA', # Tarapacá
'02': 'AN', # Antofagasta
'03': 'AT', # Atacama
'04': 'CO', # Coquimbo
'05': 'VS', # Valparaíso
'06': 'LI', # Libertador General Bernardo O'Higgins
'07': 'ML', # Maule
'08': 'BI', # Biobío
'09': 'AR', # La Araucanía
'10': 'LL', # Los Lagos
'11': 'AI', # Aisén del General Carlos Ibáñez del Campo
'12': 'MA', # Magallanes y de la Antártica Chilena
'13': 'RM', # Región Metropolitana de Santiago
'14': 'LR', # Los Ríos
'15': 'AP', # Arica y Parinacota
'16': 'NB', # Ñuble
}
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
"""Get ISO-style region code from GeoNames admin1_code."""
if not admin1_code:
return 'XX'
# Country-specific mappings
if country_code == 'BR' and admin1_code in BRAZIL_STATE_CODES:
return BRAZIL_STATE_CODES[admin1_code]
if country_code == 'CH' and admin1_code in SWITZERLAND_CANTON_CODES:
return SWITZERLAND_CANTON_CODES[admin1_code]
if country_code == 'MX' and admin1_code in MEXICO_STATE_CODES:
return MEXICO_STATE_CODES[admin1_code]
if country_code == 'CL' and admin1_code in CHILE_REGION_CODES:
return CHILE_REGION_CODES[admin1_code]
# For most countries, use first 2-3 characters of admin1_code or name
if len(admin1_code) <= 3:
return admin1_code.upper()
@ -234,10 +369,11 @@ def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
old_ghcid = ghcid_data.get('ghcid_current', '')
result['old_ghcid'] = old_ghcid
# Match both patterns:
# 1. {country}-XX-XXX-... (no region, no city)
# 2. {country}-{region}-XXX-... (has region, no city)
xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
# Match patterns with XXX city code:
# - {country}-XX-XXX-... (2-letter region like XX, BE, GE)
# - {country}-10-XXX-... (2-digit region like 10, 52, 37)
# - {country}-UKM-XXX-... (3-letter region like UKM, IDF, CMX)
xxx_pattern = re.compile(rf'^{country_code}-[A-Z0-9]{{2,3}}-XXX-')
if not xxx_pattern.match(old_ghcid):
result['status'] = 'skipped'
result['error'] = f'Not a {country_code}-*-XXX file'

View file

@ -279,7 +279,27 @@ def build_ducklake_database(mapping: Dict[int, str]):
continue
# Insert pages
for page in metadata.get('pages', []):
pages = metadata.get('pages', [])
# Handle single-page archives (older format with 'files' key)
if not pages and 'files' in metadata:
# Create a synthetic page entry from the single-page fetch
files = metadata.get('files', {})
rendered_html = files.get('rendered_html')
if rendered_html:
pages = [{
'title': domain_folder.name, # Use domain as title
'source_path': 'index.html',
'archived_file': rendered_html,
'extractions_count': 0
}]
# Update the archive's total_pages count
con.execute("""
UPDATE web_archives SET total_pages = 1, processed_pages = 1
WHERE ghcid = ?
""", [ghcid])
for page in pages:
page_id += 1
try:
con.execute("""