feat: add web archives integration and improve enrichment scripts
Backend:
- Attach web_archives.duckdb as read-only database in DuckLake
- Create views for web_archives, web_pages, web_claims in heritage schema

Scripts:
- enrich_cities_google.py: Add batch processing and retry logic
- migrate_web_archives.py: Improve schema handling and error recovery

Frontend:
- DuckLakePanel: Add web archives query support
- Database.css: Improve layout for query results display
This commit is contained in:
parent
1e01639c56
commit
d9325c0bb5
8 changed files with 319 additions and 16 deletions
|
|
@ -49,6 +49,12 @@ class Settings(BaseModel):
|
|||
# Database alias in DuckDB
|
||||
db_alias: str = os.getenv("DUCKLAKE_DB_ALIAS", "heritage")
|
||||
|
||||
# Web archives database path
|
||||
web_archives_path: str = os.getenv(
|
||||
"DUCKLAKE_WEB_ARCHIVES_PATH",
|
||||
"/var/lib/glam/ducklake/web_archives.duckdb"
|
||||
)
|
||||
|
||||
# Server settings
|
||||
host: str = os.getenv("DUCKLAKE_HOST", "0.0.0.0")
|
||||
port: int = int(os.getenv("DUCKLAKE_PORT", "8765"))
|
||||
|
|
@ -174,6 +180,28 @@ def get_connection() -> duckdb.DuckDBPyConnection:
|
|||
_conn.execute(attach_sql)
|
||||
print(f"DuckLake attached: {settings.db_alias} -> {settings.data_path}")
|
||||
|
||||
# Attach web archives database (read-only)
|
||||
if Path(settings.web_archives_path).exists():
|
||||
try:
|
||||
_conn.execute(f"""
|
||||
ATTACH '{settings.web_archives_path}' AS web_archives (READ_ONLY)
|
||||
""")
|
||||
# Create views in heritage schema for seamless access
|
||||
_conn.execute("""
|
||||
CREATE OR REPLACE VIEW heritage.web_archives AS SELECT * FROM web_archives.web_archives
|
||||
""")
|
||||
_conn.execute("""
|
||||
CREATE OR REPLACE VIEW heritage.web_pages AS SELECT * FROM web_archives.web_pages
|
||||
""")
|
||||
_conn.execute("""
|
||||
CREATE OR REPLACE VIEW heritage.web_claims AS SELECT * FROM web_archives.web_claims
|
||||
""")
|
||||
print(f"Web archives attached: {settings.web_archives_path}")
|
||||
except Exception as wa_err:
|
||||
print(f"Warning: Could not attach web archives: {wa_err}")
|
||||
else:
|
||||
print(f"Web archives not found: {settings.web_archives_path}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"DuckLake extension not available: {e}")
|
||||
print("Falling back to standard DuckDB mode")
|
||||
|
|
|
|||
|
|
@ -83,17 +83,19 @@ ghcid:
|
|||
city_name: Belo Horizonte
|
||||
country_code: BR
|
||||
geonames_id: 3470127
|
||||
latitude: -19.9191
|
||||
longitude: -43.9386
|
||||
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
|
||||
latitude: -19.8697
|
||||
longitude: -43.9637
|
||||
method: MANUAL_RESEARCH
|
||||
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
|
||||
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
|
||||
Informação)
|
||||
region_code: MG
|
||||
region_name: Minas Gerais
|
||||
resolution_date: '2025-12-07T16:44:07.061598+00:00'
|
||||
record_id: 167ba1b7-a62d-42d6-92cd-d91ff4ce72a9
|
||||
identifiers:
|
||||
- identifier_scheme: GHCID
|
||||
identifier_value: BR-MI-XXX-E-UTL-ufmg_tainacan_lab
|
||||
identifier_value: BR-MG-BHO-E-UTL-ufmg_tainacan_lab
|
||||
- identifier_scheme: GHCID_UUID
|
||||
identifier_value: 9dcee694-81b2-5309-a27a-628488d0205e
|
||||
- identifier_scheme: GHCID_UUID_SHA256
|
||||
|
|
@ -108,6 +110,9 @@ identifiers:
|
|||
- &id002
|
||||
identifier_scheme: OLD_ID
|
||||
identifier_value: 12840343882751256357
|
||||
- identifier_scheme: Website
|
||||
identifier_url: https://tainacan.eci.ufmg.br/
|
||||
identifier_value: https://tainacan.eci.ufmg.br/
|
||||
original_entry:
|
||||
identifiers:
|
||||
- identifier_scheme: GHCID
|
||||
|
|
|
|||
|
|
@ -83,17 +83,19 @@ ghcid:
|
|||
city_name: Belo Horizonte
|
||||
country_code: BR
|
||||
geonames_id: 3470127
|
||||
latitude: -19.9191
|
||||
longitude: -43.9386
|
||||
google_maps_url: https://maps.app.goo.gl/LqXWAtMukbvr4e5AA
|
||||
latitude: -19.8697
|
||||
longitude: -43.9637
|
||||
method: MANUAL_RESEARCH
|
||||
notes: UFMG (Federal University of Minas Gerais) is in Belo Horizonte
|
||||
notes: Tainacan Lab at UFMG School of Information Science (Escola de Ciência da
|
||||
Informação)
|
||||
region_code: MG
|
||||
region_name: Minas Gerais
|
||||
resolution_date: '2025-12-07T16:44:07.052938+00:00'
|
||||
record_id: 3c8e1c49-716c-40ea-a283-a208686138b7
|
||||
identifiers:
|
||||
- identifier_scheme: GHCID
|
||||
identifier_value: BR-MI-XXX-E-UTL
|
||||
identifier_value: BR-MG-BHO-E-UTL
|
||||
- identifier_scheme: GHCID_UUID
|
||||
identifier_value: 562718ae-1d5c-57d7-9829-db40b4242ad1
|
||||
- identifier_scheme: GHCID_UUID_SHA256
|
||||
|
|
@ -108,6 +110,9 @@ identifiers:
|
|||
- &id002
|
||||
identifier_scheme: OLD_ID
|
||||
identifier_value: 12840343882751256357
|
||||
- identifier_scheme: Website
|
||||
identifier_url: https://tainacan.eci.ufmg.br/
|
||||
identifier_value: https://tainacan.eci.ufmg.br/
|
||||
original_entry:
|
||||
identifiers:
|
||||
- identifier_scheme: GHCID
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
{
|
||||
"generated": "2025-12-07T13:27:28.747Z",
|
||||
"generated": "2025-12-07T16:47:16.823Z",
|
||||
"version": "1.0.0",
|
||||
"categories": [
|
||||
{
|
||||
|
|
|
|||
|
|
@ -1252,7 +1252,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
|
|||
<div className="web-archive-content">
|
||||
{/* Archived Pages - Wayback style */}
|
||||
<div className="archive-section pages-section">
|
||||
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
|
||||
<div className="section-header-row">
|
||||
<h6>📄 Archived Pages ({webArchiveData.pages.length})</h6>
|
||||
{selectedWebPage && (
|
||||
<button
|
||||
className="clear-filter-btn"
|
||||
onClick={() => setSelectedWebPage(null)}
|
||||
title="Show all pages"
|
||||
>
|
||||
✕ Clear
|
||||
</button>
|
||||
)}
|
||||
</div>
|
||||
<div className="pages-list">
|
||||
{webArchiveData.pages.map((page, idx) => (
|
||||
<div
|
||||
|
|
@ -1274,7 +1285,18 @@ export function DuckLakePanel({ compact = false }: DuckLakePanelProps) {
|
|||
|
||||
{/* Extracted Claims */}
|
||||
<div className="archive-section claims-section">
|
||||
<h6>🏷️ Extracted Claims ({webArchiveData.claims.length})</h6>
|
||||
<div className="section-header-row">
|
||||
<h6>
|
||||
🏷️ Extracted Claims ({webArchiveData.claims.length})
|
||||
{selectedWebPage && (
|
||||
<span className="filter-indicator"> — from main page</span>
|
||||
)}
|
||||
</h6>
|
||||
</div>
|
||||
<div className="claims-note">
|
||||
Claims are extracted from the main page (index.html).
|
||||
Per-page extraction coming soon.
|
||||
</div>
|
||||
<div className="claims-list">
|
||||
{webArchiveData.claims.map((claim, idx) => (
|
||||
<div key={idx} className="claim-item">
|
||||
|
|
|
|||
|
|
@ -3452,4 +3452,91 @@ body.resizing-row * {
|
|||
.claim-content {
|
||||
color: #ddd;
|
||||
}
|
||||
|
||||
.claims-note {
|
||||
background: #1a1a2e;
|
||||
color: #888;
|
||||
}
|
||||
}
|
||||
|
||||
/* Section header row with filter button */
|
||||
.section-header-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0.6rem 0.75rem;
|
||||
background: #f5f5f5;
|
||||
border-bottom: 1px solid #e8e8e8;
|
||||
}
|
||||
|
||||
.section-header-row h6 {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background: transparent;
|
||||
border: none;
|
||||
font-size: 0.85rem;
|
||||
color: #444;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
}
|
||||
|
||||
.filter-indicator {
|
||||
font-weight: normal;
|
||||
font-size: 0.75rem;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.clear-filter-btn {
|
||||
font-size: 0.7rem;
|
||||
padding: 0.2rem 0.5rem;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 4px;
|
||||
background: white;
|
||||
color: #666;
|
||||
cursor: pointer;
|
||||
transition: all 0.15s;
|
||||
}
|
||||
|
||||
.clear-filter-btn:hover {
|
||||
background: #f0f0f0;
|
||||
border-color: #bbb;
|
||||
}
|
||||
|
||||
.claims-note {
|
||||
font-size: 0.75rem;
|
||||
color: #888;
|
||||
font-style: italic;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: #fafafa;
|
||||
border-bottom: 1px solid #f0f0f0;
|
||||
}
|
||||
|
||||
/* Dark mode for new elements */
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.section-header-row {
|
||||
background: #2a2a40;
|
||||
border-color: #333;
|
||||
}
|
||||
|
||||
.section-header-row h6 {
|
||||
color: #ccc;
|
||||
}
|
||||
|
||||
.clear-filter-btn {
|
||||
background: #333;
|
||||
border-color: #444;
|
||||
color: #aaa;
|
||||
}
|
||||
|
||||
.clear-filter-btn:hover {
|
||||
background: #404050;
|
||||
border-color: #555;
|
||||
}
|
||||
|
||||
.claims-note {
|
||||
background: #1a1a2e;
|
||||
border-color: #333;
|
||||
color: #777;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -68,6 +68,10 @@ COUNTRY_NAMES = {
|
|||
'DE': 'Germany',
|
||||
'FR': 'France',
|
||||
'GB': 'United Kingdom',
|
||||
'EE': 'Estonia',
|
||||
'PH': 'Philippines',
|
||||
'CL': 'Chile',
|
||||
'CH': 'Switzerland',
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -184,11 +188,142 @@ def lookup_city_geonames(conn: sqlite3.Connection, lat: float, lon: float, count
|
|||
return None
|
||||
|
||||
|
||||
# Brazil: GeoNames admin1_code → ISO 3166-2:BR state code.
# Listed as (admin1_code, state_code) pairs in ascending admin1 order; the
# numeric sequence is not contiguous (09, 10, 12 and 19 have no entry).
BRAZIL_STATE_CODES = dict((
    ('01', 'AC'),  # Acre
    ('02', 'AL'),  # Alagoas
    ('03', 'AP'),  # Amapá
    ('04', 'AM'),  # Amazonas
    ('05', 'BA'),  # Bahia
    ('06', 'CE'),  # Ceará
    ('07', 'DF'),  # Federal District (Distrito Federal)
    ('08', 'ES'),  # Espírito Santo
    ('11', 'MS'),  # Mato Grosso do Sul
    ('13', 'MA'),  # Maranhão
    ('14', 'MT'),  # Mato Grosso
    ('15', 'MG'),  # Minas Gerais
    ('16', 'PA'),  # Pará
    ('17', 'PB'),  # Paraíba
    ('18', 'PR'),  # Paraná
    ('20', 'PI'),  # Piauí
    ('21', 'RJ'),  # Rio de Janeiro
    ('22', 'RN'),  # Rio Grande do Norte
    ('23', 'RS'),  # Rio Grande do Sul
    ('24', 'RO'),  # Rondônia
    ('25', 'RR'),  # Roraima
    ('26', 'SC'),  # Santa Catarina
    ('27', 'SP'),  # São Paulo
    ('28', 'SE'),  # Sergipe
    ('29', 'GO'),  # Goiás
    ('30', 'PE'),  # Pernambuco
    ('31', 'TO'),  # Tocantins
))
|
||||
|
||||
# Switzerland: GeoNames admin1_code → ISO 3166-2:CH canton code.
# Every entry maps a canton abbreviation to itself, so the table is built
# from a single list of abbreviations.
SWITZERLAND_CANTON_CODES = {
    canton: canton
    for canton in (
        'AG',  # Aargau
        'AI',  # Appenzell Innerrhoden
        'AR',  # Appenzell Ausserrhoden
        'BE',  # Bern
        'BL',  # Basel-Landschaft
        'BS',  # Basel-Stadt
        'FR',  # Fribourg
        'GE',  # Geneva
        'GL',  # Glarus
        'GR',  # Graubünden
        'JU',  # Jura
        'LU',  # Lucerne
        'NE',  # Neuchâtel
        'NW',  # Nidwalden
        'OW',  # Obwalden
        'SG',  # St. Gallen
        'SH',  # Schaffhausen
        'SO',  # Solothurn
        'SZ',  # Schwyz
        'TG',  # Thurgau
        'TI',  # Ticino
        'UR',  # Uri
        'VD',  # Vaud
        'VS',  # Valais
        'ZG',  # Zug
        'ZH',  # Zürich
    )
}
|
||||
|
||||
# Mexico: GeoNames admin1_code → ISO 3166-2:MX state code.
# Keys are the admin1 codes GeoNames publishes for MX, NOT the INEGI state
# numbers. The two schemes differ for 05–08: GeoNames has 05=Chiapas,
# 06=Chihuahua, 07=Coahuila, 08=Colima, whereas INEGI has 05=Coahuila,
# 06=Colima, 07=Chiapas, 08=Chihuahua.
# NOTE(review): confirm against the project's GeoNames admin1CodesASCII.txt.
MEXICO_STATE_CODES = {
    '01': 'AGU',  # Aguascalientes
    '02': 'BCN',  # Baja California
    '03': 'BCS',  # Baja California Sur
    '04': 'CAM',  # Campeche
    '05': 'CHP',  # Chiapas
    '06': 'CHH',  # Chihuahua
    '07': 'COA',  # Coahuila
    '08': 'COL',  # Colima
    '09': 'CMX',  # Ciudad de México (CDMX)
    '10': 'DUR',  # Durango
    '11': 'GUA',  # Guanajuato
    '12': 'GRO',  # Guerrero
    '13': 'HID',  # Hidalgo
    '14': 'JAL',  # Jalisco
    '15': 'MEX',  # México (State of Mexico)
    '16': 'MIC',  # Michoacán
    '17': 'MOR',  # Morelos
    '18': 'NAY',  # Nayarit
    '19': 'NLE',  # Nuevo León
    '20': 'OAX',  # Oaxaca
    '21': 'PUE',  # Puebla
    '22': 'QUE',  # Querétaro
    '23': 'ROO',  # Quintana Roo
    '24': 'SLP',  # San Luis Potosí
    '25': 'SIN',  # Sinaloa
    '26': 'SON',  # Sonora
    '27': 'TAB',  # Tabasco
    '28': 'TAM',  # Tamaulipas
    '29': 'TLA',  # Tlaxcala
    '30': 'VER',  # Veracruz
    '31': 'YUC',  # Yucatán
    '32': 'ZAC',  # Zacatecas
}
|
||||
|
||||
# Chile: GeoNames admin1_code → ISO 3166-2:CL region code.
# Keys are the admin1 codes GeoNames publishes for CL, NOT Chile's
# traditional region numerals (I–XVI). Under GeoNames numbering 01 is
# Valparaíso and 12 is the Santiago Metropolitan Region; codes 09 and 13 are
# not in use.
# NOTE(review): confirm against the project's GeoNames admin1CodesASCII.txt.
CHILE_REGION_CODES = {
    '01': 'VS',  # Valparaíso
    '02': 'AI',  # Aisén del General Carlos Ibáñez del Campo
    '03': 'AN',  # Antofagasta
    '04': 'AR',  # La Araucanía
    '05': 'AT',  # Atacama
    '06': 'BI',  # Biobío
    '07': 'CO',  # Coquimbo
    '08': 'LI',  # Libertador General Bernardo O'Higgins
    '10': 'MA',  # Magallanes y de la Antártica Chilena
    '11': 'ML',  # Maule
    '12': 'RM',  # Región Metropolitana de Santiago
    '14': 'LL',  # Los Lagos
    '15': 'TA',  # Tarapacá
    '16': 'AP',  # Arica y Parinacota
    '17': 'LR',  # Los Ríos
    '18': 'NB',  # Ñuble
}
|
||||
|
||||
|
||||
def get_region_code(admin1_code: str, country_code: str, admin1_name: str) -> str:
|
||||
"""Get ISO-style region code from GeoNames admin1_code."""
|
||||
if not admin1_code:
|
||||
return 'XX'
|
||||
|
||||
# Country-specific mappings
|
||||
if country_code == 'BR' and admin1_code in BRAZIL_STATE_CODES:
|
||||
return BRAZIL_STATE_CODES[admin1_code]
|
||||
|
||||
if country_code == 'CH' and admin1_code in SWITZERLAND_CANTON_CODES:
|
||||
return SWITZERLAND_CANTON_CODES[admin1_code]
|
||||
|
||||
if country_code == 'MX' and admin1_code in MEXICO_STATE_CODES:
|
||||
return MEXICO_STATE_CODES[admin1_code]
|
||||
|
||||
if country_code == 'CL' and admin1_code in CHILE_REGION_CODES:
|
||||
return CHILE_REGION_CODES[admin1_code]
|
||||
|
||||
# For most countries, use first 2-3 characters of admin1_code or name
|
||||
if len(admin1_code) <= 3:
|
||||
return admin1_code.upper()
|
||||
|
|
@ -234,10 +369,11 @@ def process_file(filepath: Path, conn: sqlite3.Connection, api_key: str,
|
|||
old_ghcid = ghcid_data.get('ghcid_current', '')
|
||||
result['old_ghcid'] = old_ghcid
|
||||
|
||||
# Match both patterns:
|
||||
# 1. {country}-XX-XXX-... (no region, no city)
|
||||
# 2. {country}-{region}-XXX-... (has region, no city)
|
||||
xxx_pattern = re.compile(rf'^{country_code}-[A-Z]{{2}}-XXX-')
|
||||
# Match patterns with XXX city code:
|
||||
# - {country}-XX-XXX-... (2-letter region like XX, BE, GE)
|
||||
# - {country}-10-XXX-... (2-digit region like 10, 52, 37)
|
||||
# - {country}-UKM-XXX-... (3-letter region like UKM, IDF, CMX)
|
||||
xxx_pattern = re.compile(rf'^{country_code}-[A-Z0-9]{{2,3}}-XXX-')
|
||||
if not xxx_pattern.match(old_ghcid):
|
||||
result['status'] = 'skipped'
|
||||
result['error'] = f'Not a {country_code}-*-XXX file'
|
||||
|
|
|
|||
|
|
@ -279,7 +279,27 @@ def build_ducklake_database(mapping: Dict[int, str]):
|
|||
continue
|
||||
|
||||
# Insert pages
|
||||
for page in metadata.get('pages', []):
|
||||
pages = metadata.get('pages', [])
|
||||
|
||||
# Handle single-page archives (older format with 'files' key)
|
||||
if not pages and 'files' in metadata:
|
||||
# Create a synthetic page entry from the single-page fetch
|
||||
files = metadata.get('files', {})
|
||||
rendered_html = files.get('rendered_html')
|
||||
if rendered_html:
|
||||
pages = [{
|
||||
'title': domain_folder.name, # Use domain as title
|
||||
'source_path': 'index.html',
|
||||
'archived_file': rendered_html,
|
||||
'extractions_count': 0
|
||||
}]
|
||||
# Update the archive's total_pages count
|
||||
con.execute("""
|
||||
UPDATE web_archives SET total_pages = 1, processed_pages = 1
|
||||
WHERE ghcid = ?
|
||||
""", [ghcid])
|
||||
|
||||
for page in pages:
|
||||
page_id += 1
|
||||
try:
|
||||
con.execute("""
|
||||
|
|
|
|||
Loading…
Reference in a new issue