# Source: glam/scripts/test_link_extraction.py
# Last modified: 2025-12-30 23:07:03 +01:00
# 51 lines, 1.4 KiB, Python
#!/usr/bin/env python3
"""Test script for debugging link extraction."""
import asyncio
import os
import sys
from pathlib import Path
from urllib.parse import urlparse

# Make the project root importable and the current working directory,
# so the batch-extraction module and any relative paths it uses resolve.
PROJECT_ROOT = Path(__file__).parent.parent
sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)

# Must come after the sys.path/cwd setup above.
from batch_extract_mission_statements import (
    get_api_tokens,
    LinkupWebReader,
    extract_links_from_markdown,
    filter_mission_links,
    MISSION_LINK_KEYWORDS,
)
async def test() -> None:
    """Fetch a sample homepage and trace each stage of link extraction.

    Prints the fetched markdown content, every link extracted from it,
    and the subset that survives the mission-link keyword filter, so
    each stage of the pipeline can be inspected manually.

    NOTE(review): performs live network I/O via LinkupWebReader and
    requires a valid 'linkup' API token from get_api_tokens().
    """
    tokens = get_api_tokens()
    reader = LinkupWebReader(tokens['linkup'])

    print("Fetching homepage...")
    result = await reader.read_webpage('https://grachten.museum/')
    # Defensive .get(): tolerate a response dict without a 'content' key.
    content = result.get('content', '')
    print(f"Content length: {len(content)}")
    print(f"\n--- Content preview ---\n{content[:1000]}\n---\n")

    links = extract_links_from_markdown(content, 'https://grachten.museum/')
    print(f"\nFound {len(links)} links:")
    for link in links:
        parsed = urlparse(link)
        print(f"  {link}")
        print(f"    path: {parsed.path.lower()}")

    print(f"\nMission link keywords: {MISSION_LINK_KEYWORDS}")
    mission_links = filter_mission_links(links, 'grachten.museum')
    print(f"\nFiltered to {len(mission_links)} mission links:")
    for link in mission_links:
        print(f"  {link}")


if __name__ == "__main__":
    asyncio.run(test())