47 lines
1.3 KiB
Python
47 lines
1.3 KiB
Python
#!/usr/bin/env python3
|
|
"""Test script for debugging link extraction."""
|
|
|
|
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
|
|
from scripts.batch_extract_mission_statements import (
|
|
get_api_tokens,
|
|
LinkupWebReader,
|
|
extract_links_from_markdown,
|
|
filter_mission_links,
|
|
MISSION_LINK_KEYWORDS
|
|
)
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
async def test(url='https://grachten.museum/', domain=None):
    """Fetch one page and print every stage of mission-link extraction.

    Debug aid: reads the page through ``LinkupWebReader``, prints the content
    length and a 1000-character preview, lists every link returned by
    ``extract_links_from_markdown`` together with its lower-cased URL path,
    prints ``MISSION_LINK_KEYWORDS``, and finally prints the links kept by
    ``filter_mission_links``.

    Args:
        url: Page to fetch; also passed to ``extract_links_from_markdown``
            as its second argument. Defaults to the site this script was
            originally written against.
        domain: Value passed as the second argument to
            ``filter_mission_links``. When ``None`` it is taken from the
            network-location part of ``url``.
            NOTE(review): what ``filter_mission_links`` expects here is not
            visible from this script -- pass ``domain`` explicitly if a URL
            with a port or ``www.`` prefix needs a bare host name.

    Raises:
        KeyError: If the mapping returned by ``get_api_tokens`` has no
            ``'linkup'`` entry.
    """
    if domain is None:
        # For the default URL this yields 'grachten.museum', the value that
        # used to be hard-coded at the filter_mission_links call.
        domain = urlparse(url).netloc

    tokens = get_api_tokens()
    reader = LinkupWebReader(tokens['linkup'])

    print("Fetching homepage...")
    result = await reader.read_webpage(url)
    # Treat a missing *or* None 'content' as an empty page so the dump below
    # still runs instead of failing in len().
    content = result.get('content') or ''

    print(f"Content length: {len(content)}")
    print(f"\n--- Content preview ---\n{content[:1000]}\n---\n")

    links = extract_links_from_markdown(content, url)
    print(f"\nFound {len(links)} links:")
    for link in links:
        parsed = urlparse(link)
        print(f" {link}")
        # The lower-cased path is shown so it can be compared by eye with the
        # keyword list printed below.
        print(f" path: {parsed.path.lower()}")

    print(f"\nMission link keywords: {MISSION_LINK_KEYWORDS}")

    mission_links = filter_mission_links(links, domain)
    print(f"\nFiltered to {len(mission_links)} mission links:")
    for link in mission_links:
        print(f" {link}")
|
|
|
|
|
|
# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    # Build the coroutine first, then hand it to a fresh event loop.
    debug_run = test()
    asyncio.run(debug_run)
|