feat: Reworked chunking and retrieval logic to operate on entire stories instead of chunks.
This commit is contained in:
39
scrape.py
39
scrape.py
@@ -1,3 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import override
|
||||
|
||||
@@ -6,17 +8,41 @@ import httpx
|
||||
|
||||
class TextScraper(ABC):
    """Abstract base for scrapers that fetch text content over HTTP.

    Owns a shared ``httpx.AsyncClient``; subclasses implement
    :meth:`get_content` to turn a URL into extracted text.
    """

    def __init__(self):
        # 5-second timeout so a hung server cannot stall scraping forever.
        self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0))

    async def _fetch_text(self, url: str) -> str:
        """Fetch the raw HTML content from the URL.

        Raises:
            Exception: any network or HTTP-status error from httpx; the
                failure is logged with a traceback before re-raising.
        """
        response = None
        try:
            response = await self._client.get(url)
            response.raise_for_status()
            return response.text
        except Exception:
            # Lazy %-args: the message is only formatted if emitted.
            logging.warning("Failed to fetch text from %s", url, exc_info=True)
            raise
        finally:
            # Explicit `is not None`: whether the response is released must
            # not depend on the object's truth value, only on its existence.
            if response is not None:
                await response.aclose()

    @abstractmethod
    async def get_content(self, url: str) -> str: ...

    async def close(self):
        """Close the underlying HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    def __del__(self):
        """Ensure the HTTP client is closed when the object is deleted."""
        # Best-effort only: __del__ may run during interpreter shutdown,
        # where no usable event loop exists.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                loop.create_task(self.close())
            else:
                loop.run_until_complete(self.close())
        except Exception:
            # Deliberately swallowed: cleanup in __del__ must never raise.
            pass
||||
class Html2textScraper(TextScraper):
|
||||
@override
|
||||
@@ -39,12 +65,13 @@ class JinaScraper(TextScraper):
|
||||
def __init__(self, api_key: str | None = None):
    """Initialize the scraper, optionally authenticating to the Jina API.

    Args:
        api_key: Jina Reader API key; when provided it is sent as a
            Bearer token on every request.
    """
    super().__init__()
    if api_key:
        # Must be an f-string: the non-f literal "Bearer {api_key}" would
        # send the placeholder text instead of the actual key.
        self._client.headers.update({"Authorization": f"Bearer {api_key}"})
|
||||
|
||||
@override
async def get_content(self, url: str) -> str:
    """Return page text via the Jina Reader proxy, or "" on any failure.

    Args:
        url: The page to scrape; it is fetched through ``r.jina.ai``.

    Returns:
        The extracted text, or an empty string if the fetch fails.
    """
    # Use logging, not print(): library code should not write to stdout.
    logging.info("Fetching content from: %s", url)
    try:
        return await self._fetch_text(f"https://r.jina.ai/{url}")
    except Exception:
        # Best-effort contract: any scraping failure degrades to "".
        logging.warning("Failed to fetch content from %s", url, exc_info=True)
        return ""
|
||||
|
||||
Reference in New Issue
Block a user