feat: Reworked chunking and retrieval logic to operate on entire stories instead of chunks.
This commit is contained in:
39
scrape.py
39
scrape.py
@@ -1,3 +1,5 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import override
|
||||
|
||||
@@ -6,17 +8,41 @@ import httpx
|
||||
|
||||
class TextScraper(ABC):
    """Abstract base for scrapers that fetch text content over HTTP.

    Owns a shared ``httpx.AsyncClient``; subclasses implement
    :meth:`get_content` to turn a URL into extracted text.
    """

    def __init__(self):
        # 5-second timeout so a hung server cannot stall scraping forever.
        self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0))

    async def _fetch_text(self, url: str) -> str:
        """Fetch the raw HTML content from the URL.

        Raises:
            Exception: any network or HTTP-status error from httpx; the
                failure is logged with a traceback before re-raising.
        """
        response = None
        try:
            response = await self._client.get(url)
            response.raise_for_status()
            return response.text
        except Exception:
            # Lazy %-args: the message is only formatted if emitted.
            logging.warning("Failed to fetch text from %s", url, exc_info=True)
            raise
        finally:
            # Explicit `is not None`: whether the response is released must
            # not depend on the object's truth value, only on its existence.
            if response is not None:
                await response.aclose()

    @abstractmethod
    async def get_content(self, url: str) -> str: ...

    async def close(self):
        """Close the underlying HTTP client."""
        if self._client and not self._client.is_closed:
            await self._client.aclose()

    def __del__(self):
        """Ensure the HTTP client is closed when the object is deleted."""
        # Best-effort only: __del__ may run during interpreter shutdown,
        # where no usable event loop exists.
        try:
            loop = asyncio.get_event_loop()
            if loop.is_running():
                loop.create_task(self.close())
            else:
                loop.run_until_complete(self.close())
        except Exception:
            # Deliberately swallowed: cleanup in __del__ must never raise.
            pass
||||
class Html2textScraper(TextScraper):
|
||||
@override
|
||||
@@ -39,12 +65,13 @@ class JinaScraper(TextScraper):
|
||||
def __init__(self, api_key: str | None = None):
    """Initialize the scraper, optionally authenticating to the Jina API.

    Args:
        api_key: Jina Reader API key; when provided it is sent as a
            Bearer token on every request.
    """
    super().__init__()
    if api_key:
        # Must be an f-string: the non-f literal "Bearer {api_key}" would
        # send the placeholder text instead of the actual key.
        self._client.headers.update({"Authorization": f"Bearer {api_key}"})
|
||||
|
||||
@override
async def get_content(self, url: str) -> str:
    """Return page text via the Jina Reader proxy, or "" on any failure.

    Args:
        url: The page to scrape; it is fetched through ``r.jina.ai``.

    Returns:
        The extracted text, or an empty string if the fetch fails.
    """
    # Use logging, not print(): library code should not write to stdout.
    logging.info("Fetching content from: %s", url)
    try:
        return await self._fetch_text(f"https://r.jina.ai/{url}")
    except Exception:
        # Best-effort contract: any scraping failure degrades to "".
        logging.warning("Failed to fetch content from %s", url, exc_info=True)
        return ""
|
||||
|
||||
Reference in New Issue
Block a user