fix: Fix aio resource leaks

This commit is contained in:
Adrian Rumpold
2025-07-02 12:22:05 +02:00
parent 87a17331fd
commit 259c9699ad

View File

@@ -1,4 +1,3 @@
import asyncio
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import override from typing import override
@@ -8,41 +7,22 @@ import httpx
class TextScraper(ABC): class TextScraper(ABC):
def __init__(self): def __init__(self):
self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0)) self._http_headers = {}
async def _fetch_text(self, url: str) -> str: async def _fetch_text(self, url: str) -> str:
"""Fetch the raw HTML content from the URL.""" """Fetch the raw HTML content from the URL."""
response = None
try: try:
response = await self._client.get(url) async with httpx.AsyncClient(headers=self._http_headers) as client:
response.raise_for_status() response = await client.get(url)
return response.text response.raise_for_status()
return response.text
except Exception: except Exception:
logging.warning(f"Failed to fetch text from {url}", exc_info=True) logging.warning(f"Failed to fetch text from {url}", exc_info=True)
raise raise
finally:
if response:
await response.aclose()
@abstractmethod @abstractmethod
async def get_content(self, url: str) -> str: ... async def get_content(self, url: str) -> str: ...
async def close(self):
"""Close the underlying HTTP client."""
if self._client and not self._client.is_closed:
await self._client.aclose()
def __del__(self):
"""Ensure the HTTP client is closed when the object is deleted."""
try:
loop = asyncio.get_event_loop()
if loop.is_running():
loop.create_task(self.close())
else:
loop.run_until_complete(self.close())
except Exception:
pass
class Html2textScraper(TextScraper): class Html2textScraper(TextScraper):
@override @override
@@ -65,7 +45,7 @@ class JinaScraper(TextScraper):
def __init__(self, api_key: str | None = None): def __init__(self, api_key: str | None = None):
super().__init__() super().__init__()
if api_key: if api_key:
self._client.headers.update({"Authorization": f"Bearer {api_key}"}) self._http_headers.update({"Authorization": f"Bearer {api_key}"})
@override @override
async def get_content(self, url: str) -> str: async def get_content(self, url: str) -> str: