fix: Fix async HTTP client resource leaks (use a per-request httpx.AsyncClient)
This commit is contained in:
32
scrape.py
32
scrape.py
@@ -1,4 +1,3 @@
|
|||||||
import asyncio
|
|
||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import override
|
from typing import override
|
||||||
@@ -8,41 +7,22 @@ import httpx
|
|||||||
|
|
||||||
class TextScraper(ABC):
|
class TextScraper(ABC):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0))
|
self._http_headers = {}
|
||||||
|
|
||||||
async def _fetch_text(self, url: str) -> str:
|
async def _fetch_text(self, url: str) -> str:
|
||||||
"""Fetch the raw HTML content from the URL."""
|
"""Fetch the raw HTML content from the URL."""
|
||||||
response = None
|
|
||||||
try:
|
try:
|
||||||
response = await self._client.get(url)
|
async with httpx.AsyncClient(headers=self._http_headers) as client:
|
||||||
response.raise_for_status()
|
response = await client.get(url)
|
||||||
return response.text
|
response.raise_for_status()
|
||||||
|
return response.text
|
||||||
except Exception:
|
except Exception:
|
||||||
logging.warning(f"Failed to fetch text from {url}", exc_info=True)
|
logging.warning(f"Failed to fetch text from {url}", exc_info=True)
|
||||||
raise
|
raise
|
||||||
finally:
|
|
||||||
if response:
|
|
||||||
await response.aclose()
|
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
async def get_content(self, url: str) -> str: ...
|
async def get_content(self, url: str) -> str: ...
|
||||||
|
|
||||||
async def close(self):
|
|
||||||
"""Close the underlying HTTP client."""
|
|
||||||
if self._client and not self._client.is_closed:
|
|
||||||
await self._client.aclose()
|
|
||||||
|
|
||||||
def __del__(self):
|
|
||||||
"""Ensure the HTTP client is closed when the object is deleted."""
|
|
||||||
try:
|
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
if loop.is_running():
|
|
||||||
loop.create_task(self.close())
|
|
||||||
else:
|
|
||||||
loop.run_until_complete(self.close())
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class Html2textScraper(TextScraper):
|
class Html2textScraper(TextScraper):
|
||||||
@override
|
@override
|
||||||
@@ -65,7 +45,7 @@ class JinaScraper(TextScraper):
|
|||||||
def __init__(self, api_key: str | None = None):
|
def __init__(self, api_key: str | None = None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
if api_key:
|
if api_key:
|
||||||
self._client.headers.update({"Authorization": f"Bearer {api_key}"})
|
self._http_headers.update({"Authorization": f"Bearer {api_key}"})
|
||||||
|
|
||||||
@override
|
@override
|
||||||
async def get_content(self, url: str) -> str:
|
async def get_content(self, url: str) -> str:
|
||||||
|
|||||||
Reference in New Issue
Block a user