fix: Fix aio resource leaks
This commit is contained in:
		
							
								
								
									
										32
									
								
								scrape.py
									
									
									
									
									
								
							
							
						
						
									
										32
									
								
								scrape.py
									
									
									
									
									
								
							| @@ -1,4 +1,3 @@ | |||||||
| import asyncio |  | ||||||
| import logging | import logging | ||||||
| from abc import ABC, abstractmethod | from abc import ABC, abstractmethod | ||||||
| from typing import override | from typing import override | ||||||
| @@ -8,41 +7,22 @@ import httpx | |||||||
|  |  | ||||||
| class TextScraper(ABC): | class TextScraper(ABC): | ||||||
|     def __init__(self): |     def __init__(self): | ||||||
|         self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0)) |         self._http_headers = {} | ||||||
|  |  | ||||||
|     async def _fetch_text(self, url: str) -> str: |     async def _fetch_text(self, url: str) -> str: | ||||||
|         """Fetch the raw HTML content from the URL.""" |         """Fetch the raw HTML content from the URL.""" | ||||||
|         response = None |  | ||||||
|         try: |         try: | ||||||
|             response = await self._client.get(url) |             async with httpx.AsyncClient(headers=self._http_headers) as client: | ||||||
|             response.raise_for_status() |                 response = await client.get(url) | ||||||
|             return response.text |                 response.raise_for_status() | ||||||
|  |                 return response.text | ||||||
|         except Exception: |         except Exception: | ||||||
|             logging.warning(f"Failed to fetch text from {url}", exc_info=True) |             logging.warning(f"Failed to fetch text from {url}", exc_info=True) | ||||||
|             raise |             raise | ||||||
|         finally: |  | ||||||
|             if response: |  | ||||||
|                 await response.aclose() |  | ||||||
|  |  | ||||||
|     @abstractmethod |     @abstractmethod | ||||||
|     async def get_content(self, url: str) -> str: ... |     async def get_content(self, url: str) -> str: ... | ||||||
|  |  | ||||||
|     async def close(self): |  | ||||||
|         """Close the underlying HTTP client.""" |  | ||||||
|         if self._client and not self._client.is_closed: |  | ||||||
|             await self._client.aclose() |  | ||||||
|  |  | ||||||
|     def __del__(self): |  | ||||||
|         """Ensure the HTTP client is closed when the object is deleted.""" |  | ||||||
|         try: |  | ||||||
|             loop = asyncio.get_event_loop() |  | ||||||
|             if loop.is_running(): |  | ||||||
|                 loop.create_task(self.close()) |  | ||||||
|             else: |  | ||||||
|                 loop.run_until_complete(self.close()) |  | ||||||
|         except Exception: |  | ||||||
|             pass |  | ||||||
|  |  | ||||||
|  |  | ||||||
| class Html2textScraper(TextScraper): | class Html2textScraper(TextScraper): | ||||||
|     @override |     @override | ||||||
| @@ -65,7 +45,7 @@ class JinaScraper(TextScraper): | |||||||
|     def __init__(self, api_key: str | None = None): |     def __init__(self, api_key: str | None = None): | ||||||
|         super().__init__() |         super().__init__() | ||||||
|         if api_key: |         if api_key: | ||||||
|             self._client.headers.update({"Authorization": f"Bearer {api_key}"}) |             self._http_headers.update({"Authorization": f"Bearer {api_key}"}) | ||||||
|  |  | ||||||
|     @override |     @override | ||||||
|     async def get_content(self, url: str) -> str: |     async def get_content(self, url: str) -> str: | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user