"""Asynchronous scrapers that download a URL and convert it to text."""

import asyncio
import logging
from abc import ABC, abstractmethod
from typing import Self, override

import httpx

# Strong references to close() tasks scheduled from ``__del__``. The event
# loop only keeps weak references to tasks, so an otherwise unreferenced task
# could be garbage-collected before it finishes closing the client.
_PENDING_CLOSE_TASKS: set[asyncio.Task[None]] = set()


class TextScraper(ABC):
    """Base class for scrapers that share a single ``httpx.AsyncClient``.

    Prefer deterministic cleanup, either ``async with scraper: ...`` or an
    explicit ``await scraper.close()``. The ``__del__`` hook is only a
    best-effort safety net.
    """

    def __init__(self):
        """Create the shared HTTP client with a 5-second timeout."""
        self._client = httpx.AsyncClient(timeout=httpx.Timeout(5.0))

    async def __aenter__(self) -> Self:
        """Return the scraper itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the HTTP client when the ``async with`` block exits."""
        await self.close()

    async def _fetch_text(self, url: str) -> str:
        """Fetch the raw HTML content from the URL.

        Args:
            url: Address to request with HTTP GET.

        Returns:
            The response body decoded as text.

        Raises:
            Exception: Whatever the request or ``raise_for_status()`` raised;
                the failure is logged as a warning and then re-raised.
        """
        response = None
        try:
            response = await self._client.get(url)
            response.raise_for_status()
            return response.text
        except Exception:
            logging.warning("Failed to fetch text from %s", url, exc_info=True)
            raise
        finally:
            # Compare against None explicitly: only "no response was ever
            # received" should skip the release, not the object's truthiness.
            if response is not None:
                await response.aclose()

    @abstractmethod
    async def get_content(self, url: str) -> str:
        """Return the text content extracted from ``url``."""
        ...

    async def close(self) -> None:
        """Close the underlying HTTP client.

        Safe to call more than once, and safe on an instance whose
        ``__init__`` did not get as far as creating the client.
        """
        client = getattr(self, "_client", None)
        if client is not None and not client.is_closed:
            await client.aclose()

    def __del__(self):
        """Best-effort close of the HTTP client when the object is deleted.

        If an event loop is running in the current thread, the close is
        scheduled on it. Otherwise a private temporary loop is used, so the
        thread's current-loop setting is left untouched. Failures are logged
        at debug level and never propagated.
        """
        try:
            client = getattr(self, "_client", None)
            if client is None or client.is_closed:
                return

            try:
                running_loop = asyncio.get_running_loop()
            except RuntimeError:
                # No loop is running in this thread.
                running_loop = None

            if running_loop is not None:
                task = running_loop.create_task(self.close())
                _PENDING_CLOSE_TASKS.add(task)
                task.add_done_callback(_PENDING_CLOSE_TASKS.discard)
                return

            # asyncio.get_event_loop() is deprecated without a running loop,
            # so drive close() on a short-lived loop of our own instead.
            temp_loop = asyncio.new_event_loop()
            try:
                temp_loop.run_until_complete(self.close())
            finally:
                temp_loop.close()
        except Exception:
            # Finalizers must not raise; keep the failure discoverable.
            logging.debug(
                "Best-effort close of HTTP client failed", exc_info=True
            )


class Html2textScraper(TextScraper):
    """Scraper that converts the fetched HTML with ``html2text``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return the result of ``html2text.html2text``.

        Raises:
            Exception: Propagated from ``_fetch_text`` when the fetch fails.
        """
        # Imported lazily so the dependency is only needed when this
        # scraper is actually used.
        import html2text

        html = await self._fetch_text(url)
        return html2text.html2text(html)


class ReadabilityScraper(TextScraper):
    """Scraper that summarizes the fetched HTML with ``readability``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return ``Document.summary(html_partial=True)``.

        Raises:
            Exception: Propagated from ``_fetch_text`` when the fetch fails.
        """
        # Imported lazily so the dependency is only needed when this
        # scraper is actually used.
        import readability

        doc = readability.Document(await self._fetch_text(url))
        return doc.summary(html_partial=True)


class JinaScraper(TextScraper):
    """Scraper that fetches pages through the ``https://r.jina.ai/`` prefix."""

    def __init__(self, api_key: str | None = None):
        """Create the client, adding a Bearer token header if a key is given.

        Args:
            api_key: Optional key sent as ``Authorization: Bearer <key>``.
        """
        super().__init__()
        if api_key:
            self._client.headers.update({"Authorization": f"Bearer {api_key}"})

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``https://r.jina.ai/<url>`` and return the response text.

        Unlike the other scrapers this one is best-effort: any failure is
        logged as a warning and an empty string is returned instead of
        raising.
        """
        # Progress goes through logging rather than stdout, like the rest of
        # this module.
        logging.info("Fetching content from: %s", url)
        try:
            return await self._fetch_text(f"https://r.jina.ai/{url}")
        except Exception:
            logging.warning(
                "Failed to fetch content from %s", url, exc_info=True
            )
            return ""