58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
import logging
|
|
from abc import ABC, abstractmethod
|
|
from typing import override
|
|
|
|
import httpx
|
|
|
|
|
|
class TextScraper(ABC):
    """Abstract base for scrapers that turn a URL into text content.

    Subclasses implement :meth:`get_content`. The shared :meth:`_fetch_text`
    helper performs the HTTP GET using ``self._http_headers``.
    """

    def __init__(self):
        # Headers sent with every request made by ``_fetch_text``;
        # subclasses may add entries after calling ``super().__init__()``.
        self._http_headers = {}

    async def _fetch_text(self, url: str) -> str:
        """Fetch the response body of ``url`` as text.

        Args:
            url: Address to request with an HTTP GET.

        Returns:
            The decoded body of the final response.

        Raises:
            Exception: Any error raised while performing the request,
                including the error ``raise_for_status()`` raises for a
                non-success final status, is logged and re-raised unchanged.
        """
        try:
            # httpx does not follow redirects unless asked to, and
            # raise_for_status() rejects 3xx responses, so without this flag
            # every redirecting URL (http -> https, trailing slash, ...)
            # would be reported as a failure.
            async with httpx.AsyncClient(
                headers=self._http_headers,
                follow_redirects=True,
            ) as client:
                response = await client.get(url)
                response.raise_for_status()
                return response.text
        except Exception:
            # Use a module-named logger with lazy formatting rather than the
            # root-level helper, which implicitly configures the root logger.
            logging.getLogger(__name__).warning(
                "Failed to fetch text from %s", url, exc_info=True
            )
            raise

    @abstractmethod
    async def get_content(self, url: str) -> str:
        """Return the text content extracted from ``url``."""
        ...
|
|
|
|
|
|
class Html2textScraper(TextScraper):
    """Scraper that converts the fetched page to text with ``html2text``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return the result of ``html2text.html2text``.

        Errors from the import, the fetch, or the conversion propagate to
        the caller.
        """
        # Imported lazily, inside the method, as in the original code.
        import html2text

        convert = html2text.html2text
        page_source = await self._fetch_text(url)
        return convert(page_source)
|
|
|
|
|
|
class ReadabilityScraper(TextScraper):
    """Scraper that summarises the fetched page with ``readability``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return ``Document.summary(html_partial=True)``.

        Errors from the import, the fetch, or the summarisation propagate
        to the caller.
        """
        # Imported lazily, inside the method, as in the original code.
        import readability

        document_cls = readability.Document
        page_source = await self._fetch_text(url)
        return document_cls(page_source).summary(html_partial=True)
|
|
|
|
|
|
class JinaScraper(TextScraper):
    """Scraper that fetches page content through ``https://r.jina.ai/``."""

    def __init__(self, api_key: str | None = None):
        """Prepare the request headers.

        Args:
            api_key: Optional key. When truthy it is sent as a bearer token
                in the ``Authorization`` header of every request.
        """
        super().__init__()
        if api_key:
            self._http_headers.update({"Authorization": f"Bearer {api_key}"})

    @override
    async def get_content(self, url: str) -> str:
        """Return the text served by ``https://r.jina.ai/<url>``.

        Args:
            url: Address of the page whose content is wanted.

        Returns:
            The response text, or ``""`` when the request fails for any
            reason (the failure is logged, not raised).
        """
        logger = logging.getLogger(__name__)
        # Progress goes through logging rather than print() so library users
        # control where (and whether) it is shown.
        logger.info("Fetching content from: %s", url)
        try:
            return await self._fetch_text(f"https://r.jina.ai/{url}")
        except Exception:
            # Deliberately best-effort: report the failure and hand back an
            # empty string instead of propagating the error.
            logger.warning("Failed to fetch content from %s", url, exc_info=True)
            return ""
|