feat: Langchain HN RAG demo
This commit is contained in:
47
scrape.py
Normal file
47
scrape.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import override
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class TextScraper(ABC):
    """Abstract base for scrapers that turn a URL into a string of content.

    Every instance owns a single ``httpx.AsyncClient`` that is reused for all
    requests made through :meth:`_fetch_text`. The client holds open
    connections, so release it when done, either explicitly with
    ``await scraper.aclose()`` or by using the scraper as an async context
    manager::

        async with SomeScraper() as scraper:
            text = await scraper.get_content(url)
    """

    def __init__(self):
        self._client = httpx.AsyncClient()

    async def __aenter__(self) -> "TextScraper":
        """Return the scraper itself so it can be used with ``async with``."""
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        """Close the underlying HTTP client when the ``async with`` block ends."""
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client and release its connections."""
        await self._client.aclose()

    async def _fetch_text(self, url: str) -> str:
        """GET *url* and return the response body decoded as text.

        Raises whatever ``response.raise_for_status()`` raises for an
        unsuccessful status code.
        """
        # NOTE(review): the client is created with httpx defaults, which do
        # not follow redirects -- confirm that a 3xx response raising here is
        # the intended behaviour for a scraper.
        response = await self._client.get(url)
        response.raise_for_status()
        return response.text

    @abstractmethod
    async def get_content(self, url: str) -> str:
        """Return the content extracted from the page at *url*."""
        ...
|
||||
|
||||
|
||||
class Html2textScraper(TextScraper):
    """Scraper that passes the fetched page through ``html2text.html2text``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch *url* and return the result of ``html2text.html2text`` on it."""
        # Imported lazily so the dependency is only needed when this scraper
        # is actually used.
        import html2text

        page_html = await self._fetch_text(url)
        return html2text.html2text(page_html)
|
||||
|
||||
|
||||
class ReadabilityScraper(TextScraper):
    """Scraper that summarises the fetched page with ``readability.Document``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch *url* and return ``Document.summary(html_partial=True)`` for it."""
        # Imported lazily so the dependency is only needed when this scraper
        # is actually used.
        import readability

        page_html = await self._fetch_text(url)
        return readability.Document(page_html).summary(html_partial=True)
|
||||
|
||||
|
||||
class JinaScraper(TextScraper):
    """Scraper that fetches pages through the ``https://r.jina.ai/`` endpoint.

    The target URL is appended to the endpoint and the response body is
    returned as-is.
    """

    def __init__(self, api_key: str | None = None):
        """Create the scraper, optionally authenticating with *api_key*.

        When *api_key* is a non-empty string it is sent as a bearer token in
        the ``Authorization`` header of every request; otherwise requests are
        sent without that header.
        """
        super().__init__()
        if api_key:
            # Must be an f-string: a plain literal would send the text
            # "{api_key}" as the token instead of the key itself.
            self._client.headers.update({"Authorization": f"Bearer {api_key}"})

    @override
    async def get_content(self, url: str) -> str:
        """Return the body served by ``https://r.jina.ai/`` for *url*."""
        print(f"Fetching content from: {url}")
        return await self._fetch_text(f"https://r.jina.ai/{url}")
|
||||
Reference in New Issue
Block a user