feat: Langchain HN RAG demo

This commit is contained in:
Adrian Rumpold
2025-07-01 09:26:52 +02:00
commit 648baf3263
6 changed files with 2193 additions and 0 deletions

47
scrape.py Normal file
View File

@@ -0,0 +1,47 @@
from abc import ABC, abstractmethod
from typing import override
import httpx
class TextScraper(ABC):
    """Abstract base for scrapers that turn a URL into a string of content.

    Every instance owns a single ``httpx.AsyncClient`` that is reused for all
    of its requests. Release it with :meth:`aclose`, or use the scraper as an
    async context manager (``async with SomeScraper() as s: ...``) so the
    client is closed automatically on exit.
    """

    def __init__(self):
        self._client = httpx.AsyncClient()

    async def __aenter__(self) -> "TextScraper":
        return self

    async def __aexit__(self, *exc_info: object) -> None:
        # Always close the client; exceptions from the body propagate.
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._client.aclose()

    async def _fetch_text(self, url: str) -> str:
        """Request ``url`` with GET and return the response body as text.

        Raises:
            httpx.HTTPStatusError: If ``raise_for_status()`` rejects the
                response's status code.
        """
        response = await self._client.get(url)
        response.raise_for_status()
        return response.text

    @abstractmethod
    async def get_content(self, url: str) -> str:
        """Return the content for ``url`` in the form the subclass produces."""
        ...
class Html2textScraper(TextScraper):
    """Scraper that runs the fetched page through ``html2text.html2text``."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return the ``html2text`` conversion of its body."""
        # Function-scope import: the module is loaded on first call rather
        # than when this file is imported.
        import html2text

        raw_page = await self._fetch_text(url)
        converted = html2text.html2text(raw_page)
        return converted
class ReadabilityScraper(TextScraper):
    """Scraper that returns the ``readability.Document`` summary of a page."""

    @override
    async def get_content(self, url: str) -> str:
        """Fetch ``url`` and return ``Document.summary(html_partial=True)``."""
        # Function-scope import: the module is loaded on first call rather
        # than when this file is imported.
        import readability

        raw_page = await self._fetch_text(url)
        document = readability.Document(raw_page)
        return document.summary(html_partial=True)
class JinaScraper(TextScraper):
    """Scraper that requests pages through ``https://r.jina.ai/``."""

    def __init__(self, api_key: str | None = None):
        """Create the scraper, optionally authenticating every request.

        Args:
            api_key: Key sent as a bearer token in the ``Authorization``
                header. When ``None`` or empty, no header is set.
        """
        super().__init__()
        if api_key:
            # Must be an f-string so the actual key is interpolated; a plain
            # string would send the literal text "{api_key}" as the token.
            self._client.headers.update({"Authorization": f"Bearer {api_key}"})

    @override
    async def get_content(self, url: str) -> str:
        """Return the response body for ``url`` requested via r.jina.ai."""
        print(f"Fetching content from: {url}")
        return await self._fetch_text(f"https://r.jina.ai/{url}")