feat: Reworked chunking and retrieval logic to operate on entire stories instead of chunks.

This commit is contained in:
Adrian Rumpold
2025-07-01 13:08:45 +02:00
parent f093a488f3
commit b55fd6a021
8 changed files with 442 additions and 920 deletions

21
classify.py Normal file
View File

@@ -0,0 +1,21 @@
from langchain_core.documents import Document
from langchain_core.language_models import BaseChatModel
def categorize(doc: Document, llm: BaseChatModel) -> set[str]:
    """Ask the chat model for up to three categories describing ``doc``.

    The model is invoked in JSON mode and is asked for an object of the form
    ``{"categories": ["...", ...]}``. The reply is parsed defensively so that
    a malformed answer yields fewer (or no) categories instead of an
    exception.

    Args:
        doc: Document to classify. Its ``page_content`` and the ``title``
            metadata entry (``"No title"`` when absent) are embedded in the
            prompt.
        llm: Chat model used for the extraction.

    Returns:
        The distinct, non-empty category names with surrounding whitespace
        removed. Empty when the model returns nothing usable.
    """
    # The prompt names the "categories" key explicitly: the parser below reads
    # exactly that key, and JSON mode yields a JSON object rather than a bare
    # array, so asking for "a list" left the response shape up to the model.
    prompt = (
        "\n"
        "Extract up to 3 relevant categories from the following document.\n"
        'Return a JSON object with a single key "categories" whose value is '
        "the list of category names as JSON strings.\n"
        "If you cannot find any relevant categories, return an empty list.\n"
        f"Title: {doc.metadata.get('title', 'No title')}\n"
        f"Content: {doc.page_content}...\n"
        "Categories:"
    )

    # NOTE(review): calling with_structured_output() without a schema and with
    # method="json_mode" is provider-specific -- confirm that the configured
    # chat model supports this call signature.
    result = llm.with_structured_output(method="json_mode").invoke(prompt)

    # Accept the requested {"categories": [...]} object, but also tolerate a
    # bare list in case the model ignores the requested wrapper object.
    raw = result.get("categories") if isinstance(result, dict) else result
    if isinstance(raw, str):
        # A lone string is one category, not an iterable of characters.
        raw = [raw]
    elif not isinstance(raw, (list, tuple, set, frozenset)):
        # None (unparseable reply), numbers, nested objects: nothing usable.
        raw = []

    # Keep only real, non-blank strings: this honours the declared return type
    # and avoids a TypeError from unhashable entries such as nested lists.
    return {
        item.strip()
        for item in raw
        if isinstance(item, str) and item.strip()
    }