feat: Reworked chunking and retrieval logic to operate on entire stories instead of chunks.

2025-07-01 13:08:45 +02:00
parent f093a488f3
commit b55fd6a021
8 changed files with 442 additions and 920 deletions
--- a/classify.py
+++ b/classify.py
@@ -0,0 +1,50 @@
+from langchain_core.documents import Document
+from langchain_core.language_models import BaseChatModel
+
+
+def categorize(doc: Document, llm: BaseChatModel) -> set[str]:
+    """Ask ``llm`` for up to three categories describing ``doc``.
+
+    Args:
+        doc: Document to classify. Its ``page_content`` and the ``title``
+            metadata entry (``'No title'`` when absent) go into the prompt.
+        llm: Chat model queried through ``with_structured_output`` in
+            ``json_mode``.
+
+    Returns:
+        The distinct, non-empty category names found in the reply, stripped
+        of surrounding whitespace. Empty when the reply contains no usable
+        category names.
+    """
+    prompt = f"""
+    Extract up to 3 relevant categories from the following document.
+    Return only the category names as a list of JSON strings.
+
+    If you cannot find any relevant categories, return an empty list.
+
+    Title: {doc.metadata.get('title', 'No title')}
+    Content: {doc.page_content}...
+
+    Categories:"""
+
+    # NOTE(review): no schema is passed, so this relies on the concrete chat
+    # model accepting ``json_mode`` without one -- confirm for the provider.
+    result = llm.with_structured_output(method="json_mode").invoke(prompt)
+
+    # The prompt asks for a bare JSON list, so accept that shape as well as
+    # an object carrying a "categories" key.
+    categories = result.get("categories") if isinstance(result, dict) else result
+    if isinstance(categories, str):
+        # A lone string would otherwise be split into single characters.
+        categories = [categories]
+    elif not isinstance(categories, (list, tuple, set, frozenset)):
+        # None, numbers, nested objects: nothing usable was returned.
+        return set()
+
+    # Skip non-string entries: they break the ``set[str]`` contract, and
+    # unhashable ones (dicts, lists) would make ``set()`` raise.
+    return {
+        name.strip()
+        for name in categories
+        if isinstance(name, str) and name.strip()
+    }