feat: Reworked chunking and retrieval logic to operate on entire stories instead of chunks.
This commit is contained in:
21
classify.py
Normal file
21
classify.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.language_models import BaseChatModel
|
||||
|
||||
|
||||
def categorize(doc: Document, llm: BaseChatModel) -> set[str]:
    """Ask *llm* for up to three categories describing *doc*.

    The model is invoked in JSON mode and instructed to answer with an
    object of the form ``{"categories": ["...", ...]}``.  The reply is
    normalised defensively, because JSON mode only guarantees syntactically
    valid JSON, not a particular schema.

    Args:
        doc: Document to classify.  ``doc.metadata["title"]`` (falling back
            to ``"No title"``) and ``doc.page_content`` are embedded in the
            prompt.
        llm: Chat model used for the extraction.
            NOTE(review): ``with_structured_output`` is called without a
            schema; this presumably relies on the concrete model class
            accepting ``method="json_mode"`` on its own -- confirm for the
            providers in use.

    Returns:
        The distinct, non-empty category names returned by the model, with
        surrounding whitespace removed.  An empty set when the model returns
        nothing usable.
    """
    title = doc.metadata.get('title', 'No title')

    # The instructions must name the exact JSON shape that is parsed below;
    # asking for a bare list while reading a "categories" key silently
    # yields no categories (or crashes on a list response).
    prompt = (
        "\n"
        "Extract up to 3 relevant categories from the following document.\n"
        'Return a JSON object with a single key "categories" whose value is '
        "a list of category-name strings.\n"
        "\n"
        "If you cannot find any relevant categories, use an empty list.\n"
        "\n"
        f"Title: {title}\n"
        f"Content: {doc.page_content}...\n"
        "\n"
        "Categories:"
    )

    # Get response from LLM
    result = llm.with_structured_output(method="json_mode").invoke(prompt)

    # Accept both the requested object form and a bare list, in case the
    # model ignores the key.
    raw = result.get("categories") if isinstance(result, dict) else result

    if isinstance(raw, str):
        # A lone string must not be exploded into characters by set().
        raw = [raw]
    elif not isinstance(raw, (list, tuple, set, frozenset)):
        return set()

    # Keep only real, non-blank strings; anything else (numbers, nested
    # objects, None) is not a category name and may be unhashable.
    return {
        entry.strip()
        for entry in raw
        if isinstance(entry, str) and entry.strip()
    }
|
||||
Reference in New Issue
Block a user