From 87a17331fd1342b216a3a5b6f91b98dcecaaf022 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Tue, 1 Jul 2025 14:37:52 +0200
Subject: [PATCH] Improved retrieval step with relevance ranking

---
 indexing.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/indexing.py b/indexing.py
index 9fb2d4e..2f0c94f 100644
--- a/indexing.py
+++ b/indexing.py
@@ -18,7 +18,7 @@ from hn import HackerNewsClient, Story
 from scrape import JinaScraper
 
 NUM_STORIES = 20
-USER_PREFERENCES = ["Machine Learning", "Linux", "Open-Source"]
+USER_PREFERENCES = ["Machine Learning", "Programming", "Robotics"]
 
 ENABLE_SLACK = True  # Send updates to Slack, need to set SLACK_BOT_TOKEN env var
 ENABLE_MLFLOW_TRACING = False  # Use MLflow (at http://localhost:5000) for tracing
@@ -44,27 +44,47 @@ class State(TypedDict):
     summaries: list[dict]
 
 
-def retrieve(state: State, top_n: int = 5) -> State:
-    # Search for relevant documents
-    retrieved_docs = vector_store.similarity_search(
-        "Categories: " + ", ".join(state["preferences"]), k=20
-    )
+def retrieve(state: State, top_n: int = 2 * len(USER_PREFERENCES)) -> State:
+    # Search for relevant documents, with scores if the store supports them
+    query = (
+        "Show the most interesting articles about the following topics: "
+        + ", ".join(state["preferences"])
+    )
+    if hasattr(vector_store, "similarity_search_with_score"):
+        # Returns (Document, score) tuples
+        retrieved_docs = vector_store.similarity_search_with_score(
+            query,
+            k=top_n * 20,  # Chunks, not complete stories
+        )
+    else:
+        retrieved_docs = vector_store.similarity_search(query, k=top_n * 20)
 
-    # If you're using chunks, group them back into complete stories
+    # Unpack (doc, score) tuples if scores were returned; else use score None
+    docs_with_scores = []
+    if retrieved_docs and isinstance(retrieved_docs[0], tuple):
+        for doc, score in retrieved_docs:
+            docs_with_scores.append((doc, score))
+    else:
+        for doc in retrieved_docs:
+            docs_with_scores.append((doc, None))
+
+    # Group chunks by story_id and collect their scores
     story_groups = {}
-    for doc in retrieved_docs:
+    for doc, score in docs_with_scores:
         story_id = doc.metadata.get("story_id")
         if story_id not in story_groups:
             story_groups[story_id] = []
-        story_groups[story_id].append(doc)
+        story_groups[story_id].append((doc, score))
 
-    # Reconstruct complete stories or use the best chunk per story
+    # Take the max chunk score per story and reconstruct complete stories
     complete_stories = []
-    for story_id, chunks in story_groups.items():
+    for story_id, chunks_scores in story_groups.items():
+        chunks = [doc for doc, _ in chunks_scores]
+        scores = [s for _, s in chunks_scores if s is not None]
+        max_score = max(scores) if scores else None
         if len(chunks) == 1:
-            complete_stories.append(chunks[0])
+            complete_stories.append((chunks[0], max_score))
         else:
-            # Combine chunks back into complete story
             combined_content = "\n\n".join(
                 chunk.page_content
                 for chunk in sorted(
@@ -75,9 +95,21 @@ def retrieve(state: State, top_n: int = 5) -> State:
                 page_content=combined_content,
                 metadata=chunks[0].metadata,  # Use metadata from first chunk
             )
-            complete_stories.append(complete_story)
+            complete_stories.append((complete_story, max_score))
 
-    return {"context": complete_stories[:top_n]}
+    # Sort by max_score descending (assumes higher = more relevant; None last)
+    complete_stories_sorted = sorted(
+        complete_stories, key=lambda x: (x[1] is not None, x[1]), reverse=True
+    )
+
+    # Return the top_n stories, passing the rest of the state through unchanged
+    top_stories = [doc for doc, _ in complete_stories_sorted[:top_n]]
+    return {
+        "preferences": state["preferences"],
+        "context": top_stories,
+        "answer": state.get("answer", ""),
+        "summaries": state.get("summaries", []),
+    }
 
 
 def generate_structured_summaries(state: State):