From 87a17331fd1342b216a3a5b6f91b98dcecaaf022 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold
Date: Tue, 1 Jul 2025 14:37:52 +0200
Subject: [PATCH] Improved retrieval step with relevance ranking

---
 indexing.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 15 deletions(-)

diff --git a/indexing.py b/indexing.py
index 9fb2d4e..2f0c94f 100644
--- a/indexing.py
+++ b/indexing.py
@@ -18,7 +18,7 @@ from hn import HackerNewsClient, Story
 from scrape import JinaScraper
 
 NUM_STORIES = 20
-USER_PREFERENCES = ["Machine Learning", "Linux", "Open-Source"]
+USER_PREFERENCES = ["Machine Learning", "Programming", "Robotics"]
 
 ENABLE_SLACK = True  # Send updates to Slack, need to set SLACK_BOT_TOKEN env var
 ENABLE_MLFLOW_TRACING = False  # Use MLflow (at http://localhost:5000) for tracing
@@ -44,27 +44,47 @@ class State(TypedDict):
     summaries: list[dict]
 
 
-def retrieve(state: State, top_n: int = 5) -> State:
-    # Search for relevant documents
-    retrieved_docs = vector_store.similarity_search(
-        "Categories: " + ", ".join(state["preferences"]), k=20
-    )
+def retrieve(state: State, top_n: int = 2 * len(USER_PREFERENCES)) -> State:
+    # Search for relevant documents, with scores if the store supports them
+    query = (
+        "Show the most interesting articles about the following topics: "
+        + ", ".join(state["preferences"])
+    )
+    if hasattr(vector_store, "similarity_search_with_score"):
+        # Returns (Document, score) tuples
+        retrieved_docs = vector_store.similarity_search_with_score(
+            query,
+            k=top_n * 20,  # Chunks, not complete stories
+        )
+    else:
+        retrieved_docs = vector_store.similarity_search(query, k=top_n * 20)
 
-    # If you're using chunks, group them back into complete stories
+    # Unpack (doc, score) tuples if scores were returned; else use score None
+    docs_with_scores = []
+    if retrieved_docs and isinstance(retrieved_docs[0], tuple):
+        for doc, score in retrieved_docs:
+            docs_with_scores.append((doc, score))
+    else:
+        for doc in retrieved_docs:
+            docs_with_scores.append((doc, None))
+
+    # Group chunks by story_id and collect their scores
     story_groups = {}
-    for doc in retrieved_docs:
+    for doc, score in docs_with_scores:
         story_id = doc.metadata.get("story_id")
         if story_id not in story_groups:
             story_groups[story_id] = []
-        story_groups[story_id].append(doc)
+        story_groups[story_id].append((doc, score))
 
-    # Reconstruct complete stories or use the best chunk per story
+    # Take the max chunk score per story and reconstruct complete stories
     complete_stories = []
-    for story_id, chunks in story_groups.items():
+    for story_id, chunks_scores in story_groups.items():
+        chunks = [doc for doc, _ in chunks_scores]
+        scores = [s for _, s in chunks_scores if s is not None]
+        max_score = max(scores) if scores else None
         if len(chunks) == 1:
-            complete_stories.append(chunks[0])
+            complete_stories.append((chunks[0], max_score))
         else:
-            # Combine chunks back into complete story
             combined_content = "\n\n".join(
                 chunk.page_content
                 for chunk in sorted(
@@ -75,9 +95,21 @@ def retrieve(state: State, top_n: int = 5) -> State:
                 page_content=combined_content,
                 metadata=chunks[0].metadata,  # Use metadata from first chunk
             )
-            complete_stories.append(complete_story)
+            complete_stories.append((complete_story, max_score))
 
-    return {"context": complete_stories[:top_n]}
+    # Sort by max_score descending (assumes higher = more relevant; None last)
+    complete_stories_sorted = sorted(
+        complete_stories, key=lambda x: (x[1] is not None, x[1]), reverse=True
+    )
+
+    # Return the top_n stories, passing the rest of the state through unchanged
+    top_stories = [doc for doc, _ in complete_stories_sorted[:top_n]]
+    return {
+        "preferences": state["preferences"],
+        "context": top_stories,
+        "answer": state.get("answer", ""),
+        "summaries": state.get("summaries", []),
+    }
 
 
 def generate_structured_summaries(state: State):