Add readme
indexing.py (21 changed lines)
@@ -16,6 +16,11 @@ import weaviate
 from hn import HackerNewsClient, Story
 from scrape import JinaScraper
 
+NUM_STORIES = 20
+USER_PREFERENCES = []
+ENABLE_MLFLOW_TRACING = False  # Set to True if you want to use MLflow for tracing
+
+
 llm = langchain.chat_models.init_chat_model(
     model="gpt-4o-mini", model_provider="openai"
 )
@@ -180,14 +185,15 @@ async def fetch_hn_top_stories(
 
 
 async def main():
-    import mlflow
+    if ENABLE_MLFLOW_TRACING:
+        import mlflow
 
-    mlflow.set_tracking_uri("http://localhost:5000")
-    mlflow.set_experiment("langchain-rag-hn")
-    mlflow.langchain.autolog()
+        mlflow.set_tracking_uri("http://localhost:5000")
+        mlflow.set_experiment("langchain-rag-hn")
+        mlflow.langchain.autolog()
 
     # 1. Load only new stories
-    new_stories = await fetch_hn_top_stories(limit=20)
+    new_stories = await fetch_hn_top_stories(limit=NUM_STORIES)
 
     if new_stories:
         print(f"Processing {len(new_stories)} new stories")
@@ -204,7 +210,7 @@ async def main():
         documents_to_store = []
         for story in new_stories:
             # If article is short enough, store as-is
-            if len(story.page_content) <= 3000:  # Adjust threshold as needed
+            if len(story.page_content) <= 3000:
                 documents_to_store.append(story)
             else:
                 # For very long articles, chunk but keep story metadata
@@ -227,8 +233,7 @@ async def main():
         print("No new stories to process")
 
     # 4. Query
-    preferences = ["Software Engineering", "Machine Learning", "Games"]
-    run_query(preferences)
+    run_query(USER_PREFERENCES)
 
 
 if __name__ == "__main__":
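For context, a minimal sketch of how main() consumes the new module-level settings after this change. fetch_hn_top_stories() and run_query() are defined elsewhere in indexing.py, so the stubs below are placeholders (assumptions) that only let the sketch run on its own; the body of main() mirrors the diff above.

import asyncio

NUM_STORIES = 20
USER_PREFERENCES = []          # e.g. ["Software Engineering", "Machine Learning"]
ENABLE_MLFLOW_TRACING = False  # Set to True if you want to use MLflow for tracing


async def fetch_hn_top_stories(limit: int):
    # Placeholder stub for the real function defined in indexing.py.
    return []


def run_query(preferences):
    # Placeholder stub for the real query step defined in indexing.py.
    print(f"Querying for preferences: {preferences}")


async def main():
    if ENABLE_MLFLOW_TRACING:
        # MLflow is only imported and configured when tracing is switched on.
        import mlflow

        mlflow.set_tracking_uri("http://localhost:5000")
        mlflow.set_experiment("langchain-rag-hn")
        mlflow.langchain.autolog()

    # Fetch and process new stories, then run the query with the configured preferences.
    new_stories = await fetch_hn_top_stories(limit=NUM_STORIES)
    if new_stories:
        print(f"Processing {len(new_stories)} new stories")
    else:
        print("No new stories to process")

    run_query(USER_PREFERENCES)


if __name__ == "__main__":
    asyncio.run(main())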
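The @@ -204,7 +210,7 @@ hunk stores short articles as-is and chunks longer ones while keeping the story metadata. A hedged sketch of one way to do that with LangChain's RecursiveCharacterTextSplitter follows; the splitter choice, the chunk_long_story() helper, the 1000/200 chunk sizes, and treating Story objects as LangChain Documents are all assumptions, not taken from this repository.

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)


def chunk_long_story(story: Document, max_len: int = 3000) -> list[Document]:
    # Short articles are kept whole, matching the <= 3000 threshold in the diff.
    if len(story.page_content) <= max_len:
        return [story]
    # split_documents copies each input document's metadata onto every chunk,
    # so story fields such as title and URL survive the split.
    return splitter.split_documents([story])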