HarshKalia-24 committed on
Commit
0fc7ca5
·
1 Parent(s): 5966700

some updates 2.0

Browse files
Files changed (2) hide show
  1. pipelines.py +21 -35
  2. requirements.txt +11 -20
pipelines.py CHANGED
@@ -1,17 +1,15 @@
1
  import os
2
- os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"
3
  import logging
4
  from haystack.utils import Secret
5
  from haystack.dataclasses import Document
6
  from haystack.document_stores.in_memory import InMemoryDocumentStore
7
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
8
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
9
-
10
- # CORRECT IMPORT FOR HAYSTACK 2.1.0
11
- from haystack.nodes.ranker import SentenceTransformersRanker
12
-
13
- from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
14
  from haystack.components.preprocessors import DocumentSplitter
 
 
 
 
15
 
16
  # Set up logging
17
  logger = logging.getLogger(__name__)
@@ -29,12 +27,12 @@ text_embedder = SentenceTransformersTextEmbedder(
29
  model="BAAI/bge-base-en-v1.5",
30
  use_gpu=False
31
  )
32
- retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=3)
33
 
34
- # Initialize ranker - DIFFERENT INITIALIZATION FOR 2.1.0
35
  reranker = SentenceTransformersRanker(
36
- model_name_or_path="cross-encoder/ms-marco-TinyBERT-L-2-v2",
37
- use_gpu=False
38
  )
39
 
40
  # Initialize generator
@@ -60,7 +58,8 @@ try:
60
  logger.info("Warming up components...")
61
  doc_embedder.warm_up()
62
  text_embedder.warm_up()
63
- reranker.prepared = True # Different warmup for 2.1.0
 
64
  logger.info("Components warmed up")
65
  except Exception as e:
66
  logger.error(f"Warmup failed: {e}")
@@ -76,22 +75,12 @@ def add_documents(texts: list[str], meta_list: list[dict]) -> int:
76
  if not docs:
77
  return 0
78
 
79
- split_result = splitter.run(docs)
80
- split_docs = split_result.get("documents", [])
81
 
82
  if not split_docs:
83
  return 0
84
 
85
- embedded_docs = []
86
- batch_size = 8
87
-
88
- for i in range(0, len(split_docs), batch_size):
89
- batch = split_docs[i:i+batch_size]
90
- try:
91
- embedded_batch = doc_embedder.run(batch).get("documents", [])
92
- embedded_docs.extend(embedded_batch)
93
- except Exception as e:
94
- logger.error(f"Embedding failed: {e}")
95
 
96
  if embedded_docs:
97
  document_store.write_documents(embedded_docs)
@@ -103,7 +92,7 @@ def query_rag(question: str, session_id: str) -> dict:
103
  if not question.strip():
104
  return {"answer": "Please provide a non-empty question.", "sources": []}
105
 
106
- embedding_result = text_embedder.run(question)
107
  query_emb = embedding_result.get("embedding")
108
 
109
  if not query_emb:
@@ -115,23 +104,20 @@ def query_rag(question: str, session_id: str) -> dict:
115
  if not retrieved_docs:
116
  return {"answer": "No documents found. Upload a file first.", "sources": []}
117
 
118
- # DIFFERENT USAGE FOR 2.1.0 RANKER
119
- reranked_docs = reranker.predict(
120
- query=question,
121
- documents=retrieved_docs[:5],
122
- top_k=3
123
- )
124
 
125
  context = "\n\n".join([doc.content for doc in reranked_docs])
126
- prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
127
 
128
- response = generator.run(parts=[prompt])
129
- answer = response.get("replies", [""])[0] if response and response.get("replies") else "No response"
130
 
131
  sources = [
132
  {
133
- "filename": d.meta.get("filename", "Unknown"),
134
- "page": d.meta.get("page", 1),
135
  "snippet": d.content[:200] + "..." if len(d.content) > 200 else d.content
136
  }
137
  for d in reranked_docs
 
1
  import os
 
2
  import logging
3
  from haystack.utils import Secret
4
  from haystack.dataclasses import Document
5
  from haystack.document_stores.in_memory import InMemoryDocumentStore
6
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
7
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
 
 
 
 
 
8
  from haystack.components.preprocessors import DocumentSplitter
9
+ from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
10
+
11
+ # ✅ CORRECTED IMPORT FOR HAYSTACK 2.x
12
+ from haystack.components.rankers import SentenceTransformersRanker
13
 
14
  # Set up logging
15
  logger = logging.getLogger(__name__)
 
27
  model="BAAI/bge-base-en-v1.5",
28
  use_gpu=False
29
  )
30
+ retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=5) # Retrieve more to give reranker more options
31
 
32
+ # ✅ CORRECTED INITIALIZATION FOR HAYSTACK 2.x RERANKER
33
  reranker = SentenceTransformersRanker(
34
+ model="cross-encoder/ms-marco-TinyBERT-L-2-v2",
35
+ top_k=3 # Set top_k during initialization or run
36
  )
37
 
38
  # Initialize generator
 
58
  logger.info("Warming up components...")
59
  doc_embedder.warm_up()
60
  text_embedder.warm_up()
61
+ # ✅ CORRECTED WARMUP FOR HAYSTACK 2.x
62
+ reranker.warm_up()
63
  logger.info("Components warmed up")
64
  except Exception as e:
65
  logger.error(f"Warmup failed: {e}")
 
75
  if not docs:
76
  return 0
77
 
78
+ split_docs = splitter.run(documents=docs).get("documents", [])
 
79
 
80
  if not split_docs:
81
  return 0
82
 
83
+ embedded_docs = doc_embedder.run(documents=split_docs).get("documents", [])
 
 
 
 
 
 
 
 
 
84
 
85
  if embedded_docs:
86
  document_store.write_documents(embedded_docs)
 
92
  if not question.strip():
93
  return {"answer": "Please provide a non-empty question.", "sources": []}
94
 
95
+ embedding_result = text_embedder.run(text=question)
96
  query_emb = embedding_result.get("embedding")
97
 
98
  if not query_emb:
 
104
  if not retrieved_docs:
105
  return {"answer": "No documents found. Upload a file first.", "sources": []}
106
 
107
+ # ✅ CORRECTED USAGE FOR HAYSTACK 2.x RERANKER
108
+ rerank_result = reranker.run(query=question, documents=retrieved_docs)
109
+ reranked_docs = rerank_result.get("documents", [])
 
 
 
110
 
111
  context = "\n\n".join([doc.content for doc in reranked_docs])
112
+ prompt = f"Given the following context, please answer the question.\n\nContext:\n{context}\n\nQuestion: {question}"
113
 
114
+ response = generator.run(prompt=prompt)
115
+ answer = response["replies"][0] if response.get("replies") else "Sorry, I couldn't generate an answer."
116
 
117
  sources = [
118
  {
119
+ "filename": d.meta.get("file_name", "Unknown"), # Standardized meta key
120
+ "page": d.meta.get("page_number", "N/A"),
121
  "snippet": d.content[:200] + "..." if len(d.content) > 200 else d.content
122
  }
123
  for d in reranked_docs
requirements.txt CHANGED
@@ -1,22 +1,13 @@
1
  # Core dependencies
2
- fastapi==0.111.0
3
- uvicorn==0.30.1
4
- python-multipart==0.0.9
5
- pillow==10.3.0
6
- pdfplumber==0.11.0
7
- pytesseract==0.3.10
 
8
 
9
- # Optimized sentence transformers
10
- sentence-transformers==3.0.1
11
- onnxruntime==1.17.3
12
-
13
- # Google AI and Haystack
14
- google-generativeai==0.7.2
15
- haystack-ai==2.1.0
16
- psutil==5.9.8
17
-
18
- # Compatible dependencies
19
- protobuf==4.25.3
20
- grpcio==1.64.0
21
- python-dotenv==1.0.1
22
- rpds-py==0.18.0
 
1
  # Core dependencies
2
+ fastapi
3
+ uvicorn
4
+ python-multipart
5
+ pillow
6
+ pdfplumber
7
+ pytesseract
8
+ python-dotenv
9
 
10
+ # Haystack and Integrations
11
+ haystack-ai
12
+ google-ai-haystack
13
+ sentence-transformers