HarshKalia-24 committed on
Commit
5966700
·
1 Parent(s): ef8bcc2

Fix Haystack 2.1.0 compatibility

Browse files
Files changed (1) hide show
  1. pipelines.py +22 -79
pipelines.py CHANGED
@@ -7,17 +7,8 @@ from haystack.document_stores.in_memory import InMemoryDocumentStore
7
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
8
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
9
 
10
- # Robust import handling for Haystack 2.1.0
11
- try:
12
- # Try the new import structure (Haystack >= 2.1.1)
13
- from haystack.components.rankers.sentence_transformers import SentenceTransformersSimilarityRanker
14
- except ImportError:
15
- try:
16
- # Try the direct import (Haystack 2.1.0)
17
- from haystack.components.rankers import SentenceTransformersSimilarityRanker
18
- except ImportError:
19
- # Fallback to legacy import
20
- from haystack.nodes.ranker import SentenceTransformersRanker as SentenceTransformersSimilarityRanker
21
 
22
  from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
23
  from haystack.components.preprocessors import DocumentSplitter
@@ -32,29 +23,19 @@ document_store = InMemoryDocumentStore()
32
  # Optimized for CPU
33
  doc_embedder = SentenceTransformersDocumentEmbedder(
34
  model="BAAI/bge-base-en-v1.5",
35
- use_gpu=False,
36
- onnx_execution_provider="CPUExecutionProvider"
37
  )
38
  text_embedder = SentenceTransformersTextEmbedder(
39
  model="BAAI/bge-base-en-v1.5",
40
- use_gpu=False,
41
- onnx_execution_provider="CPUExecutionProvider"
42
  )
43
  retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=3)
44
 
45
- # Initialize ranker based on what was imported
46
- if "SentenceTransformersRanker" in globals():
47
- # Using the legacy ranker
48
- reranker = SentenceTransformersRanker(
49
- model_name_or_path="cross-encoder/ms-marco-TinyBERT-L-2-v2",
50
- use_gpu=False
51
- )
52
- else:
53
- # Using the new ranker
54
- reranker = SentenceTransformersSimilarityRanker(
55
- model="cross-encoder/ms-marco-TinyBERT-L-2-v2",
56
- use_gpu=False
57
- )
58
 
59
  # Initialize generator
60
  generator = GoogleAIGeminiGenerator(
@@ -79,20 +60,13 @@ try:
79
  logger.info("Warming up components...")
80
  doc_embedder.warm_up()
81
  text_embedder.warm_up()
82
-
83
- # Handle different warm_up methods
84
- if hasattr(reranker, 'warm_up'):
85
- reranker.warm_up()
86
- elif hasattr(reranker, 'prepared'):
87
- reranker.prepared = True # Legacy versions didn't require warm_up
88
-
89
  logger.info("Components warmed up")
90
  except Exception as e:
91
  logger.error(f"Warmup failed: {e}")
92
 
93
  def add_documents(texts: list[str], meta_list: list[dict]) -> int:
94
  """Process and store documents with chunking"""
95
- # Create base documents
96
  docs = [
97
  Document(content=text, meta=meta)
98
  for text, meta in zip(texts, meta_list)
@@ -102,14 +76,12 @@ def add_documents(texts: list[str], meta_list: list[dict]) -> int:
102
  if not docs:
103
  return 0
104
 
105
- # Split into chunks
106
  split_result = splitter.run(docs)
107
  split_docs = split_result.get("documents", [])
108
 
109
  if not split_docs:
110
  return 0
111
 
112
- # Batch embedding with reduced batch size
113
  embedded_docs = []
114
  batch_size = 8
115
 
@@ -128,60 +100,34 @@ def add_documents(texts: list[str], meta_list: list[dict]) -> int:
128
  def query_rag(question: str, session_id: str) -> dict:
129
  """Query the RAG system with session filtering"""
130
  try:
131
- # Validate input
132
  if not question.strip():
133
- return {
134
- "answer": "Please provide a non-empty question.",
135
- "sources": []
136
- }
137
 
138
- # Embed question
139
  embedding_result = text_embedder.run(question)
140
  query_emb = embedding_result.get("embedding")
141
 
142
  if not query_emb:
143
- return {
144
- "answer": "Failed to process your question.",
145
- "sources": []
146
- }
147
 
148
- # Retrieve documents with session filter
149
  filters = {"field": "meta.session_id", "operator": "==", "value": session_id}
150
- retrieved_docs = retriever.run(
151
- query_embedding=query_emb,
152
- filters=filters
153
- ).get("documents", [])
154
 
155
  if not retrieved_docs:
156
- return {
157
- "answer": "No documents found for this session. Please upload a file first.",
158
- "sources": []
159
- }
160
 
161
- # Handle different ranker interfaces
162
- if hasattr(reranker, 'run'):
163
- # New interface
164
- reranked_docs = reranker.run(
165
- query=question,
166
- documents=retrieved_docs[:5]
167
- ).get("documents", [])[:3]
168
- else:
169
- # Legacy interface
170
- reranked_docs = reranker.predict(
171
- query=question,
172
- documents=retrieved_docs[:5],
173
- top_k=3
174
- )
175
 
176
- # Generate answer with context
177
  context = "\n\n".join([doc.content for doc in reranked_docs])
178
  prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
179
 
180
- # Handle generator response
181
  response = generator.run(parts=[prompt])
182
- answer = response.get("replies", [""])[0] if response and response.get("replies") else "No response generated"
183
 
184
- # Format sources
185
  sources = [
186
  {
187
  "filename": d.meta.get("filename", "Unknown"),
@@ -195,7 +141,4 @@ def query_rag(question: str, session_id: str) -> dict:
195
 
196
  except Exception as e:
197
  logger.exception(f"Query failed: {e}")
198
- return {
199
- "answer": "Sorry, I encountered an error processing your request.",
200
- "sources": []
201
- }
 
7
  from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder
8
  from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
9
 
10
+ # CORRECT IMPORT FOR HAYSTACK 2.1.0
11
+ from haystack.nodes.ranker import SentenceTransformersRanker
 
 
 
 
 
 
 
 
 
12
 
13
  from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
14
  from haystack.components.preprocessors import DocumentSplitter
 
23
  # Optimized for CPU
24
  doc_embedder = SentenceTransformersDocumentEmbedder(
25
  model="BAAI/bge-base-en-v1.5",
26
+ use_gpu=False
 
27
  )
28
  text_embedder = SentenceTransformersTextEmbedder(
29
  model="BAAI/bge-base-en-v1.5",
30
+ use_gpu=False
 
31
  )
32
  retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=3)
33
 
34
+ # Initialize ranker - DIFFERENT INITIALIZATION FOR 2.1.0
35
+ reranker = SentenceTransformersRanker(
36
+ model_name_or_path="cross-encoder/ms-marco-TinyBERT-L-2-v2",
37
+ use_gpu=False
38
+ )
 
 
 
 
 
 
 
 
39
 
40
  # Initialize generator
41
  generator = GoogleAIGeminiGenerator(
 
60
  logger.info("Warming up components...")
61
  doc_embedder.warm_up()
62
  text_embedder.warm_up()
63
+ reranker.prepared = True # Different warmup for 2.1.0
 
 
 
 
 
 
64
  logger.info("Components warmed up")
65
  except Exception as e:
66
  logger.error(f"Warmup failed: {e}")
67
 
68
  def add_documents(texts: list[str], meta_list: list[dict]) -> int:
69
  """Process and store documents with chunking"""
 
70
  docs = [
71
  Document(content=text, meta=meta)
72
  for text, meta in zip(texts, meta_list)
 
76
  if not docs:
77
  return 0
78
 
 
79
  split_result = splitter.run(docs)
80
  split_docs = split_result.get("documents", [])
81
 
82
  if not split_docs:
83
  return 0
84
 
 
85
  embedded_docs = []
86
  batch_size = 8
87
 
 
100
  def query_rag(question: str, session_id: str) -> dict:
101
  """Query the RAG system with session filtering"""
102
  try:
 
103
  if not question.strip():
104
+ return {"answer": "Please provide a non-empty question.", "sources": []}
 
 
 
105
 
 
106
  embedding_result = text_embedder.run(question)
107
  query_emb = embedding_result.get("embedding")
108
 
109
  if not query_emb:
110
+ return {"answer": "Failed to process your question.", "sources": []}
 
 
 
111
 
 
112
  filters = {"field": "meta.session_id", "operator": "==", "value": session_id}
113
+ retrieved_docs = retriever.run(query_embedding=query_emb, filters=filters).get("documents", [])
 
 
 
114
 
115
  if not retrieved_docs:
116
+ return {"answer": "No documents found. Upload a file first.", "sources": []}
 
 
 
117
 
118
+ # DIFFERENT USAGE FOR 2.1.0 RANKER
119
+ reranked_docs = reranker.predict(
120
+ query=question,
121
+ documents=retrieved_docs[:5],
122
+ top_k=3
123
+ )
 
 
 
 
 
 
 
 
124
 
 
125
  context = "\n\n".join([doc.content for doc in reranked_docs])
126
  prompt = f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
127
 
 
128
  response = generator.run(parts=[prompt])
129
+ answer = response.get("replies", [""])[0] if response and response.get("replies") else "No response"
130
 
 
131
  sources = [
132
  {
133
  "filename": d.meta.get("filename", "Unknown"),
 
141
 
142
  except Exception as e:
143
  logger.exception(f"Query failed: {e}")
144
+ return {"answer": "Sorry, I encountered an error.", "sources": []}