In [1]:
from pathlib import Path
import os
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder,SentenceTransformersTextEmbedder
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever

HERE = Path(os.getcwd())
print(HERE)

data_path = HERE / "data"
file_paths = [str(data_path / name) for name in os.listdir(data_path)]

chroma_store = ChromaDocumentStore()

pipeline = Pipeline()
pipeline.add_component("FileTypeRouter", FileTypeRouter(mime_types=["text/plain", "application/pdf"]))
pipeline.add_component("TextFileConverter", TextFileToDocument())
pipeline.add_component("PdfFileConverter", PyPDFToDocument())

pipeline.add_component("Joiner", DocumentJoiner())
pipeline.add_component("Cleaner", DocumentCleaner())
pipeline.add_component("Splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30))
# pipeline.add_component("TextEmbedder", SentenceTransformersTextEmbedder())
pipeline.add_component("Embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))

pipeline.add_component("Writer", DocumentWriter(document_store=chroma_store))

pipeline.connect("FileTypeRouter.text/plain", "TextFileConverter.sources")
pipeline.connect("FileTypeRouter.application/pdf", "PdfFileConverter.sources")
pipeline.connect("TextFileConverter.documents", "Joiner.documents")
pipeline.connect("PdfFileConverter.documents", "Joiner.documents")
pipeline.connect("Joiner.documents", "Cleaner.documents")
pipeline.connect("Cleaner.documents", "Splitter.documents")
pipeline.connect("Splitter.documents", "Embedder.documents")
# pipeline.connect("TextEmbedder.embeddings", "Embedder.documents")
pipeline.connect("Embedder.documents", "Writer.documents")

pipeline.run(
 {"FileTypeRouter": {"sources": file_paths}},
)

# Querying pipeline
querying = Pipeline()
querying.add_component("retriever", ChromaQueryTextRetriever(chroma_store))
results = querying.run({"retriever": {"query": "Vishwam", "top_k": 3}})
print(results)


 from .autonotebook import tqdm as notebook_tqdm


/Users/adityasugandhi/Documents/GitHub/LLM_Playground


Batches: 100%|██████████| 1/1 [00:03<00:00, 3.22s/it]
/Users/adityasugandhi/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:11<00:00, 7.14MiB/s]


{'retriever': {'documents': [Document(id=fee80856fdb487fb694c739e089614d733502a7bd6d8b192f29ed6dad2088f44, content: 'Vishwam Shah is a highly motivated and skilled Computer Science professional currently pursuing a Ma...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/mf.txt', 'source_id': '99393e97120fcb9e88daa2d490060e9a91385ae63c7890d12b351978c02d3d93'}, score: 1.0066444873809814, embedding: vector of size 384), Document(id=e700bf2b5df175311a60ca00ffb6ed77b65b09c4221a2466b68e4802d90a831a, content: 'VISHWAM SHAH
Tallahassee, FL |shahvishwam7@gmail.com |+1 (850) 666 - 0095 |https://www.linkedin.com/...', meta: {'file_path': '/Users/adityasugandhi/Documents/GitHub/LLM_Playground/data/Resume_Vishwam_Shah_Back_end.pdf', 'source_id': 'd23089ee94ea955eb9ef0045999019220184668c96631b25686fc002722e8753'}, score: 1.5628944635391235, embedding: vector of size 384), Document(id=299afa7bfc84e7700fd38b178933ab2bf3a67b09298662651b173af03fde7968, content: ' The
“ECM

#Information Retriver

In [4]:
# # Querying pipeline
# querying = Pipeline()
# querying.add_component("retriever", ChromaQueryTextRetriever(chroma_store))
# results = querying.run({"retriever": {"query": "Aditya", "top_k": 3}})
# print(results)


{'retriever': {'documents': [Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.1221085786819458, embedding: vector of size 384), Document(id=11f7061bb8c56ae79965f1ba0d1a0283188dc031309394e1a03470d5d72207a9, content: 'Aditya Sugandhi is a seasoned Software Engineer with a rich background and diverse skill set, encomp...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_test.txt', 'source_id': 'c85a2287836cae980897693decb5e9d07e80f60b7c96b4e542ef3057e11fc228'}, score: 1.2236461639404297, embedding: vector of size 384), Document(id=a6ad41c3febd74d1f6825aac59c2d6dd7589ae8088bb3b449ea239c97d6f1b1c, content: ' . . . . . . . . . . . . . . . . .

In [28]:
from dotenv import load_dotenv

load_dotenv() 
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.generators import GPTGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator

template = """
 ` Answer the question using the provided context based on Aditya.

 Context:
 {% for context in answers %}
 {{ context }}
 {% endfor %}
 Question: {{question}}
 Answer:
 """

prompt_builder = PromptBuilder(template=template)
retriever = ChromaQueryTextRetriever(document_store = chroma_store)
api_key = os.environ.get("OPENAI_API_KEY")

#ExtractiveReader to extract answers from the relevant context
api_key = Secret.from_token(api_key)
llm = OpenAIGenerator(model="gpt-3.5-turbo-0125",api_key=api_key)
reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")

extractive_qa_pipeline = Pipeline()
extractive_qa_pipeline.add_component("retriever", retriever)
extractive_qa_pipeline.add_component('reader', reader)
extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
extractive_qa_pipeline.add_component("llm", llm)

extractive_qa_pipeline.connect("retriever.documents", "reader.documents")
extractive_qa_pipeline.connect("reader.answers", "prompt_builder.answers")
extractive_qa_pipeline.connect("prompt_builder", "llm")


query = "what is Aditya Pursuing ?"
print(query)
# Define the input data for the pipeline components
input_data = {
 "retriever": {"query": query, "top_k": 2},
 "reader": {"query": query, "top_k": 2},
 "prompt_builder": {"question": query},
 # Use 'max_tokens' instead of 'max_new_tokens'
}

# Run the pipeline with the updated input data
results = extractive_qa_pipeline.run(input_data)
print(results)

AttributeError: 'str' object has no attribute 'resolve_value'

In [5]:
import json

class ExtractedAnswerEncoder(json.JSONEncoder):
 def default(self, obj):
 if isinstance(obj, results):
 # Convert ExtractedAnswer to a dictionary
 return obj.__dict__
 return super().default(obj)
json_results = json.dumps(results, indent=2, cls=ExtractedAnswerEncoder)

print(json_results)

TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union

In [None]:
p