For text TextFileToDocument
for pdf PyPDFToDocument

In [None]:
import os
from haystack import Pipeline, Document
from haystack.components.converters import TextFileToDocument, PyPDFToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.readers import ExtractiveReader
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from pathlib import Path
HERE = Path(os.getcwd())
print(HERE)

data_path = HERE / "data"
file_paths = [data_path / Path(name) for name in os.listdir("data")]
print()
chroma_store = ChromaDocumentStore()
# Resolve the absolute path
# absolute_file_path = file_path.resolve()
# print(absolute_file_path)
pipeline = Pipeline()
pipeline.add_component("PdfFileConverter", PyPDFToDocument())
pipeline.add_component("TextFileConverter", TextFileToDocument())
pipeline.add_component("Pdfwriter_chroma", DocumentWriter(document_store=chroma_store))
pipeline.add_component("writer_chroma", DocumentWriter(document_store=chroma_store))

pipeline.connect("PdfFileConverter","Pdfwriter_chroma")
pipeline.connect("TextFileConverter", "writer_chroma")
pipeline.run(
 {"PdfFileConverter": {"sources": file_paths, "batch_size": 1}},
 {"TextFileConverter": {"sources": file_paths, "batch_size": 1}},
)
 
 
querying = Pipeline()
reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")
querying.add_component("retriever", ChromaQueryTextRetriever(chroma_store))
querying.add_component("reader",reader)
results = querying.run({"retriever": {"query": "Vishwam", "top_k": 3}})


/unity/f2/asugandhi/Downloads/LLM_Playground



ValueError: Input batch_size not found in component PdfFileConverter.

In [3]:
from pathlib import Path
import os
from haystack import Pipeline
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.readers import ExtractiveReader
from haystack.components.routers import FileTypeRouter
from haystack.components.joiners import DocumentJoiner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.chroma import ChromaDocumentStore
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever

HERE = Path(os.getcwd())
print(HERE)

data_path = HERE / "data"
file_paths = [str(data_path / name) for name in os.listdir(data_path)]

chroma_store = ChromaDocumentStore()

pipeline = Pipeline()
pipeline.add_component("FileTypeRouter", FileTypeRouter(mime_types=["text/plain", "application/pdf"]))
pipeline.add_component("TextFileConverter", TextFileToDocument())
pipeline.add_component("PdfFileConverter", PyPDFToDocument())
pipeline.add_component("Joiner", DocumentJoiner())
pipeline.add_component("Cleaner", DocumentCleaner())
pipeline.add_component("Splitter", DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30))
# pipeline.add_component("Embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2"))
pipeline.add_component("Writer", DocumentWriter(document_store=chroma_store))

pipeline.connect("FileTypeRouter.text/plain", "TextFileConverter.sources")
pipeline.connect("FileTypeRouter.application/pdf", "PdfFileConverter.sources")
pipeline.connect("TextFileConverter.documents", "Joiner.documents")
pipeline.connect("PdfFileConverter.documents", "Joiner.documents")
pipeline.connect("Joiner.documents", "Cleaner.documents")
pipeline.connect("Cleaner.documents", "Splitter.documents")
pipeline.connect("Splitter.documents", "Writer.documents")
# pipeline.connect("Embedder.documents", "Writer.documents")

pipeline.run(
 {"FileTypeRouter": {"sources": file_paths}},
)

# Querying pipeline
querying = Pipeline()
reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")
querying.add_component("retriever", ChromaQueryTextRetriever(chroma_store))
querying.add_component("reader", reader)
querying.connect("retriever", "reader")
query = "Who is Aditya?"
input_data = {
 "retriever": {"query": query, "top_k": 1},
 "reader": {"query": query},
 # Use 'max_tokens' instead of 'max_new_tokens'
 }
results = querying.run(input_data)
print(results)


/unity/f2/asugandhi/Downloads/LLM_Playground
{'reader': {'answers': [ExtractedAnswer(query='Who is Aditya?', score=0.6858945488929749, data='Software Engineer', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': '/unity/f2/asugandhi/Downloads/LLM_Playground/data/Aditya_train.txt', 'source_id': '228fb178549cb032d67e0b2da301131f48d7c88c814b6d6920c92727b1c8f5fd'}, score: 1.191292405128479, embedding: vector of size 384), context=None, document_offset=ExtractedAnswer.Span(start=31, end=48), context_offset=None, meta={}), ExtractedAnswer(query='Who is Aditya?', score=0.627069890499115, data='Sugandhi', document=Document(id=ce02ebe3fa97972f0c76b2c175f658873b2d0e19987e9cbc38dcacb25b8ebdba, content: 'Aditya Sugandhi's journey as a Software Engineer is characterized by a deep commitment to excellence...', meta: {'file_path': 

#DON'T RUN

In [7]:
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack.components.readers import ExtractiveReader
from haystack.components.generators import GPTGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.generators import OpenAIGenerator

template = """
Answer all the questions in the following format and based on Aditya.

Context:
{% for doc in documents %}
 {{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template)
retriever = ChromaQueryTextRetriever(document_store = chroma_store)
#ExtractiveReader to extract answers from the relevant context
api_key = Secret.from_token("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")
llm = OpenAIGenerator(model="gpt-3.5-turbo-0125",api_key=api_key)
reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled")

extractive_qa_pipeline = Pipeline()
extractive_qa_pipeline.add_component("retriever", retriever)
# extractive_qa_pipeline.add_component("reader",reader)
extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
extractive_qa_pipeline.add_component("llm", llm)


# extractive_qa_pipeline.connect("retriever", "reader")
extractive_qa_pipeline.connect("retriever", "prompt_builder.documents")
extractive_qa_pipeline.connect("prompt_builder", "llm")


query = "who is Aditya?"
print(query)
# Define the input data for the pipeline components
input_data = {
 "retriever": {"query": query, "top_k": 1},
 "prompt_builder": {"question": query},
 # Use 'max_tokens' instead of 'max_new_tokens'
}

# Run the pipeline with the updated input data
results = extractive_qa_pipeline.run(input_data)
print(results)

who is Aditya?
{'llm': {'replies': ['Aditya Sugandhi is a Software Engineer with a strong foundation in both theoretical knowledge and practical application, known for his commitment to excellence, passion for technological advancements, and dedication to pushing boundaries in software development. He has experience in various roles such as a Research Assistant, Full Stack Developer, Customer Service Executive, and Web Developer Intern. Aditya is currently pursuing a Master’s of Science in Computer Science at Florida State University and holds a Bachelor of Technology in Computer Science Engineering from SRM University. He is characterized by technical excellence, innovation, and a holistic understanding of software development. Aditya enjoys spending time with his friends SAS, Hunterr, MF, and Rocco.'], 'meta': [{'model': 'gpt-3.5-turbo-0125', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 138, 'prompt_tokens': 917, 'total_tokens': 1055}}]}}


In [20]:
from haystack import Pipeline
from haystack.utils import Secret
from haystack_integrations.components.retrievers.chroma import ChromaQueryTextRetriever
from haystack_integrations.components.generators.llama_cpp import LlamaCppGenerator
from haystack.components.readers import ExtractiveReader
from haystack.components.generators import GPTGenerator
from haystack.components.builders.prompt_builder import PromptBuilder
from haystack.components.builders.answer_builder import AnswerBuilder
from haystack.components.generators import OpenAIGenerator




template = """
Answer all the questions in the following format and based on Aditya 
and if not found generate answer accordingly using the given information.

Context:
{% for doc in documents %}
{{ doc.content }}
{% endfor %}
Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template)
retriever = ChromaQueryTextRetriever(document_store = chroma_store)
#ExtractiveReader to extract answers from the relevant context

llm = LlamaCppGenerator(
model_path="openchat-3.5-1210.Q3_K_S.ggml", 
n_ctx=10000,
n_batch=256,
model_kwargs={"n_gpu_layers": -1},
generation_kwargs={"max_tokens": 250, "temperature": 0.9},
)

reader = ExtractiveReader(model="deepset/roberta-base-squad2-distilled",)

extractive_qa_pipeline = Pipeline()
extractive_qa_pipeline.add_component("retriever", ChromaQueryTextRetriever(chroma_store))
# extractive_qa_pipeline.add_component("reader",reader)
extractive_qa_pipeline.add_component(instance=prompt_builder, name="prompt_builder")
extractive_qa_pipeline.add_component("llm", llm)
extractive_qa_pipeline.add_component(instance=AnswerBuilder(), name="answer_builder")

# extractive_qa_pipeline.connect("retriever.documents", "reader")
extractive_qa_pipeline.connect("retriever", "prompt_builder.documents") 
extractive_qa_pipeline.connect("prompt_builder", "llm")
extractive_qa_pipeline.connect("llm.replies", "answer_builder.replies")
extractive_qa_pipeline.connect("retriever", "answer_builder.documents")

query = "who is Aditya did Aditya Pursued his Masters from?"

# Define the input data for the pipeline components
input_data = {
 "retriever": {"query": query, "top_k": 3},
 # "reader": {"query": query},
 "prompt_builder": {"question": query},
 "answer_builder": {"query": query},
 # Use 'max_tokens' instead of 'max_new_tokens'
}

# Run the pipeline with the updated input data
results = extractive_qa_pipeline.run(input_data)

 
 

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from openchat-3.5-1210.Q3_K_S.ggml (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = openchat_openchat-3.5-1210
llama_model_loader: - kv 2: llama.context_length u32 = 8192
llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
llama_model_loader: - kv 4: llama.block_count u32 = 32
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000
llama_model_loader: - kv 11: general.fi

: 

In [19]:
# Assuming results is the dictionary containing the output
generated_content = results['llm']['meta'][0]['choices'][0]['text']
#print(results)
# Print the generated content
print(generated_content)


 Aditya pursued his Masters from Florida State University.
