from pdfminer.high_level import extract_text
def extract_pdf_text(file_path):
    return extract_text(file_path)
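# Example usage (hypothetical path; any PDF extracted into ./data below works):
# sample_text = extract_pdf_text('./data/example.pdf')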
import zipfile
import os
# Path to the uploaded zip file
zip_file_path = './data.zip'
extract_folder = './data'
# Unzip the file if the directory does not already exist
if not os.path.exists(extract_folder):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
pdf_folder = './data'

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Extract the text from each PDF file
pdf_texts = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    pdf_texts.append(extract_pdf_text(pdf_path))
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = [text_splitter.split_text(text) for text in pdf_texts]
chunks = [chunk for sublist in chunks for chunk in sublist] # Flatten list
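# With chunk_size=500 and chunk_overlap=50, a ~1,200-character document yields roughly
# three chunks of at most 500 characters, each sharing about 50 characters with its
# neighbour so that sentences cut at a boundary still appear whole in one chunk.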
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2') # You can replace this with another model if needed
# Get embeddings for the text chunks extracted from the PDFs above
embeddings = model.encode(chunks)
# Convert embeddings to a tensor and L2-normalize them (shown here for inspection;
# the FAISS index below is built from the raw embeddings array)
embeddings_tensor = torch.tensor(embeddings)
embeddings_tensor = F.normalize(embeddings_tensor, p=2, dim=-1)

# Print the embeddings tensor
print(embeddings_tensor)
import faiss
import numpy as np
dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))
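# IndexFlatL2 assigns ids in insertion order, so the integer ids returned by
# index.search map directly onto positions in the chunks list built above.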
def retrieve(query, k=5):
    # Embed the query with the same SentenceTransformer used for the chunks
    query_embedding = model.encode([query])[0]
    distances, indices = index.search(np.array([query_embedding]), k)
    return [chunks[i] for i in indices[0]]
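# Quick sanity check of retrieval (the query string is an illustrative placeholder,
# not drawn from the PDFs):
# print(retrieve("What topics do these documents cover?", k=3))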
from groq import Groq

# Read the Groq API key from the environment (set GROQ_API_KEY as a secret
# rather than hard-coding it in the source file)
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Define the chat completion request
def generate_response(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",  # Replace with your specific model if needed
    )
    return chat_completion.choices[0].message.content
# Example prompt
prompt = "Explain the importance of fast language models"
response = generate_response(prompt)
# Print the response
print(response)
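# One possible way to combine retrieval with generation (a sketch: the prompt template
# and the name rag_response are illustrative assumptions, not part of the original app;
# the Gradio interface below still calls generate_response directly):
def rag_response(query, k=3):
    # Stitch the retrieved chunks into a context block for the model
    context = "\n\n".join(retrieve(query, k=k))
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}"
    )
    return generate_response(prompt)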
import gradio as gr

def gradio_chatbot(query):
    return generate_response(query)

gr.Interface(fn=gradio_chatbot, inputs="text", outputs="text").launch()