from pdfminer.high_level import extract_text
def extract_pdf_text(file_path):
    return extract_text(file_path)
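# Example usage (hypothetical path; any PDF extracted into ./data below works):
# sample_text = extract_pdf_text('./data/example.pdf')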
import zipfile
import os
# Path to the uploaded zip file
zip_file_path = './data.zip'
extract_folder = './data'
# Unzip the file if the directory does not already exist
if not os.path.exists(extract_folder):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
pdf_folder = './data'

# List all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

# Extract the text from each PDF file
pdf_texts = []
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder, pdf_file)
    pdf_texts.append(extract_pdf_text(pdf_path))
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = [text_splitter.split_text(text) for text in pdf_texts]
chunks = [chunk for sublist in chunks for chunk in sublist] # Flatten list
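# With chunk_size=500 and chunk_overlap=50, a ~1,200-character document yields roughly
# three chunks of at most 500 characters, each sharing about 50 characters with its
# neighbour so that sentences cut at a boundary still appear whole in one chunk.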
from sentence_transformers import SentenceTransformer
import torch
import torch.nn.functional as F
# Load the model
model = SentenceTransformer('all-MiniLM-L6-v2') # You can replace this with another model if needed
# Get embeddings for the text chunks extracted from the PDFs above
embeddings = model.encode(chunks)
# Convert embeddings to a tensor and L2-normalize them (shown here for inspection;
# the FAISS index below is built from the raw embeddings array)
embeddings_tensor = torch.tensor(embeddings)
embeddings_tensor = F.normalize(embeddings_tensor, p=2, dim=-1)

# Print the embeddings tensor
print(embeddings_tensor)
import faiss
import numpy as np
dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))
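# IndexFlatL2 assigns ids in insertion order, so the integer ids returned by
# index.search map directly onto positions in the chunks list built above.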
def retrieve(query, k=5):
    # Embed the query with the same SentenceTransformer used for the chunks
    query_embedding = model.encode([query])[0]
    distances, indices = index.search(np.array([query_embedding]), k)
    return [chunks[i] for i in indices[0]]
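# Quick sanity check of retrieval (the query string is an illustrative placeholder,
# not drawn from the PDFs):
# print(retrieve("What topics do these documents cover?", k=3))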
from groq import Groq

# Read the Groq API key from the environment (set GROQ_API_KEY as a secret
# rather than hard-coding it in the source file)
client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Define the chat completion request
def generate_response(prompt):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-8b-8192",  # Replace with your specific model if needed
    )
    return chat_completion.choices[0].message.content
# Example prompt
prompt = "Explain the importance of fast language models"
response = generate_response(prompt)
# Print the response
print(response)
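# One possible way to combine retrieval with generation (a sketch: the prompt template
# and the name rag_response are illustrative assumptions, not part of the original app;
# the Gradio interface below still calls generate_response directly):
def rag_response(query, k=3):
    # Stitch the retrieved chunks into a context block for the model
    context = "\n\n".join(retrieve(query, k=k))
    prompt = (
        "Answer the question using only the context below.\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {query}"
    )
    return generate_response(prompt)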
import gradio as gr

def gradio_chatbot(query):
    return generate_response(query)

gr.Interface(fn=gradio_chatbot, inputs="text", outputs="text").launch()