# RAG System Web App — upload a PDF, embed it, and answer questions with Gemini.
import csv
import os
import re

import google.generativeai as genai
import gradio as gr
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer
# Configure the Gemini API key.
# SECURITY FIX: the original hard-coded a live API key in source — a credential
# leak (anyone with the file can bill the account). Read it from the
# environment instead; set GOOGLE_API_KEY before launching the app.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", ""))
def extract_text_data(path):
    """Extract the concatenated text of every page in a PDF.

    Args:
        path: Filesystem path (or file-like object) accepted by ``PdfReader``.

    Returns:
        A single string with all pages' text joined in page order.
    """
    reader = PdfReader(path)
    # BUG FIX: extract_text() returns None for pages with no extractable text
    # (e.g. scanned/image-only pages); the original `text += ...` raised
    # TypeError there. Coalesce to "" and join once instead of quadratic +=.
    return "".join(page.extract_text() or "" for page in reader.pages)
def clean_text(text):
    """Normalize common PDF-extraction artifacts in *text*.

    Doubled paragraph separators become newlines, single ones become spaces,
    Unicode hyphens become ASCII '-', and escaped apostrophes are unescaped.
    """
    # Order matters: the doubled separator must be handled before the single one.
    replacements = (
        ("\u2029\u2029", "\n"),  # paragraph separator pair -> newline
        ("\u2029", " "),         # lone paragraph separator -> space
        ("\u2010", "-"),         # Unicode hyphen -> ASCII hyphen
        ("\\'", "'"),            # escaped apostrophe -> apostrophe
    )
    for old, new in replacements:
        text = text.replace(old, new)
    return text
def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Raw text; it is run through ``clean_text`` first.
        chunk_size: Number of words per chunk (must be positive).
        overlap: Words shared between consecutive chunks (must be < chunk_size).

    Returns:
        List of chunk strings; empty list for empty/whitespace-only input.

    Raises:
        ValueError: If the parameters would make the window never advance.
    """
    # BUG FIX: with overlap >= chunk_size the original while-loop advanced by
    # a non-positive step and never terminated. Validate up front.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = clean_text(text).split()  # split on words so no word is cut in half
    step = chunk_size - overlap
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]
# Cache of loaded SentenceTransformer models, keyed by model name.
_EMBEDDING_MODELS = {}


def generate_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode *chunks* into dense vectors with a sentence-transformer model.

    Args:
        chunks: List of strings to embed.
        model_name: Hugging Face model identifier.

    Returns:
        The array of embeddings produced by ``SentenceTransformer.encode``.
    """
    # PERFORMANCE FIX: the original re-instantiated (and re-downloaded/loaded)
    # the model on every call — including once per query in the pipeline.
    # Load each model at most once per process.
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name].encode(chunks)
def store_in_database(chunks, embeddings, db_path="embeddings.csv"):
    """Persist chunk texts and their embeddings to a CSV "database".

    Each row is ``text, "v1,v2,..."`` with the vector serialized as a
    comma-joined string. The file is overwritten on every call.

    Args:
        chunks: Iterable of chunk strings.
        embeddings: Parallel iterable of embedding vectors.
        db_path: Output CSV path (default kept for backward compatibility).
    """
    with open(db_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["text", "embedding"])
        for chunk, embedding in zip(chunks, embeddings):
            vector = np.asarray(embedding)
            writer.writerow([chunk, ",".join(map(str, vector))])
| def cosine_similarity(vector1, vector2): | |
| dot_product = np.dot(vector1, vector2) | |
| normVector1 = np.linalg.norm(vector1) | |
| normVector2 = np.linalg.norm(vector2) | |
| similarity = dot_product / (normVector1 * normVector2) | |
| return similarity | |
def load_from_database(filepath):
    """Load chunk texts and embeddings back from the CSV "database".

    Args:
        filepath: Path to a CSV written by ``store_in_database``.

    Returns:
        Tuple ``(chunks, embeddings)`` where chunks is a list of strings and
        embeddings is a 2-D ``np.ndarray`` (one row per chunk).
    """
    chunks = []
    embeddings = []
    # BUG FIX: the file is written with encoding="utf-8" but was read with the
    # platform default encoding, corrupting non-ASCII text on e.g. Windows.
    with open(filepath, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader)  # Skip the "text,embedding" header row
        for row in reader:
            chunks.append(row[0])
            embeddings.append(np.array(list(map(float, row[1].split(",")))))
    return chunks, np.array(embeddings)
def semantic_search(queryEmbedding, topK=5, db_path="embeddings.csv"):
    """Return the *topK* stored chunks most similar to *queryEmbedding*.

    Args:
        queryEmbedding: Embedding vector of the user query.
        topK: Maximum number of chunks to return.
        db_path: CSV database path (defaulted for backward compatibility;
            previously hard-coded).

    Returns:
        List of chunk strings, most similar first (fewer than topK if the
        database holds fewer chunks).
    """
    dbChunks, dbEmbeddings = load_from_database(db_path)
    similarities = [
        cosine_similarity(dbEmbedding, queryEmbedding)
        for dbEmbedding in dbEmbeddings
    ]
    # argsort is ascending; take the last topK indices and reverse them so the
    # best match comes first. Slicing handles topK > len(similarities) safely.
    topIndex = np.argsort(similarities)[-topK:][::-1]
    return [dbChunks[i] for i in topIndex]
def insert_in_LMM_prompt(retrievedContext, query, model_name="gemini-1.5-flash-001"):
    """Ground the query in the retrieved context and ask the Gemini model.

    Args:
        retrievedContext: Retrieved chunk text(s) interpolated into the prompt.
        query: The user's question.
        model_name: Gemini model identifier to instantiate.

    Returns:
        The model's answer text (``response.text``).

    Note:
        Performs a network call to the Generative AI API; requires
        ``genai.configure`` to have been called with a valid key.
    """
    prompt = f"""
You are a helpful and responsible AI assistant providing professional guidance for healthcare staff.
The user has provided a knowledge base with relevant medical training materials.
Use only the retrieved context below to answer the question factually and safely.
Context:
{retrievedContext}
Question:
{query}
Answer:
"""
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text
def pipeline(filePath, query):
    """Run the end-to-end RAG flow for one PDF and one question.

    Extracts the PDF text, chunks and embeds it, persists the embeddings,
    retrieves the chunks most relevant to *query*, and asks the LLM.

    Args:
        filePath: Path to the uploaded PDF.
        query: The user's question.

    Returns:
        The LLM's answer string.
    """
    raw_text = extract_text_data(filePath)
    doc_chunks = chunk_text(raw_text)
    store_in_database(doc_chunks, generate_embeddings(doc_chunks))
    query_vector = generate_embeddings([query])[0]
    relevant_chunks = semantic_search(query_vector)
    return insert_in_LMM_prompt(relevant_chunks, query)
def gradio_interface(file, question):
    """Gradio callback: run the RAG pipeline on the uploaded PDF.

    Args:
        file: Gradio file object (has a ``.name`` temp path), or None.
        question: Question text from the textbox.

    Returns:
        The pipeline's answer, or a friendly message if no file was uploaded.
    """
    # BUG FIX: Gradio passes None when no file is uploaded; the original then
    # crashed with AttributeError on ``file.name``.
    if file is None:
        return "Please upload a PDF file first."
    return pipeline(file.name, question)
# Create the Gradio interface: one PDF upload + one question textbox in,
# a single text answer out.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Textbox(label="Ask a Question")
    ],
    outputs="text",
    live=False,  # Only run on submit, not on every keystroke
    title="RAG System Web App",
    description="Upload a PDF and ask a question to extract information from it.",
    allow_flagging="never",  # Hide Gradio's built-in flagging button
)
# Launch the web server (blocks until the app is stopped).
iface.launch()