TharaKavin commited on
Commit
4a8b2d1
Β·
verified Β·
1 Parent(s): d7430ac

Upload 6 files

Browse files
Files changed (6) hide show
  1. embedder.py +40 -0
  2. gradioui.py +50 -0
  3. llm.py +25 -0
  4. requirements.txt +10 -0
  5. scraper.py +17 -0
  6. utils.py +11 -0
embedder.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import faiss
3
+ import numpy as np
4
+
5
class VectorStore:
    """In-memory semantic search over text chunks.

    Embeds chunks with a SentenceTransformer model and indexes the vectors
    in a flat (exact) L2 FAISS index. The embedding model is loaded lazily
    on first use so constructing the store is cheap.
    """

    def __init__(self):
        self.index = None   # faiss.IndexFlatL2, built by create_index()
        self.chunks = []    # original texts, row-aligned with the index
        self.model = None   # SentenceTransformer, lazy-loaded

    def load_model(self):
        """Load the embedding model once; subsequent calls are no-ops."""
        if self.model is None:
            print("Loading model...")
            self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def create_index(self, chunks):
        """Embed `chunks` (list[str]) and (re)build the FAISS index.

        Raises:
            ValueError: if `chunks` is empty (FAISS cannot infer a dim).
        """
        if not chunks:
            raise ValueError("create_index() requires at least one chunk")
        self.load_model()
        self.chunks = chunks
        # FAISS requires a 2-D float32 array; atleast_2d covers the
        # single-chunk case where encode() may return a 1-D vector.
        embeddings = np.atleast_2d(
            np.asarray(self.model.encode(chunks), dtype="float32")
        )
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)

    def retrieve(self, query, k=3):
        """Return up to `k` chunks most similar to `query` (nearest first).

        Raises:
            ValueError: if called before create_index().
        """
        if self.index is None:
            raise ValueError("retrieve() called before create_index()")
        self.load_model()
        query_embedding = np.atleast_2d(
            np.asarray(self.model.encode([query]), dtype="float32")
        )
        # BUG FIX: when k > number of indexed vectors, FAISS pads the
        # result with -1 indices; chunks[-1] would then silently duplicate
        # the last chunk. Clamp k and drop any -1 padding defensively.
        k = min(k, len(self.chunks))
        _, indices = self.index.search(query_embedding, k)
        return [self.chunks[i] for i in indices[0] if i >= 0]
gradioui.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from scraper import scrape_url
3
+ from utils import chunk_text
4
+ from embedder import VectorStore
5
+ from llm import generate_answer
6
+
7
def rag_pipeline(url, question):
    """Answer `question` from the content of the page at `url` (full RAG flow).

    This is the Gradio callback, so it is a top-level error boundary:
    every failure is returned to the user as a "❌ ..." string rather
    than raised.
    """
    try:
        page_text = scrape_url(url)                      # 1. Scrape
        if not page_text.strip():
            return "❌ Failed to extract content. Try another site."

        pieces = chunk_text(page_text)                   # 2. Chunk
        if not pieces:
            return "❌ No usable content found."

        store = VectorStore()                            # 3. Embed
        store.create_index(pieces)

        relevant = store.retrieve(question)              # 4. Retrieve
        context = "\n".join(relevant)

        return generate_answer(context, question)        # 5. Generate

    except Exception as e:
        return f"❌ Error: {str(e)}"
36
+
37
+
38
# Gradio front end: a URL and a question go in, the model's answer comes out.
question_inputs = [
    gr.Textbox(label="🌐 Website URL"),
    gr.Textbox(label="❓ Ask a Question"),
]
answer_output = gr.Textbox(label="πŸ€– Answer")

iface = gr.Interface(
    fn=rag_pipeline,
    inputs=question_inputs,
    outputs=answer_output,
    title="🌐 Web RAG Chatbot",
    description="Ask questions about any website using AI",
)

if __name__ == "__main__":
    iface.launch()
llm.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from groq import Groq
import os
from dotenv import load_dotenv

# Populate os.environ from a local .env file (expects GROQ_API_KEY).
load_dotenv()

# Module-level client, shared by every generate_answer() call.
# NOTE(review): if GROQ_API_KEY is unset, getenv returns None and the
# failure surfaces later at request time — confirm this is intended.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
7
+
8
def generate_answer(context, question):
    """Ask the Groq-hosted model to answer `question` using only `context`.

    Returns the model's reply text. Network/auth errors from the Groq
    client propagate to the caller (the UI layer handles them).
    """
    prompt = f"""
You are an AI assistant. Answer ONLY from the given context.

Context:
{context}

Question:
{question}
"""

    messages = [{"role": "user", "content": prompt}]

    # Low temperature keeps the answer grounded in the retrieved context.
    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=messages,
        temperature=0.3,
    )

    return completion.choices[0].message.content
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ fastapi
3
+ uvicorn
4
+ scrapling[all]
5
+ sentence-transformers
6
+ faiss-cpu
7
+ groq
8
+ python-multipart
9
+ curl_cffi
10
+ python-dotenv
scraper.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from scrapling.fetchers import Fetcher
2
+
3
def scrape_url(url: str) -> str:
    """Fetch `url` and return its visible body text, space-joined.

    Best-effort by design: any fetch/parse failure is logged to stdout
    and an empty string is returned so the caller can show a friendly
    message instead of crashing.
    """
    try:
        page = Fetcher.get(url)

        # Collect every text node under <body>, stripping each fragment
        # and dropping whitespace-only ones.
        fragments = (t.strip() for t in page.css("body *::text").getall())
        return " ".join(fragment for fragment in fragments if fragment)

    except Exception as e:
        print("SCRAPING ERROR:", e)
        return ""
utils.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
def chunk_text(text, chunk_size=200):
    """Split `text` into chunks of at most `chunk_size` whitespace-delimited words.

    Parameters:
        text: arbitrary string; any run of whitespace separates words.
        chunk_size: maximum words per chunk (the final chunk may be shorter).
            Kept small (200) so each chunk fits comfortably in an
            embedding-model context.

    Returns:
        list[str] of space-joined chunks; [] for empty or whitespace-only text.

    Raises:
        ValueError: if chunk_size is not a positive integer.
    """
    # Guard explicitly: chunk_size=0 would raise an opaque error from
    # range(), and a negative value would silently return [].
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")

    words = text.split()
    # range(0, 0, n) is empty, so the empty-text case needs no special guard.
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]