NNEngine committed on
Commit
0ff7449
·
verified ·
1 Parent(s): 0334799

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import faiss
4
+ import torch
5
+
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
8
+
9
+
10
# =====================================
# 1. LOAD DOCUMENTS
# =====================================
def load_documents(path="documents.txt"):
    """Read *path* and return one cleaned document per non-blank line.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are dropped.
    """
    with open(path, "r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()
    cleaned = (line.strip() for line in raw_lines)
    return [line for line in cleaned if line]
17
+
18
# Load the corpus once at import time; the rest of the app treats it as immutable.
documents = load_documents()
19
+
20
+
21
# =====================================
# 2. LOAD EMBEDDING MODEL (HF Open Source)
# =====================================
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Embed the whole corpus up front so queries only need a single encode + search.
doc_embeddings = embedding_model.encode(documents, convert_to_numpy=True)
# Embedding width; used below to size the FAISS index.
dimension = doc_embeddings.shape[1]
28
+
29
+
30
# =====================================
# 3. BUILD FAISS INDEX
# =====================================
# Exact (brute-force) L2-distance index — no training step needed,
# appropriate for a small in-memory corpus.
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)
35
+
36
+
37
# =====================================
# 4. LOAD OPEN-SOURCE LLM (HF)
# =====================================
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # change if needed

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision on GPU to cut memory; full fp32 on CPU where fp16 is slow/unsupported.
# device_map="auto" lets accelerate place the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

# Single shared text-generation pipeline used by call_llm() below.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
55
+
56
+
57
# =====================================
# 5. RETRIEVAL FUNCTION
# =====================================
def retrieve(query, top_k=3):
    """Return up to ``top_k`` corpus documents most similar to ``query``.

    Similarity is L2 distance between sentence embeddings in the module-level
    FAISS index. Returns a list of document strings (possibly fewer than
    ``top_k`` when the corpus is small).
    """
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Bug fix: FAISS pads results with index -1 when k exceeds the number of
    # stored vectors, and documents[-1] would silently alias the last document.
    # Cap k at the corpus size and defensively drop any -1 placeholders.
    k = min(top_k, len(documents))
    if k <= 0:
        return []
    distances, indices = index.search(query_embedding, k)

    return [documents[i] for i in indices[0] if i >= 0]
66
+
67
+
68
# =====================================
# 6. GENERIC LLM CALL
# =====================================
def call_llm(prompt):
    """Generate a completion for ``prompt`` with the shared HF pipeline.

    Sampling is enabled (temperature 0.7, nucleus top_p 0.9) with at most
    300 new tokens. Returns only the newly generated text.
    """
    response = generator(
        prompt,
        max_new_tokens=300,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Bug fix: by default text-generation pipelines prepend the prompt to
        # "generated_text", so the UI would echo the entire RAG prompt back
        # to the user. Return only the completion.
        return_full_text=False,
    )

    return response[0]["generated_text"]
81
+
82
+
83
# =====================================
# 7. RAG PIPELINE
# =====================================
def rag_pipeline(query):
    """Answer ``query`` via retrieval-augmented generation.

    Fetches the most similar documents, packs them into a grounded prompt,
    and returns the LLM's completion.
    """
    # Retrieved documents become the grounding context, one per line.
    context = "\n".join(retrieve(query))

    prompt = f"""
You are a helpful AI assistant.
Answer ONLY from the provided context.

Context:
{context}

Question:
{query}

Answer:
"""

    return call_llm(prompt)
106
+
107
+
108
# =====================================
# 8. GRADIO UI
# =====================================
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Open Source RAG (HF Only)")

    # Single question box in, single answer box out.
    query = gr.Textbox(label="Ask your question")
    output = gr.Textbox(label="Answer")

    # Pressing Enter in the question box runs the full RAG pipeline.
    query.submit(rag_pipeline, query, output)

demo.launch()