antitheft159 committed on
Commit
478ef55
1 Parent(s): 02e8704

Upload laurelstring_gpt2_tttg_159.py

Browse files
Files changed (1) hide show
  1. laurelstring_gpt2_tttg_159.py +207 -0
laurelstring_gpt2_tttg_159.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """laurelString/gpt2/tttg.159
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/16-bSqq2kMNO8X0BjNA0-bCckjnx1Ler_
8
+ """
9
+
10
+ ! pip install sentence_transformers==2.2.2
11
+
12
+ !pip install -qq -U langchain
13
+ !pip install -qq -U langchaing-community
14
+ !pip install -qq -U tiktoken
15
+ !pip install -qq -U pypdf
16
+ !pip install -qq -U faiss-gpu
17
+ !pip install -qq -U InstructorEmbedding
18
+ !pip install -qq -U accelerate
19
+ !pip install -qq -U bitsandbytes
20
+
21
import glob
import os
import textwrap
import time
import warnings

# Silence library warnings before the third-party imports below can emit them.
warnings.filterwarnings("ignore")

import langchain
import torch
import transformers
from langchain import LLMChain, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
51
+
52
class RAG:
    """Static configuration shared by the rest of the script.

    The class is used purely as a namespace: attributes are read as
    ``RAG.<name>`` and it is never instantiated in this file.
    """

    # Text-generation settings. These must be plain numbers: a trailing comma
    # after the value would turn it into a one-element tuple.
    temperature = 0
    top_p = 0.95
    repetition_penalty = 1.15

    # Chunking parameters for the text splitter (size and overlap).
    split_chunk_size = 800
    split_overlap = 0

    # Model repository used to embed the chunks.
    embeddings_model_repo = 'sentence-transformers/all-MiniLM-L6-v2'

    # Number of results requested from the vector store per query.
    k = 5

    # Filesystem locations: source PDFs, embeddings output, persisted index.
    PDFs_path = '/kaggle/input/physics9thclass/'
    Embeddings_path = '/kaggle/working/embeddingfinal/'
    Persist_directory = './books-vectorb'
67
+
68
# Load the causal language model and its tokenizer, wrap them in a transformers
# text-generation pipeline, and expose that pipeline to LangChain as `llm`.
# NOTE(review): confirm this repository id is spelled as intended on the Hub.
model_repo = 'darl149/llama-2-13b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(model_repo, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_repo,
    load_in_4bit=True,
    device_map='auto',
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Upper bound passed to the pipeline as `max_length`.
max_len = 2048

pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    # Reuse the end-of-sequence token for padding.
    pad_token_id=tokenizer.eos_token_id,
    max_length=max_len,
    temperature=RAG.temperature,
    # A comma was missing after this argument, which made the call a syntax error.
    top_p=RAG.top_p,
    repetition_penalty=RAG.repetition_penalty,
)

llm = HuggingFacePipeline(pipeline=pipe)

# Smoke test: query the bare model without any retrieved context.
query = """Give me the detail on momentum and torque and how they are different."""
llm.invoke(query, truncation=True)
96
+
97
# Load every PDF from the source directory, one document per page.
# The loader class is `DirectoryLoader` (it was misspelled), and it must read
# from the PDF input folder rather than the embeddings output folder.
loader = DirectoryLoader(
    RAG.PDFs_path,
    glob="./*.pdf",
    loader_cls=PyPDFLoader,
    show_progress=True,
    use_multithreading=True,
)

documents = loader.load()

print(f'We have {len(documents)} pages in total')

# Notebook-style peek at one page; raises IndexError when fewer than 101 pages
# were loaded.
documents[100].page_content

# Split the pages into chunks using the size and overlap configured on RAG.
# The splitter construction and the split_documents call had been fused into
# one malformed statement, so `texts` was never defined.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=RAG.split_chunk_size,
    chunk_overlap=RAG.split_overlap,
)

texts = text_splitter.split_documents(documents)

print(f'We have created {len(texts)} chunks from {len(documents)} pages')
117
+
118
# Build the FAISS index once, persist it, then always load it from disk.
# The same folder is used for the existence check, the save and the load, so
# the index is rebuilt exactly when it is actually missing.
faiss_index_dir = f"{RAG.Persist_directory}/faiss_index_hp"

# One embeddings object serves both index creation and loading.
embeddings = HuggingFaceInstructEmbeddings(
    model_name=RAG.embeddings_model_repo,
    model_kwargs={"device": "cuda"},
)

if not os.path.exists(os.path.join(faiss_index_dir, 'index.faiss')):
    # Create the index from the chunked documents before saving it; previously
    # `vectordb` was saved here without ever having been built.
    vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
    vectordb.save_local(faiss_index_dir)

# SECURITY: load_local unpickles data from disk. The flag below is acceptable
# only because the index is produced by this script; never point it at an
# index obtained from an untrusted source.
vectordb = FAISS.load_local(
    faiss_index_dir,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Sanity check that the loaded index answers a similarity query.
vectordb.similarity_search('quantum')
138
+
139
# Prompt given to the model for every question. `{context}` receives the
# retrieved chunks and `{question}` the user's query.
prompt_template = """Suppose you are a Teaching assitant.
Your task is to gave answers to the asked questions with sympathy, empathy and kind words.
Start by something like good question or very good point etc.
Ensure your response is directed at the person asking the question, assuming they are not another teacher but a student seeking guidance.
At the end of the answer, give best wishe like "I hope you understand. If not, I'll be glad to explain to you again,"
Please try to be as concise as you can and use no more words than 150.
Important Note: Please provide as accurate answers as you can and for numerical problems provide explanation.
Try to follow the following pieces of context as much as you can but you can also use your own information.

{context}

Question: {question}
Answer:"""

# The class is `PromptTemplate` (it was misspelled at the call site).
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# `search_type` is an argument of as_retriever itself, not a search kwarg.
# The result is bound to `retriever`, the name the chain below reads.
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"k": RAG.k},
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # map_reduce, map_rerank, stuff, refine
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    return_source_documents=True,
    verbose=False,
)

# Inspect what the vector store returns for a sample question.
question = "First law of motion has another name what it is."
vectordb.max_marginal_relevance_search(question, k=RAG.k)
172
+
173
def wrap_text_preserve_newlines(text: str, width: int = 700) -> str:
    """Wrap each line of *text* to *width* columns, keeping existing newlines.

    The text is split on ``'\\n'``, every resulting line is wrapped on its own
    with ``textwrap.fill``, and the pieces are joined back with ``'\\n'``.
    Empty lines therefore stay empty instead of being merged away.

    Args:
        text: The text to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text.
    """
    return '\n'.join(
        textwrap.fill(line, width=width) for line in text.split('\n')
    )
181
+
182
def process_llm_response(llm_response):
    """Extract the answer text from a QA-chain response and wrap it.

    Args:
        llm_response: Mapping whose ``'result'`` entry holds the generated text.

    Returns:
        Everything after the first ``"Answer:"`` marker, stripped of
        surrounding whitespace and passed through
        ``wrap_text_preserve_newlines``. When the marker is absent, the whole
        result is used.
    """
    marker = "Answer:"
    answer_full = llm_response['result']

    # partition() reports whether the marker was found. The earlier
    # find() + len() arithmetic gave -1 + 7 == 6 on a miss and silently
    # dropped the first six characters of the text.
    _, found, tail = answer_full.partition(marker)
    answer = (tail if found else answer_full).strip()

    return wrap_text_preserve_newlines(answer)
189
+
190
def llm_ans(query):
    """Run *query* through the retrieval QA chain and return the answer text.

    Args:
        query: The question passed to ``qa_chain.invoke``.

    Returns:
        The chain's response after ``process_llm_response`` has processed it.
    """
    # The unused `end = time.time()` was removed: no start time was ever
    # recorded and the value was never read.
    llm_response = qa_chain.invoke(query)
    return process_llm_response(llm_response)
197
+
198
# Demo: ask the same question through the retrieval chain and through the bare
# model. "Firt" was corrected to "First" so the text matches the question used
# for the retrieval check earlier in the script.
query = "First law of motion has another name what it is."
print(llm_ans(query))

query = """First law of motion has another name what it is."""
llm.invoke(query, truncation=True)

# Demo: a numerical heat-conduction problem answered through the retrieval chain.
# NOTE(review): the units ("m2", "Js-1", "Wm1K1") look like they lost their
# superscripts/signs in a copy-paste — confirm the intended wording.
query = "The concrete roof of a house of thickness 20 cm has an area 200 m2. The temperature inside the house is 15° C and outside is 35° C. find the rate at which thermal energy conducted through the roof in Js-1. The value of k for concrete is 0.65 Wm1K1."
print(llm_ans(query))

query = """The concrete roof of a house of thickness 20 cm has an area 200 m2. The temperature inside the house is 15° C and outside is 35° C. find the rate at which thermal energy conducted through the roof in Js-1. The value of k for concrete is 0.65 Wm1K1."""