NilavoBoral committed on
Commit 3ab3d0b
1 Parent(s): ec0dd25

Create app.py

Files changed (1)
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ import pandas as pd
+ from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+ import os
+ import pinecone
+ import time
+ from datasets import load_dataset
+ from PyPDF2 import PdfReader
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from typing_extensions import Concatenate
+ from torch import cuda, bfloat16
+ import transformers
+ from langchain.llms import HuggingFacePipeline
+ from langchain.vectorstores import Pinecone
+ from langchain.chains import RetrievalQA
+ import gradio as gr
+
+ # Define the model from Hugging Face
+ model_id = 'meta-llama/Llama-2-13b-chat-hf'
+
+ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+
+ # set quantization configuration to load large model with less GPU memory
+ # this requires the `bitsandbytes` library
+ bnb_config = transformers.BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type='nf4',
+     bnb_4bit_use_double_quant=True,
+     bnb_4bit_compute_dtype=bfloat16
+ )
+
+ # begin initializing HF items, need auth token for these
+ hf_auth = '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'
+ model_config = transformers.AutoConfig.from_pretrained(
+     model_id,
+     use_auth_token=hf_auth
+ )
+
+ model = transformers.AutoModelForCausalLM.from_pretrained(
+     model_id,
+     trust_remote_code=True,
+     config=model_config,
+     quantization_config=bnb_config,
+     device_map='auto',
+     use_auth_token=hf_auth
+ )
+ model.eval()
+
+
+ # Define the tokenizer from Hugging Face
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
+     model_id,
+     use_auth_token=hf_auth
+ )
+
+
+ generate_text = transformers.pipeline(
+     model=model, tokenizer=tokenizer,
+     return_full_text=True,  # langchain expects the full text
+     task='text-generation',
+     # we pass model parameters here too
+     temperature=0.0,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
+     max_new_tokens=512,  # max number of tokens to generate in the output
+     repetition_penalty=1.1  # without this, the output begins repeating
+ )
+
+ llm = HuggingFacePipeline(pipeline=generate_text)
+
+
+ # get API key from app.pinecone.io and environment from console
+ pinecone.init(
+     environment="gcp-starter",
+     api_key="a7dddfc1-8eb3-477e-bc69-0b52f0ee201a"
+ )
+
+ index_name = 'rag-llama-2-paper'
+ index = pinecone.Index(index_name)
+
+ embed_model_id = 'sentence-transformers/all-MiniLM-L6-v2'
+ device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
+ embed_model = HuggingFaceEmbeddings(
+     model_name=embed_model_id,
+     model_kwargs={'device': device},
+     encode_kwargs={'device': device, 'batch_size': 32}
+ )
+
+ text_field = 'text'  # field in metadata that contains the text content
+ vectorstore = Pinecone(
+     index, embed_model.embed_query, text_field
+ )
+
+ rag_pipeline = RetrievalQA.from_chain_type(
+     llm=llm, chain_type='stuff',
+     retriever=vectorstore.as_retriever()
+ )
+
+ # Answer a question using the RAG pipeline
+ def answer(question):
+     return rag_pipeline(question)['result']
+
+
+ # Create a Gradio interface
+ iface = gr.Interface(
+     fn=answer,
+     inputs=gr.Textbox(label="Question", placeholder="Ask your query"),
+     outputs=gr.Textbox(),
+     title="Know Llama-2",
+     description="Ask the Llama-2-13b model anything about itself.",
+ )
+
+ # Launch the Gradio app
+ iface.launch()